@@ -281,8 +281,8 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
281
281
bool invalidStart = (signedRefStartPos < 0 );
282
282
bool invalidEnd = (signedRefEndPos > refTotalLength);
283
283
284
- uint32_t maxSoftclipLenGeneral = mopts.maxSoftclipFractionGeneral * readLen;
285
- uint32_t maxSoftclipLenOverhang = mopts.maxSoftclipFractionOverhang * readLen;
284
+ int32_t maxSoftclipLenGeneral = mopts.maxSoftclipFractionGeneral * readLen;
285
+ int32_t maxSoftclipLenOverhang = mopts.maxSoftclipFractionOverhang * readLen;
286
286
287
287
if (mopts.end2end ) {
288
288
maxSoftclipLenGeneral = 0 ;
@@ -484,11 +484,14 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
484
484
logger_->debug (" \t\t\t read: [{}]" , readWindow);
485
485
logger_->debug (" \t\t\t ref: [{}]" , refSeqBuffer_);
486
486
487
+ bool isOverhang = (readStartPosOnRef < 0 );
488
+ auto remainedSoftClipLen = isOverhang ? remainedSoftClipLenOverhang : remainedSoftClipLenGeneral;
489
+
487
490
bandwidth = maxGaps;
488
491
logger_->debug (" \t\t\t ksw2_parameters: bandwidth={}, end_bonus={}, zdrop={}" , bandwidth, aligner_config.end_bonus , aligner_config.dropoff );
489
492
auto cutoff = minAcceptedScore - mopts.matchScore * read .length ();
490
493
aligner (readWindow.data (), readWindow.length (), refSeqBuffer_.data (),
491
- refSeqBuffer_.length (), &ez, cutoff, remainedSoftClipLenGeneral ,
494
+ refSeqBuffer_.length (), &ez, cutoff, remainedSoftClipLen ,
492
495
ksw2pp::EnumToType<ksw2pp::KSW2AlignmentType::EXTENSION>());
493
496
logger_->debug (" \t\t\t ksw2_results:" );
494
497
logger_->debug (" \t\t\t\t max={}, max_q={}, max_t={}" , ez.max , ez.max_q , ez.max_t );
@@ -521,8 +524,10 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
521
524
// https://github.com/COMBINE-lab/salmon/issues/475).
522
525
523
526
decltype (alignmentScore) part_score = ez.max ;
527
+
524
528
numSoftClipped = readWindow.length () - (ez.max_q + 1 );
525
- if (remainedSoftClipLenGeneral < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max )
529
+
530
+ if (remainedSoftClipLen < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max )
526
531
{
527
532
part_score = ez.mqe ;
528
533
openGapLen = ez.mqe_t + 1 ;
@@ -534,9 +539,6 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
534
539
openGapLen = ez.max_t + 1 ;
535
540
cigarGen.add_item (numSoftClipped, ' S' );
536
541
}
537
- remainedSoftClipLenGeneral -= numSoftClipped;
538
- remainedSoftClipLenOverhang -= numSoftClipped;
539
- logger_->debug (" \t\t\t\t remainedSoftClipLen={}->{};" , remainedSoftClipLenGeneral + numSoftClipped, remainedSoftClipLenGeneral);
540
542
541
543
alignmentScore += part_score;
542
544
addCigar (cigarGen, ez, true );
@@ -567,7 +569,16 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
567
569
cigarGen.add_item (firstMemStart_read, cigar_char);
568
570
}
569
571
}
572
+
573
+ // both of the thresholds are updated in case the 5'-end was an overhang and later the 3'-end is going to be
574
+ // generally soft-clipped. this would be a complex scenario and we check the general threshold for both sides
575
+ // in this case to avoid having soft-clipped lengths that are not allowed
576
+ remainedSoftClipLenGeneral -= numSoftClipped;
577
+ remainedSoftClipLenOverhang -= numSoftClipped;
578
+ logger_->debug (" \t\t\t #soft-clipped={}" , numSoftClipped);
579
+
570
580
arOut.softclip_start = numSoftClipped;
581
+
571
582
logger_->debug (" \t\t\t score_sofar: {}" , alignmentScore);
572
583
logger_->debug (" \t\t\t cigar_sofar: {}" , cigarGen.get_cigar ());
573
584
}
@@ -729,11 +740,21 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
729
740
logger_->debug (" \t\t\t CASE 1: some reference bases left to align" );
730
741
logger_->debug (" \t\t\t read: [{}]" , readWindow);
731
742
logger_->debug (" \t\t\t ref: [{}]" , refSeqBuffer_);
743
+
744
+ bool isOverhang = (gapRead > refLen);
745
+
746
+ // if the 5'-end was soft-clipped generally (not as an overhang), we will use the general threshold regardless
747
+ // that's because in complex General-Overhang or Overhang-General scenarios, we do not want to allow soft-clipping
748
+ // above the specified limit so all the soft-clips in these scenarios are considered general
749
+ // the code will automatically consider aligning to the end of 3'-end here as well if the limit is reached
750
+ auto remainedSoftClipLen = (isOverhang && (remainedSoftClipLenGeneral == maxSoftclipLenGeneral))
751
+ ? remainedSoftClipLenOverhang
752
+ : remainedSoftClipLenGeneral;
732
753
733
754
logger_->debug (" \t\t\t ksw2_parameters: bandwidth={}, end_bonus={}, zdrop={}" , bandwidth, aligner_config.end_bonus , aligner_config.dropoff );
734
755
auto cutoff = minAcceptedScore - alignmentScore - mopts.matchScore * readWindow.length ();
735
756
aligner (readWindow.data (), readWindow.length (), refSeqBuffer_.data (),
736
- refLen, &ez, cutoff, remainedSoftClipLenGeneral ,
757
+ refLen, &ez, cutoff, remainedSoftClipLen ,
737
758
ksw2pp::EnumToType<ksw2pp::KSW2AlignmentType::EXTENSION>());
738
759
logger_->debug (" \t\t\t ksw2_results:" );
739
760
logger_->debug (" \t\t\t\t max={}, max_q={}, max_t={}" , ez.max , ez.max_q , ez.max_t );
@@ -750,11 +771,11 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
750
771
if (ez.mqe != KSW_NEG_INF) ez.stopped = 0 ;
751
772
752
773
decltype (alignmentScore) part_score = ez.max ;
753
-
754
- numSoftClipped = readWindow.length () - (ez.max_q + 1 );
755
774
addCigar (cigarGen, ez, false );
756
775
757
- if (remainedSoftClipLenGeneral < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max )
776
+ numSoftClipped = readWindow.length () - (ez.max_q + 1 );
777
+
778
+ if (remainedSoftClipLen < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max )
758
779
{
759
780
part_score = ez.mqe ;
760
781
numSoftClipped = 0 ;
@@ -764,10 +785,7 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
764
785
part_score = ez.max ;
765
786
cigarGen.add_item (numSoftClipped, ' S' );
766
787
}
767
- remainedSoftClipLenGeneral -= numSoftClipped;
768
- remainedSoftClipLenOverhang -= numSoftClipped;
769
- logger_->debug (" \t\t\t\t remainedSoftClipLen={}->{};" , remainedSoftClipLenGeneral + numSoftClipped, remainedSoftClipLenGeneral);
770
-
788
+
771
789
alignmentScore += part_score;
772
790
773
791
// NOTE: pre soft-clip code for adjusting the alignment score.
@@ -783,14 +801,20 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
783
801
mopts.allowSoftclip
784
802
? readWindow.length ()
785
803
: 0 ;
804
+
805
+ // again, the general threshold will be used if the 5'-end was soft-clipped in the general case and not as an overhang
806
+ auto remainedSoftClipLen = (remainedSoftClipLenGeneral == maxSoftclipLenGeneral)
807
+ ? remainedSoftClipLenOverhang
808
+ : remainedSoftClipLenGeneral;
809
+
786
810
alignmentScore +=
787
- mopts.allowSoftclip && remainedSoftClipLenOverhang >= numSoftClipped
811
+ mopts.allowSoftclip && remainedSoftClipLen >= numSoftClipped
788
812
? 0
789
813
: (-1 * mopts.gapOpenPenalty +
790
814
-1 * mopts.gapExtendPenalty * readWindow.length ());
791
815
792
816
char cigar_char;
793
- if (mopts.allowSoftclip && remainedSoftClipLenOverhang >= numSoftClipped) {
817
+ if (mopts.allowSoftclip && remainedSoftClipLen >= numSoftClipped) {
794
818
cigar_char = ' S' ;
795
819
} else {
796
820
cigar_char = ' I' ;
@@ -801,6 +825,10 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
801
825
cigarGen.add_item (readWindow.length (), cigar_char);
802
826
}
803
827
}
828
+ remainedSoftClipLenGeneral -= numSoftClipped;
829
+ remainedSoftClipLenOverhang -= numSoftClipped;
830
+ logger_->debug (" \t\t\t #soft-clipped={}" , numSoftClipped);
831
+
804
832
arOut.softclip_end = numSoftClipped;
805
833
logger_->debug (" \t\t\t score_sofar: {}" , alignmentScore);
806
834
logger_->debug (" \t\t\t cigar_sofar: {}" , cigarGen.get_cigar ());
0 commit comments