Skip to content

Commit d7f38a9

Browse files
author
Hossein Asghari
committed
alignment acceptance based on not soft-clipped length
1 parent a787f5f commit d7f38a9

File tree

2 files changed

+47
-28
lines changed

2 files changed

+47
-28
lines changed

include/Util.hpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -1366,15 +1366,17 @@ Compile-time selection between list-like and map-like printing.
13661366

13671367
struct AlignmentResult {
13681368
AlignmentResult(bool isFwIn, int32_t scoreIn, std::string cigarIn, uint32_t openGapLenIn,
1369-
uint16_t softclip_start_in ) :
1370-
isFw(isFwIn), score(scoreIn), cigar(cigarIn), openGapLen(openGapLenIn), softclip_start(softclip_start_in) {}
1369+
uint16_t softclip_start_in, uint16_t softclip_end_in ) :
1370+
isFw(isFwIn), score(scoreIn), cigar(cigarIn), openGapLen(openGapLenIn), softclip_start(softclip_start_in),
1371+
softclip_end(softclip_end_in) {}
13711372

1372-
AlignmentResult() : isFw(true), score(0), cigar(""), openGapLen(0), softclip_start(0) {}
1373+
AlignmentResult() : isFw(true), score(0), cigar(""), openGapLen(0), softclip_start(0), softclip_end(0) {}
13731374
bool isFw;
13741375
int32_t score;
13751376
std::string cigar;
13761377
uint32_t openGapLen;
13771378
uint16_t softclip_start{0};
1379+
uint16_t softclip_end{0};
13781380
};
13791381

13801382
void joinReadsAndFilterSingle( pufferfish::util::CachedVectorMap<size_t, std::vector<pufferfish::util::MemCluster>, std::hash<size_t>>& leftMemClusters,

src/PuffAligner.cpp

+42-25
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
317317
auto& bandwidth = aligner.config().bandwidth;
318318
auto& aligner_config = aligner.config();
319319
libdivide::divider<int32_t> gapExtDivisor(static_cast<int32_t>(mopts.gapExtendPenalty));
320-
const int32_t minAcceptedScore = scoreStatus_.getCutoff(read.length()); //mopts.minScoreFraction * mopts.matchScore * readLen;
320+
const int32_t minAcceptedScore = scoreStatus_.getCutoff(read.length() - maxSoftclipLen); //mopts.minScoreFraction * mopts.matchScore * readLen;
321321
logger_->debug("\t\tNOTE mems.size(): {}, isRev: {}, minAcceptedScore: {}", mems.size(), !isFw, minAcceptedScore);
322322
// compute the maximum gap length that would be allowed given the length of read aligned so far and the current
323323
// alignment score.
@@ -476,6 +476,8 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
476476
fillRefSeqBufferReverse(allRefSeq, refAccPos, refWindowStart,
477477
refWindowLength, refSeqBuffer_);
478478

479+
int32_t numSoftClipped = 0;
480+
479481
if (refSeqBuffer_.length() > 0) {
480482
logger_->debug("\t\t\tCASE 1: some reference bases left to align");
481483
auto readWindow = readView.substr(0, firstMemStart_read).to_string();
@@ -578,49 +580,52 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
578580
// computeCIGAR ? addCigar(cigarGen, ez, true) : firstMemStart_read - num_soft_clipped;
579581

580582
decltype(alignmentScore) part_score = ez.max;
581-
int32_t oldRemainedSoftClipLen = remainedSoftClipLen;
582-
logger_->debug("\t\t\t\tremainedSoftClipLen={}->{};", oldRemainedSoftClipLen, remainedSoftClipLen);
583-
remainedSoftClipLen -= readWindow.length() - (ez.max_q + 1);
584-
if(remainedSoftClipLen < 0 || ez.mqe + aligner_config.end_bonus > ez.max)
583+
numSoftClipped = readWindow.length() - (ez.max_q + 1);
584+
if(remainedSoftClipLen < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max)
585585
{
586586
part_score = ez.mqe;
587587
// openGapLen = readWindow.length();
588588
openGapLen = ez.mqe_t + 1;
589+
numSoftClipped = 0;
589590
}
590591
else
591592
{
592593
part_score = ez.max;
593594
openGapLen = ez.max_t + 1;
594-
cigarGen.add_item(readWindow.length() - (ez.max_q + 1), 'S');
595-
595+
cigarGen.add_item(numSoftClipped, 'S');
596596
}
597+
remainedSoftClipLen -= numSoftClipped;
598+
logger_->debug("\t\t\t\tremainedSoftClipLen={}->{};", remainedSoftClipLen + numSoftClipped, remainedSoftClipLen);
599+
597600
alignmentScore += part_score;
598601
addCigar(cigarGen, ez, true);
599602
// logger_->debug("score : {}", std::max(ez.mqe, ez.mte));
600603
} else { // EHSAN-TODO: check if this is necessary
601604
logger_->debug("\t\t\tCASE 2: no reference bases left (start)");
602605
// overhangingStart = true;
603606
// do any special soft clipping penalty here if we want
604-
remainedSoftClipLen -=
607+
numSoftClipped =
605608
allowSoftclip
606609
? firstMemStart_read
607610
: 0;
608611
alignmentScore +=
609-
allowSoftclip && remainedSoftClipLen >= 0
612+
allowSoftclip && remainedSoftClipLen >= numSoftClipped
610613
? 0
611614
: (-1 * mopts.gapOpenPenalty +
612615
-1 * mopts.gapExtendPenalty * firstMemStart_read);
613616
openGapLen = firstMemStart_read;
614617

615618
if (mopts.computeCIGAR) {
616-
if (allowSoftclip && remainedSoftClipLen >= 0) {
619+
if (allowSoftclip && remainedSoftClipLen >= numSoftClipped) {
617620
cigarGen.add_item(firstMemStart_read, 'S');
618621
openGapLen = 0;
619622
} else {
620623
cigarGen.add_item(firstMemStart_read, 'I');
624+
numSoftClipped = 0;
621625
}
622626
}
623627
}
628+
arOut.softclip_start = numSoftClipped;
624629
logger_->debug("\t\t\tscore_sofar: {}", alignmentScore);
625630
logger_->debug("\t\t\tcigar_sofar: {}", cigarGen.get_cigar());
626631
}
@@ -778,6 +783,8 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
778783
auto readWindow = readView.substr(prevMemEnd_read + 1).to_string();
779784
fillRefSeqBuffer(allRefSeq, refAccPos, refTailStart, refLen, refSeqBuffer_);
780785

786+
int32_t numSoftClipped = 0;
787+
781788
logger_->debug("\t\t\tgapRead : {}, refLen : {}, refBuffer_.size() : {}, refTotalLength : {}", gapRead, refLen, refSeqBuffer_.size(), refTotalLength);
782789
if (refLen > 0) {
783790
logger_->debug("\t\t\tCASE 1: some reference bases left to align");
@@ -850,20 +857,24 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
850857
// }
851858

852859
decltype(alignmentScore) part_score = ez.max;
853-
int32_t oldRemainedSoftClipLen = remainedSoftClipLen;
854-
remainedSoftClipLen -= readWindow.length() - (ez.max_q + 1);
855-
logger_->debug("\t\t\t\tremainedSoftClipLen={}->{};", oldRemainedSoftClipLen, remainedSoftClipLen);
856-
if(remainedSoftClipLen < 0 || ez.mqe + aligner_config.end_bonus > ez.max)
860+
861+
numSoftClipped = readWindow.length() - (ez.max_q + 1);
862+
addCigar(cigarGen, ez, false);
863+
864+
if(remainedSoftClipLen < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max)
857865
{
858866
part_score = ez.mqe;
867+
numSoftClipped = 0;
859868
}
860869
else
861870
{
862871
part_score = ez.max;
863-
cigarGen.add_item(readWindow.length() - (ez.max_q + 1), 'S');
872+
cigarGen.add_item(numSoftClipped, 'S');
864873
}
874+
remainedSoftClipLen -= numSoftClipped;
875+
logger_->debug("\t\t\t\tremainedSoftClipLen={}->{};", remainedSoftClipLen + numSoftClipped, remainedSoftClipLen);
876+
865877
alignmentScore += part_score;
866-
addCigar(cigarGen, ez, false);
867878

868879
// NOTE: pre soft-clip code for adjusting the alignment score.
869880
// int32_t alnCost = allowOverhangSoftclip ? std::max(ez.mqe, ez.mte)
@@ -874,23 +885,25 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
874885
logger_->debug("\t\t\tCASE 2: no reference bases left (end)");
875886
//overhangingEnd = true;
876887
// do any special soft clipping penalty here if we want
877-
remainedSoftClipLen -=
888+
numSoftClipped =
878889
allowSoftclip
879890
? readWindow.length()
880891
: 0;
881892
alignmentScore +=
882-
allowSoftclip && remainedSoftClipLen >= 0
893+
allowSoftclip && remainedSoftClipLen >= numSoftClipped
883894
? 0
884895
: (-1 * mopts.gapOpenPenalty +
885896
-1 * mopts.gapExtendPenalty * readWindow.length());
886897
if (mopts.computeCIGAR) {
887-
if (allowSoftclip && remainedSoftClipLen >= 0) {
898+
if (allowSoftclip && remainedSoftClipLen >= numSoftClipped) {
888899
cigarGen.add_item(readWindow.length(), 'S');
889900
} else {
890901
cigarGen.add_item(readWindow.length(), 'I');
902+
numSoftClipped = 0;
891903
}
892904
}
893905
}
906+
arOut.softclip_end = numSoftClipped;
894907
logger_->debug("\t\t\tscore_sofar: {}", alignmentScore);
895908
logger_->debug("\t\t\tcigar_sofar: {}", cigarGen.get_cigar());
896909
}
@@ -962,14 +975,15 @@ int32_t PuffAligner::calculateAlignments(std::string& read_left, std::string& re
962975
alignRead(read_orphan, rc_orphan, jointHit.orphanClust()->mems, jointHit.orphanClust()->queryChainHash,
963976
jointHit.orphanClust()->perfectChain,
964977
jointHit.orphanClust()->isFw, tid, orphan_aln_cache, hctr, ar_orphan, verbose);
978+
auto orphan_total_softclip_len = ar_orphan.softclip_start + ar_orphan.softclip_end;
965979
jointHit.alignmentScore =
966-
ar_orphan.score > threshold(read_orphan.length()) ? ar_orphan.score : invalidScore;
980+
ar_orphan.score > threshold(read_orphan.length() - orphan_total_softclip_len) ? ar_orphan.score : invalidScore;
967981
jointHit.orphanClust()->cigar = (mopts.computeCIGAR) ? ar_orphan.cigar : "";
968982
jointHit.orphanClust()->openGapLen = ar_orphan.openGapLen;
969983
jointHit.orphanClust()->softClipStart = ar_orphan.softclip_start;
970984
// jointHit.orphanClust()->coverage = jointHit.alignmentScore;
971985
if (jointHit.alignmentScore < 0 and verbose) {
972-
std::cerr << read_orphan.length() << " " << threshold(read_orphan.length()) << " " << ar_left.score << "\n";
986+
std::cerr << read_orphan.length() << " " << threshold(read_orphan.length() - orphan_total_softclip_len) << " " << ar_left.score << "\n";
973987
}
974988
return jointHit.alignmentScore;
975989
} else {
@@ -982,8 +996,10 @@ int32_t PuffAligner::calculateAlignments(std::string& read_left, std::string& re
982996
alignRead(read_right, read_right_rc_, jointHit.rightClust->mems, jointHit.rightClust->queryChainHash, jointHit.rightClust->perfectChain,
983997
jointHit.rightClust->isFw, tid, alnCacheRight, hctr, ar_right, verbose);
984998

985-
jointHit.alignmentScore = ar_left.score > threshold(read_left.length()) ? ar_left.score : invalidScore;
986-
jointHit.mateAlignmentScore = ar_right.score > threshold(read_right.length()) ? ar_right.score : invalidScore;
999+
auto left_total_softclip_len = ar_left.softclip_start + ar_left.softclip_end;
1000+
jointHit.alignmentScore = ar_left.score > threshold(read_left.length() - left_total_softclip_len) ? ar_left.score : invalidScore;
1001+
auto right_total_softclip_len = ar_right.softclip_start + ar_right.softclip_end;
1002+
jointHit.mateAlignmentScore = ar_right.score > threshold(read_right.length() - right_total_softclip_len) ? ar_right.score : invalidScore;
9871003
/*
9881004
jointHit.alignmentScore = (score_left == invalidScore or score_right == invalidScore)?
9891005
invalidScore : score_left + score_right;
@@ -1031,13 +1047,14 @@ int32_t PuffAligner::calculateAlignments(std::string& read, pufferfish::util::Jo
10311047
ar_left.score = invalidScore;
10321048
const auto& oc = jointHit.orphanClust();
10331049
alignRead(read, read_left_rc_, oc->mems, oc->queryChainHash, oc->perfectChain, oc->isFw, tid, alnCacheLeft, hctr, ar_left, verbose);
1050+
auto total_softclip_len = ar_left.softclip_start + ar_left.softclip_end;
10341051
jointHit.alignmentScore =
1035-
ar_left.score > threshold(read.length()) ? ar_left.score : invalidScore;
1052+
ar_left.score > threshold(read.length() - total_softclip_len) ? ar_left.score : invalidScore;
10361053
jointHit.orphanClust()->cigar = (mopts.computeCIGAR) ? ar_left.cigar : "";
10371054
jointHit.orphanClust()->openGapLen = ar_left.openGapLen;
10381055
// jointHit.orphanClust()->coverage = jointHit.alignmentScore;
10391056
if (jointHit.alignmentScore < 0 and verbose) {
1040-
std::cerr << read.length() << " " << threshold(read.length()) << " " << ar_left.score << "\n";
1057+
std::cerr << read.length() << " " << threshold(read.length() - total_softclip_len) << " " << ar_left.score << "\n";
10411058
}
10421059
return jointHit.alignmentScore;
10431060
}

0 commit comments

Comments
 (0)