From c37268ca2cc2464ec6f739b8c42eb5ff630e2b58 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 28 Feb 2023 16:21:53 +0000 Subject: [PATCH 01/70] Switch to CURLINFO_CONTENT_LENGTH_DOWNLOAD_T for newer libcurl The older CURLINFO_CONTENT_LENGTH_DOWNLOAD has been deprecated, and is now causing warnings to be printed on distributions that have moved on to the latest libcurl versions. CURLINFO_CONTENT_LENGTH_DOWNLOAD_T was officially added in curl 7.55.0. --- hfile_libcurl.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 2de7ccbd9..1e4a4486f 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -1330,13 +1330,20 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) } if (mode == 'r') { +#if LIBCURL_VERSION_NUM >= 0x073700 // 7.55.0 + curl_off_t offset; + + if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, + &offset) == CURLE_OK && offset > 0) + fp->file_size = (off_t) offset; +#else double dval; if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &dval) == CURLE_OK && dval >= 0.0) fp->file_size = (off_t) (dval + 0.1); +#endif } - fp->base.backend = &libcurl_backend; return &fp->base; From 05ef3b73ee1fa2efc2e61a71b3c7f91e75af08f8 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 1 Mar 2023 14:56:04 +0000 Subject: [PATCH 02/70] Check if VCF POS column could be fully parsed and throw an error if not. The position too large error report is also fixed so that it reports the incorrect value instead of an empty string. 
Resolves #1570 Co-authored-by: Rob Davies --- vcf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vcf.c b/vcf.c index 59d433c19..d9b0826b1 100644 --- a/vcf.c +++ b/vcf.c @@ -3232,9 +3232,13 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) v->rid = kh_val(d, k).id; } else if (i == 1) { // POS overflow = 0; + char *tmp = p; v->pos = hts_str2uint(p, &p, 63, &overflow); if (overflow) { - hts_log_error("Position value '%s' is too large", p); + hts_log_error("Position value '%s' is too large", tmp); + goto err; + } else if ( *p ) { + hts_log_error("Could not parse the position '%s'", tmp); goto err; } else { v->pos -= 1; From fda1e0375d26723983a264e4f12bcecdec5aa4a6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 28 Feb 2023 13:53:29 +0000 Subject: [PATCH 03/70] Fix cram_index_query_last function The cram_index_query_last used by sam_itr_regarray had problems when dealing with slices whose region is contained within another region. Fundamentally this is due to the cram_index arrays being contiguous in memory until a containment is found, at which point the pointers will be to an entirely different array. This breaks naive pointer comparisons. The cram_index struct already had a "next" field holding the file offset of the next container. This has been replaced by e_next pointing to the next cram_entry struct in file ordering, and e_next->offset is equivalent to the old "next". This allows consumption of the index either as the original nested containment list or as a traditional linked list. Also fixed cram_index_query with from != NULL, which similarly was incorrect before. We never used this function and it's not public, but we now use it within the rewrite of cram_index_query_last. 
Fixes #1569 --- cram/cram_index.c | 115 +++++++++++++++++++++++++++----------------- cram/cram_structs.h | 5 +- hts.c | 6 +-- 3 files changed, 76 insertions(+), 50 deletions(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 45d420df2..601852d87 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -72,7 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dump_index_(cram_index *e, int level) { int i, n; n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); - printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset); + printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next); for (i = 0; i < e->nslice; i++) { dump_index_(&e->e[i], level+1); } @@ -86,6 +86,37 @@ static void dump_index(cram_fd *fd) { } #endif +// Thread a linked list through the nested containment list. +// This makes navigating it and finding the "next" index entry +// trivial. +static cram_index *link_index_(cram_index *e, cram_index *e_last) { + int i; + if (e_last) + e_last->e_next = e; + + // We don't want to link in the top-level cram_index with + // offset=0 and start/end = INT_MIN/INT_MAX. + if (e->offset) + e_last = e; + + for (i = 0; i < e->nslice; i++) + e_last = link_index_(&e->e[i], e_last); + + return e_last; +} + +static void link_index(cram_fd *fd) { + int i; + cram_index *e_last = NULL; + + for (i = 0; i < fd->index_sz; i++) { + e_last = link_index_(&fd->index[i], e_last); + } + + if (e_last) + e_last->e_next = NULL; +} + static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { int sign = 1; int32_t val = 0; @@ -313,7 +344,10 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { free(kstr.s); free(tfn_idx); - // dump_index(fd); + // Convert NCList to linear linked list + link_index(fd); + + //dump_index(fd); return 0; @@ -356,7 +390,7 @@ void cram_index_free(cram_fd *fd) { * entries, but we require at least one per reference.) 
* * If the index finds multiple slices overlapping this position we - * return the first one only. Subsequent calls should specifying + * return the first one only. Subsequent calls should specify * "from" as the last slice we checked to find the next one. Otherwise * set "from" to be NULL to find the first one. * @@ -371,6 +405,17 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, int i, j, k; cram_index *e; + if (from) { + // Continue from a previous search. + // We switch to just scanning the linked list, as the nested + // lists are typically short. + e = from->e_next; + if (e && e->refid == refid && e->start <= pos) + return e; + else + return NULL; + } + switch(refid) { case HTS_IDX_NONE: case HTS_IDX_REST: @@ -400,8 +445,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, return NULL; } - if (!from) - from = &fd->index[refid+1]; + from = &fd->index[refid+1]; // Ref with nothing aligned against it. if (!from->e) @@ -469,52 +513,33 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { return &from->e[slice]; } +/* + * Find the last container overlapping pos 'end', and the file offset of + * its end (equivalent to the start offset of the container following it). + */ cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) { - cram_index *first = cram_index_query(fd, refid, end, NULL); - cram_index *last = cram_index_last(fd, refid, NULL); - if (!first || !last) - return NULL; - - while (first < last && (first+1)->start <= end) - first++; + cram_index *e = NULL, *prev_e; + do { + prev_e = e; + e = cram_index_query(fd, refid, end, prev_e); + } while (e); - while (first->e) { - int count = 0; - int nslices = first->nslice; - first = first->e; - while (++count < nslices && (first+1)->start <= end) - first++; - } + if (!prev_e) + return NULL; + e = prev_e; - // Compute the start location of next container. 
- // - // This is useful for stitching containers together in the multi-region - // iterator. Sadly we can't compute this from the single index line. + // Note: offset of e and e->e_next may be the same if we're using a + // multi-ref container where a single container generates multiple + // index entries. // - // Note we can have neighbouring index entries at the same location - // for when we have multi-reference mode and/or multiple slices per - // container. - cram_index *next = first; + // We need to keep iterating until offset differs in order to find + // the genuine file offset for the end of container. do { - if (next >= last) { - // Next non-empty reference - while (++refid+1 < fd->index_sz) - if (fd->index[refid+1].nslice) - break; - if (refid+1 >= fd->index_sz) { - next = NULL; - } else { - next = fd->index[refid+1].e; - last = fd->index[refid+1].e + fd->index[refid+1].nslice; - } - } else { - next++; - } - } while (next && next->offset == first->offset); - - first->next = next ? next->offset : 0; + prev_e = e; + e = e->e_next; + } while (e && e->offset == prev_e->offset); - return first; + return prev_e; } /* diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 1ee4b9e85..0a66d51b9 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -725,7 +725,10 @@ typedef struct cram_index { int slice; // 1.0 landmark index, 1.1 landmark value int len; // 1.1 - size of slice in bytes int64_t offset; // 1.0 1.1 - int64_t next; // derived: offset of next container. + + // Linked list of cram_index entries. Used to convert recursive + // NCList back to a linear list. + struct cram_index *e_next; } cram_index; typedef struct { diff --git a/hts.c b/hts.c index cead9d537..c122ce23a 100644 --- a/hts.c +++ b/hts.c @@ -3408,14 +3408,12 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) } if (e) { - off[n_off++].v = e->next - ? e->next + off[n_off++].v = e->e_next + ? 
e->e_next->offset : e->offset + e->slice + e->len; } else { hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end); } - } else { - hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end); } } } else { From f24ad2cd62b7a083056266e4bff580891ea61fcd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 6 Mar 2023 10:08:38 +0000 Subject: [PATCH 04/70] Avoid deeply nested containment list on old CRAM indices. Unmapped data on ancient .crai files contained an end offset that wasn't 0. This escaped the deep recursion avoidance for unmapped data when building the NC list. --- cram/cram_index.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 601852d87..846dc709d 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -306,7 +306,8 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { idx_stack[(idx_stack_ptr = 0)] = idx; } - while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) { + while (!(e.start >= idx->start && e.end <= idx->end) || + (idx->start == 0 && idx->refid == -1)) { idx = idx_stack[--idx_stack_ptr]; } From be6633ae347043097902898cf529fff7144949af Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 1 Mar 2023 16:34:03 +0000 Subject: [PATCH 05/70] Fix crypt4gh redirection hopen() uses a ':' in `mode` to indicate that there are extra parameters. hts_crypt4gh_redirect() needs to add this so that the "parent" parameter is picked up by the crypt4gh plug-in. Failing to do this caused it to re-open the file instead of reusing the existing file handle - which worked for regular files but not for things like pipes or htsget. 
--- hts.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hts.c b/hts.c index c122ce23a..86b5bb877 100644 --- a/hts.c +++ b/hts.c @@ -1349,6 +1349,7 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode, hFILE *hfile1 = *hfile_ptr; hFILE *hfile2 = NULL; char fn_buf[512], *fn2 = fn_buf; + char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen() const char *prefix = "crypt4gh:"; size_t fn2_len = strlen(prefix) + strlen(fn) + 1; int ret = -1; @@ -1362,7 +1363,8 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode, // Reopen fn using the crypt4gh plug-in (if available) snprintf(fn2, fn2_len, "%s%s", prefix, fn); - hfile2 = hopen(fn2, mode, "parent", hfile1, NULL); + snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":"); + hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL); if (hfile2) { // Replace original hfile with the new one. The original is now // enclosed within hfile2 From dcd20d9fabfac4398af6c49944dc1c99e0abb6e0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 2 Mar 2023 10:14:30 +0000 Subject: [PATCH 06/70] Permit fastq output to create empty FASTQ records for seq "*". This is rather questionable, but htslib can now output empty SAM records as empty fastq records. Eg: name 4 * 0 0 * * 0 0 * * becomes @name + Htslib is happy to read this back in and produces the original SAM once more. Bwa mem and minimap2 can both read these fastq entries too, although the SAM output is bugged as they output an empty field instead of "*" for SEQ. Potential reasons for accepting this: - When dealing with paired data, we don't want to output a differing number of records from samtools fastq if read1 has seq and read2 has "*". Note as this is filtered at the htslib layer, it's not considered as a singleton so fastq -s won't rescue this. - At least some aligners apparently support this format. Although inevitably they just produce unmapped data.
- Arguably this is a case of silly input => silly output! - Users can manually elect to "samtools view -e 'length(seq) > 0'" before using samtools fastq, which then fixes the not-a-singleton problem. - It converts samtools fastq output back to how it was in pre 1.13 era, where we rewrote it to use htslib's interfaces. Potential reason to reject: - It may yield output which trips up some poorly written tools. Fixes samtools/samtools#1799 --- sam.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sam.c b/sam.c index e2e539b2d..3430a5895 100644 --- a/sam.c +++ b/sam.c @@ -4257,8 +4257,6 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) str->l = 0; - if (len == 0) return 0; - // Name if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) return -1; From 839a2e94647e5a4f7d59967235e42f157ff038d2 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 7 Mar 2023 15:35:15 +0000 Subject: [PATCH 07/70] Fix a couple small VCF auto-indexing bugs. 1. sam_idx_save wasn't validating the file is BGZF. It's invalid usage to try calling this function on uncompressed data, but we should double check. Note this is triggered by a bcftools bug where -o foo.vcf.gz##idx##foo.vcf.gz.csi writes VCF rather than VCF.gz as the "filename" doesn't end in .gz. 2. Add the hts_idx_amend_last calls to vcf_write as we did previously for SAM/BAM. This isn't technically a requirement, as all it's doing is changing virtual offsets to an alternate form that gives the same file offset (see comments above hts_idx_amend_last), but doing so means the auto-build indices match those produced by a standalone index command. This fix isn't complete as it hasn't been worked on for BCF yet. However it comes under the "nicety" category and isn't really fixing a bug so we can try to figure out how to tidy up BCF later (plus VCF.gz is basically the universal format). 
--- sam.c | 2 +- vcf.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sam.c b/sam.c index 3430a5895..a93664d87 100644 --- a/sam.c +++ b/sam.c @@ -1067,7 +1067,7 @@ int sam_idx_save(htsFile *fp) { errno = -ret; return -1; } - if (bgzf_flush(fp->fp.bgzf) < 0) + if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0) return -1; hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); diff --git a/vcf.c b/vcf.c index d9b0826b1..392d9c932 100644 --- a/vcf.c +++ b/vcf.c @@ -3574,6 +3574,8 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) if ( fp->format.compression!=no_compression ) { if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) return -1; + if (fp->idx) + hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); } else { ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); From 19cd41cb9e39b0bbfec759a530849136a8801adc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 8 Mar 2023 09:30:17 +0000 Subject: [PATCH 08/70] Backport attractivechaos/klib#78 to htslib. Original PR by Pall Melsted, with only manual merging and one trivial bug fix by myself. Co-authored-by: Pall Melsted --- htslib/kseq.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/htslib/kseq.h b/htslib/kseq.h index 3e2404568..5d573d3d9 100644 --- a/htslib/kseq.h +++ b/htslib/kseq.h @@ -24,8 +24,6 @@ SOFTWARE. 
*/ -/* Last Modified: 05MAR2012 */ - #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -57,6 +55,7 @@ unsigned char *buf; \ } kstream_t; +#define ks_err(ks) ((ks)->end == -1) #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) @@ -78,11 +77,13 @@ #define __KS_INLINED(__read) \ static inline klib_unused int ks_getc(kstream_t *ks) \ { \ + if (ks_err(ks)) return -3; \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end == 0) { ks->is_eof = 1; return -1; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } \ ks->seek_pos++; \ return (int)ks->buf[ks->begin++]; \ @@ -99,11 +100,13 @@ uint64_t seek_pos = str->l; \ for (;;) { \ int i; \ + if (ks_err(ks)) return -3; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } else break; \ } \ if (delimiter == KS_SEP_LINE) { \ @@ -180,25 +183,27 @@ >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string + -3 error reading stream + -4 overflow error */ #define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ - int c; \ + int c,r; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error */ \ seq->last_char = c; \ } /* else: the first header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal 
exit: EOF or error */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ @@ -207,7 +212,7 @@ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ - if (seq->seq.l + 1 >= seq->seq.m) return -3; /* error: adjusting m overflowed */ \ + if (seq->seq.l + 1 >= seq->seq.m) return -4; /* error: adjusting m overflowed */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ @@ -216,9 +221,10 @@ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \ + if (c == -3) return -3; /* stream error */ \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ From 46bcc366152ce20b899572433b55a085f9a54ad2 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 15 
Mar 2023 14:42:26 +0000 Subject: [PATCH 09/70] Slightly speed up various cram decoding functions (#1580) None of this is huge, but it all adds up. - bam_set1 has been refactored so -O3 is more likely to do unrolling and vectorisation. // Old time inst cyc // gcc -O2 12.36 78936832183 36853852204 // gcc -O3 12.37 78713347525 36867027825 // clang13 -O2 12.43 77451926728 37012866717 // clang13 -O3 12.32 77627221907 36691623424 // gcc12 -O2 12.43 78895089091 37081260172 // gcc12 -O3 12.36 78505904437 36829216967 // New // gcc -O2 12.47 78832021505 37200597109 + // gcc -O3 12.14 76499369401 36390334338 -- // clang13 -O2 12.38 76678460761 36920111561 ~ // clang13 -O3 12.26 76678023071 36548488492 ~ // gcc12 -O2 12.38 78581694397 36880034181 - // gcc12 -O3 12.15 76356625541 36293921439 -- - Improve the MD/NM generation in CRAM decoding. With decode_md=1 (default) by decode changed from 12.91s to 12.57s With decode_md=0 it's 11.92, so that's 1/3rd of the overhead removed. - Changed the block_resize to resize in slightly smaller chunks and to use integer maths. - Reduce excessive pointer redirection in cram_decode_seq. Unsure if this speeds things up much (sometimes it seems to), but it provides tidier code too. Comparisons with Dev(/D) and this commit (/4) on Revio (re/) and NovaSeq (nv/) with a variety of compilers and optimisations. 
Figures are cycle counts from perf stat Xeon E5-2660 Xeon Gold 6142 re/D gcc12-O2 85699982958 74752510144 re/4 gcc12-O2 82265084038 71947558666 -3.7/3.7 re/D gcc12-O3 85837077212 74392223354 re/4 gcc12-O3 82024293685 71861154116 -4.4/3.4 re/D clang12-3 85608876213 73934329619 re/4 clang12-3 84390364926 73961392095 -1.4/0 re/D clang12-2 86861787827 74255338533 re/4 clang12-2 83186843797 72421845542 -4.2/2.5; better than O3 nv/D gcc12-O2 36694089398 31444641828 nv/4 gcc12-O2 34949122875 30061074125 -4.8/-4.4 nv/D gcc12-O3 36528573980 30792932748 nv/4 gcc12-O3 35069572111 30066058127 -4.0/2.4 nv/D clang12-3 37906764004 32459168883 nv/4 clang12-3 36344679534 30786987972 -4.1/-5.2 nv/D clang12-2 38443827308 32304948037 nv/4 clang12-2 36361384580 31022553379 -5.4/-4.0 Benchmarks on 10 million NovaSeq records, showing billions of cycles as more robust than CPU time. EPYC 7543 before after gcc(7) -O2 28.6 28.3 -1.0 gcc12 -O2 28.2 28.3 +0.4 clang7 -O2 30.2 28.2 -6.6 clang13 -O2 29.9 28.2 -5.7 gcc(7) -O3 28.7 28.2 -1.7 gcc12 -O3 28.0 27.2 -2.9 clang7 -O3 30.1 28.3 -6.0 clang13 -O3 29.7 28.3 -4.7 Xeon Gold 6142 before after gcc(7) -O2 32.8 30.5 -7.0 gcc12 -O2 31.8 30.1 -5.3 clang7 -O2 33.1 29.9 -9.7 clang13 -O2 34.1 30.8 -9.7 gcc(7) -O3 32.7 30.2 -7.6 gcc12 -O3 31.6 29.1 -7.9 clang7 -O3 34.3 30.0 -12.5 clang13 -O3 33.3 30.9 -7.2 --- cram/cram_decode.c | 245 ++++++++++++++++++++++----------------------- cram/cram_io.h | 2 +- sam.c | 14 ++- 3 files changed, 134 insertions(+), 127 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 73f567106..39869cbdd 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1118,6 +1118,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, uint32_t ds = s->data_series; sam_hrecs_t *bfd = sh->hrecs; + cram_codec **codecs = c->comp_hdr->codecs; + if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { memset(qual, 255, cr->len); } @@ -1132,9 +1134,9 @@ static int cram_decode_seq(cram_fd 
*fd, cram_container *c, cram_slice *s, } if (ds & CRAM_FN) { - if (!c->comp_hdr->codecs[DS_FN]) return -1; - r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN], - blk, (char *)&fn, &out_sz); + if (!codecs[DS_FN]) return -1; + r |= codecs[DS_FN]->decode(s,codecs[DS_FN], + blk, (char *)&fn, &out_sz); if (r) return r; } else { fn = 0; @@ -1146,6 +1148,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (!(ds & (CRAM_FC | CRAM_FP))) goto skip_cigar; + if (fn) { + if ((ds & CRAM_FC) && !codecs[DS_FC]) + return -1; + if ((ds & CRAM_FP) && !codecs[DS_FP]) + return -1; + } + for (f = 0; f < fn; f++) { int32_t pos = 0; char op; @@ -1158,22 +1167,20 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_FC) { - if (!c->comp_hdr->codecs[DS_FC]) return -1; - r |= c->comp_hdr->codecs[DS_FC]->decode(s, - c->comp_hdr->codecs[DS_FC], - blk, - &op, &out_sz); + r |= codecs[DS_FC]->decode(s, + codecs[DS_FC], + blk, + &op, &out_sz); if (r) return r; } if (!(ds & CRAM_FP)) continue; - if (!c->comp_hdr->codecs[DS_FP]) return -1; - r |= c->comp_hdr->codecs[DS_FP]->decode(s, - c->comp_hdr->codecs[DS_FP], - blk, - (char *)&pos, &out_sz); + r |= codecs[DS_FP]->decode(s, + codecs[DS_FP], + blk, + (char *)&pos, &out_sz); if (r) return r; pos += prev_pos; @@ -1214,26 +1221,33 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, // 'N' in both ref and seq is also mismatch for NM/MD if (ref_pos + pos-seq_pos > s->ref_end) goto beyond_slice; + + const char *refp = s->ref + ref_pos - s->ref_start + 1; + const int frag_len = pos - seq_pos; + int do_cpy = 1; if (decode_md || decode_nm) { - int i; - for (i = 0; i < pos - seq_pos; i++) { - // FIXME: not N, but nt16 lookup == 15? 
- char base = s->ref[ref_pos - s->ref_start + 1 + i]; - if (base == 'N') { - if (add_md_char(s, decode_md, - s->ref[ref_pos - s->ref_start + 1 + i], - &md_dist) < 0) - return -1; - nm++; - } else { - md_dist++; + char *N = memchr(refp, 'N', frag_len); + if (N) { + int i; + for (i = 0; i < frag_len; i++) { + char base = refp[i]; + if (base == 'N') { + if (add_md_char(s, decode_md, + 'N', &md_dist) < 0) + return -1; + nm++; + } else { + md_dist++; + } + seq[seq_pos-1+i] = base; } - seq[seq_pos-1+i] = base; + do_cpy = 0; + } else { + md_dist += frag_len; } - } else { - memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], - pos - seq_pos); } + if (do_cpy) + memcpy(&seq[seq_pos-1], refp, frag_len); } } #ifdef USE_X @@ -1271,12 +1285,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, switch (CRAM_MAJOR_VERS(fd->version)) { case 1: if (ds & CRAM_IN) { - r |= c->comp_hdr->codecs[DS_IN] - ? c->comp_hdr->codecs[DS_IN] - ->decode(s, c->comp_hdr->codecs[DS_IN], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) + r |= codecs[DS_IN] + ? codecs[DS_IN]->decode(s, codecs[DS_IN], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2) : (seq[pos-1] = 'N', out_sz2 = 1, 0); have_sc = 1; } @@ -1284,22 +1297,20 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 2: default: if (ds & CRAM_SC) { - r |= c->comp_hdr->codecs[DS_SC] - ? c->comp_hdr->codecs[DS_SC] - ->decode(s, c->comp_hdr->codecs[DS_SC], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) + r |= codecs[DS_SC] + ? codecs[DS_SC]->decode(s, codecs[DS_SC], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2) : (seq[pos-1] = 'N', out_sz2 = 1, 0); have_sc = 1; } break; //default: - // r |= c->comp_hdr->codecs[DS_BB] - // ? c->comp_hdr->codecs[DS_BB] - // ->decode(s, c->comp_hdr->codecs[DS_BB], - // blk, &seq[pos-1], &out_sz2) + // r |= codecs[DS_BB] + // ? 
codecs[DS_BB]->decode(s, codecs[DS_BB], + // blk, &seq[pos-1], &out_sz2) // : (seq[pos-1] = 'N', out_sz2 = 1, 0); } if (have_sc) { @@ -1319,10 +1330,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BS) { - if (!c->comp_hdr->codecs[DS_BS]) return -1; - r |= c->comp_hdr->codecs[DS_BS] - ->decode(s, c->comp_hdr->codecs[DS_BS], blk, - (char *)&base, &out_sz); + if (!codecs[DS_BS]) return -1; + r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, + (char *)&base, &out_sz); if (pos-1 < cr->len) seq[pos-1] = 'N'; // FIXME look up BS=base value } @@ -1334,10 +1344,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BS) { - if (!c->comp_hdr->codecs[DS_BS]) return -1; - r |= c->comp_hdr->codecs[DS_BS] - ->decode(s, c->comp_hdr->codecs[DS_BS], blk, - (char *)&base, &out_sz); + if (!codecs[DS_BS]) return -1; + r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, + (char *)&base, &out_sz); if (r) return -1; if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { if (pos-1 < cr->len) @@ -1376,10 +1385,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_DL) { - if (!c->comp_hdr->codecs[DS_DL]) return -1; - r |= c->comp_hdr->codecs[DS_DL] - ->decode(s, c->comp_hdr->codecs[DS_DL], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_DL]) return -1; + r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk, + (char *)&i32, &out_sz); if (r) return r; if (decode_md || decode_nm) { if (ref_pos + i32 > s->ref_end) @@ -1431,11 +1439,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_IN) { - if (!c->comp_hdr->codecs[DS_IN]) return -1; - r |= c->comp_hdr->codecs[DS_IN] - ->decode(s, c->comp_hdr->codecs[DS_IN], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2); + if (!codecs[DS_IN]) return -1; + r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk, + cr->len ? 
&seq[pos-1] : NULL, + &out_sz2); if (r) return r; cig_op = BAM_CINS; cig_len += out_sz2; @@ -1452,11 +1459,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BA) { - if (!c->comp_hdr->codecs[DS_BA]) return -1; - r |= c->comp_hdr->codecs[DS_BA] - ->decode(s, c->comp_hdr->codecs[DS_BA], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz); + if (!codecs[DS_BA]) return -1; + r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz); if (r) return r; } cig_op = BAM_CINS; @@ -1475,11 +1481,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_BB) { - if (!c->comp_hdr->codecs[DS_BB]) return -1; - r |= c->comp_hdr->codecs[DS_BB] - ->decode(s, c->comp_hdr->codecs[DS_BB], blk, - cr->len ? &seq[pos-1] : NULL, - &len); + if (!codecs[DS_BB]) return -1; + r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk, + cr->len ? &seq[pos-1] : NULL, + &len); if (r) return r; if (decode_md || decode_nm) { @@ -1526,13 +1531,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_QQ) { - if (!c->comp_hdr->codecs[DS_QQ]) return -1; + if (!codecs[DS_QQ]) return -1; if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? - r |= c->comp_hdr->codecs[DS_QQ] - ->decode(s, c->comp_hdr->codecs[DS_QQ], blk, - (char *)&qual[pos-1], &len); + r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk, + (char *)&qual[pos-1], &len); if (r) return r; } @@ -1555,11 +1559,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } #endif if (ds & CRAM_BA) { - if (!c->comp_hdr->codecs[DS_BA]) return -1; - r |= c->comp_hdr->codecs[DS_BA] - ->decode(s, c->comp_hdr->codecs[DS_BA], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz); + if (!codecs[DS_BA]) return -1; + r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, + cr->len ? 
&seq[pos-1] : NULL, + &out_sz); if (decode_md || decode_nm) { if (md_dist >= 0 && decode_md) @@ -1579,13 +1582,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } } if (ds & CRAM_QS) { - if (!c->comp_hdr->codecs[DS_QS]) return -1; + if (!codecs[DS_QS]) return -1; if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ASCII ?. Same as htsjdk - r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); } #ifdef USE_X cig_op = BAM_CBASE_MISMATCH; @@ -1601,13 +1603,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'Q': { // Quality score; QS if (ds & CRAM_QS) { - if (!c->comp_hdr->codecs[DS_QS]) return -1; + if (!codecs[DS_QS]) return -1; if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? 
- r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); //printf(" %d: QS = %d (ret %d)\n", f, qc, r); } break; @@ -1619,10 +1620,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_HC) { - if (!c->comp_hdr->codecs[DS_HC]) return -1; - r |= c->comp_hdr->codecs[DS_HC] - ->decode(s, c->comp_hdr->codecs[DS_HC], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_HC]) return -1; + r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CHARD_CLIP; cig_len += i32; @@ -1636,10 +1636,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_PD) { - if (!c->comp_hdr->codecs[DS_PD]) return -1; - r |= c->comp_hdr->codecs[DS_PD] - ->decode(s, c->comp_hdr->codecs[DS_PD], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_PD]) return -1; + r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CPAD; cig_len += i32; @@ -1653,10 +1652,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_RS) { - if (!c->comp_hdr->codecs[DS_RS]) return -1; - r |= c->comp_hdr->codecs[DS_RS] - ->decode(s, c->comp_hdr->codecs[DS_RS], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_RS]) return -1; + r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CREF_SKIP; cig_len += i32; @@ -1703,31 +1701,32 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (cr->len - seq_pos + 1 > 0) { if (ref_pos + cr->len-seq_pos +1 > s->ref_end) goto beyond_slice; + int remainder = cr->len - (seq_pos-1); + int j = ref_pos - s->ref_start + 1; if (decode_md || decode_nm) { - int i, j = ref_pos - s->ref_start + 1; - // FIXME: Update this to match spec 
once we're also - // ready to update samtools calmd. (N vs any ambig) - if (memchr(&s->ref[j], 'N', cr->len - (seq_pos-1))) { - for (i = seq_pos-1, j -= i; i < cr->len; i++) { - char base = s->ref[j+i]; + int i; + char *N = memchr(&s->ref[j], 'N', remainder); + if (!N) { + // short cut the common case + md_dist += cr->len - (seq_pos-1); + } else { + char *refp = &s->ref[j-(seq_pos-1)]; + md_dist += N-&s->ref[j]; + int i_start = seq_pos-1 + (N - &s->ref[j]); + for (i = i_start; i < cr->len; i++) { + char base = refp[i]; if (base == 'N') { - if (add_md_char(s, decode_md, 'N', &md_dist) < 0) + if (add_md_char(s, decode_md, 'N', + &md_dist) < 0) return -1; nm++; } else { md_dist++; } - seq[i] = base; } - } else { - // faster than above code - memcpy(&seq[seq_pos-1], &s->ref[j], cr->len - (seq_pos-1)); - md_dist += cr->len - (seq_pos-1); } - } else { - memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], - cr->len - (seq_pos-1)); } + memcpy(&seq[seq_pos-1], &s->ref[j], remainder); } ref_pos += cr->len - seq_pos + 1; } @@ -1782,10 +1781,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, //printf("2: %.*s %d .. 
%d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); if (ds & CRAM_MQ) { - if (!c->comp_hdr->codecs[DS_MQ]) return -1; - r |= c->comp_hdr->codecs[DS_MQ] - ->decode(s, c->comp_hdr->codecs[DS_MQ], blk, - (char *)&cr->mqual, &out_sz); + if (!codecs[DS_MQ]) return -1; + r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk, + (char *)&cr->mqual, &out_sz); } else { cr->mqual = 40; } @@ -1793,10 +1791,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { int32_t out_sz2 = cr->len; - if (!c->comp_hdr->codecs[DS_QS]) return -1; - r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - qual, &out_sz2); + if (!codecs[DS_QS]) return -1; + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + qual, &out_sz2); } s->cigar = cigar; diff --git a/cram/cram_io.h b/cram/cram_io.h index 8cc59be51..53ae30f59 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -229,7 +229,7 @@ static inline int block_resize(cram_block *b, size_t len) { size_t alloc = b->alloc; while (alloc <= len) - alloc = alloc ? alloc*1.5 : 1024; + alloc = alloc ? 
alloc + (alloc>>2) : 1024; return block_resize_exact(b, alloc); } diff --git a/sam.c b/sam.c index a93664d87..9f415dbc6 100644 --- a/sam.c +++ b/sam.c @@ -598,9 +598,19 @@ int bam_set1(bam1_t *bam, } cp += n_cigar * 4; - for (i = 0; i + 1 < l_seq; i += 2) { - *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; +#define NN 16 + const uint8_t *useq = (uint8_t *)seq; + for (i = 0; i + NN < l_seq; i += NN) { + int j; + const uint8_t *u2 = useq+i; + for (j = 0; j < NN/2; j++) + cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]]; + cp += NN/2; } + for (; i + 1 < l_seq; i += 2) { + *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]]; + } + for (; i < l_seq; i++) { *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; } From c1634e743aab4822e05fbb7dc41fd6ab21ec6982 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 15 Mar 2023 17:30:41 +0000 Subject: [PATCH 10/70] Remove CRAM 3.1 warning. The code would now trigger on 3.2 as well as 4.x, although 3.2 doesn't exist. It's easier to check this way though as a future-proof. --- cram/cram_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index d3c39e47a..4f5aab44c 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5776,7 +5776,7 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { return -1; } - if (major > 3 || (major == 3 && minor > 0)) { + if (major > 3 || (major == 3 && minor > 1)) { hts_log_warning( "CRAM version %s is still a draft and subject to change.\n" "This is a technology demonstration that should not be " From ffd74ecbfdd527e30da4a72e3ecfe069805012a2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Mar 2023 17:35:10 +0100 Subject: [PATCH 11/70] Remove use of sprintf() from HTSlib source None of these instances were really a problem, but using it upsets some downstream packagers (notably R). 
The easiest way to keep them happy is to stop using it and (mostly) switch to snprintf() instead. Also remove some code from hfile_s3's escape_query() which could never be executed. --- cram/cram_index.c | 21 ++++++++++--------- cram/cram_io.c | 6 +++--- cram/open_trace_file.c | 2 +- hfile_s3.c | 40 +++++++++++++++++------------------- kstring.c | 2 +- plugin.c | 2 +- sam.c | 20 +++++++++--------- test/hfile.c | 2 +- test/sam.c | 2 +- test/test-regidx.c | 22 ++++++++++---------- test/test_mod.c | 46 +++++++++++++++++++++--------------------- test/test_view.c | 3 ++- textutils.c | 4 ++-- 13 files changed, 87 insertions(+), 85 deletions(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 846dc709d..0cc606f42 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -656,9 +656,10 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - ref, ref_start, ref_end - ref_start + 1, - (int64_t)cpos, landmark, sz); + snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) return -4; } @@ -669,9 +670,10 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - ref, ref_start, ref_end - ref_start + 1, - (int64_t)cpos, landmark, sz); + snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) return -4; } @@ -701,9 +703,10 @@ int cram_index_slice(cram_fd *fd, if (s->hdr->ref_seq_id == -2) { ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); } else { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - s->hdr->ref_seq_id, s->hdr->ref_seq_start, - s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); + 
snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + s->hdr->ref_seq_id, s->hdr->ref_seq_start, + s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; } diff --git a/cram/cram_io.c b/cram/cram_io.c index 4f5aab44c..ca226e29c 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2531,7 +2531,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { /* Only the reference file provided. Get the index file name from it */ if (!(r->fn = string_dup(r->pool, fn))) goto err; - sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn); + snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn); } } @@ -4816,7 +4816,7 @@ static void full_path(char *out, char *in) { strncpy(out, in, PATH_MAX-1); out[PATH_MAX-1] = 0; } else { - int len; + size_t len; // unable to get dir or out+in is too long if (!getcwd(out, PATH_MAX) || @@ -4826,7 +4826,7 @@ static void full_path(char *out, char *in) { return; } - sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in); + snprintf(out+len, PATH_MAX - len, "/%s", in); // FIXME: cope with `pwd`/../../../foo.fa ? 
} diff --git a/cram/open_trace_file.c b/cram/open_trace_file.c index 1518396d7..4d617b736 100644 --- a/cram/open_trace_file.c +++ b/cram/open_trace_file.c @@ -242,7 +242,7 @@ static char *expand_path(const char *file, char *dirname, int max_s_digits) { /* Special case for "./" or absolute filenames */ if (*file == '/' || (len==1 && *dirname == '.')) { - sprintf(path, "%s", file); + memcpy(path, file, lenf + 1); } else { /* Handle %[0-9]*s expansions, if required */ char *path_end = path; diff --git a/hfile_s3.c b/hfile_s3.c index ce83875c9..2ce7feb4b 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -451,12 +451,12 @@ static int auth_header_callback(void *ctx, char ***hdrs) { /* like a escape path but for query strings '=' and '&' are untouched */ static char *escape_query(const char *qs) { - size_t i, j = 0, length; + size_t i, j = 0, length, alloced; char *escaped; length = strlen(qs); - - if ((escaped = malloc(length * 3 + 1)) == NULL) { + alloced = length * 3 + 1; + if ((escaped = malloc(alloced)) == NULL) { return NULL; } @@ -467,29 +467,25 @@ static char *escape_query(const char *qs) { c == '_' || c == '-' || c == '~' || c == '.' || c == '/' || c == '=' || c == '&') { escaped[j++] = c; } else { - sprintf(escaped + j, "%%%02X", c); + snprintf(escaped + j, alloced - j, "%%%02X", c); j += 3; } } - if (i != length) { - // in the case of a '?' copy the rest of the qs across unchanged - strcpy(escaped + j, qs + i); - } else { - escaped[j] = '\0'; - } + escaped[j] = '\0'; return escaped; } static char *escape_path(const char *path) { - size_t i, j = 0, length; + size_t i, j = 0, length, alloced; char *escaped; length = strlen(path); + alloced = length * 3 + 1; - if ((escaped = malloc(length * 3 + 1)) == NULL) { + if ((escaped = malloc(alloced)) == NULL) { return NULL; } @@ -502,7 +498,7 @@ static char *escape_path(const char *path) { c == '_' || c == '-' || c == '~' || c == '.' 
|| c == '/') { escaped[j++] = c; } else { - sprintf(escaped + j, "%%%02X", c); + snprintf(escaped + j, alloced - j, "%%%02X", c); j += 3; } } @@ -842,14 +838,14 @@ AWS S3 sig version 4 writing code ****************************************************************/ -static void hash_string(char *in, size_t length, char *out) { +static void hash_string(char *in, size_t length, char *out, size_t out_len) { unsigned char hashed[SHA256_DIGEST_BUFSIZE]; int i, j; s3_sha256((const unsigned char *)in, length, hashed); for (i = 0, j = 0; i < SHA256_DIGEST_BUFSIZE; i++, j+= 2) { - sprintf(out + j, "%02x", hashed[i]); + snprintf(out + j, out_len - j, "%02x", hashed[i]); } } @@ -866,7 +862,7 @@ static void ksfree(kstring_t *s) { } -static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *signature_string) { +static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *signature_string, size_t sig_string_len) { unsigned char date_key[SHA256_DIGEST_BUFSIZE]; unsigned char date_region_key[SHA256_DIGEST_BUFSIZE]; unsigned char date_region_service_key[SHA256_DIGEST_BUFSIZE]; @@ -893,7 +889,7 @@ static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *sig s3_sign_sha256(signing_key, len, (const unsigned char *)string_to_sign->s, string_to_sign->l, signature, &len); for (i = 0, j = 0; i < len; i++, j+= 2) { - sprintf(signature_string + j, "%02x", signature[i]); + snprintf(signature_string + j, sig_string_len - j, "%02x", signature[i]); } ksfree(&secret_access_key); @@ -945,7 +941,7 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten goto cleanup; } - hash_string(canonical_request.s, canonical_request.l, cr_hash); + hash_string(canonical_request.s, canonical_request.l, cr_hash, sizeof(cr_hash)); ksprintf(&scope, "%s/%s/s3/aws4_request", ad->date_short, ad->region.s); @@ -959,7 +955,7 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten goto cleanup; } - if 
(make_signature(ad, &string_to_sign, signature_string)) { + if (make_signature(ad, &string_to_sign, signature_string, sizeof(signature_string))) { goto cleanup; } @@ -1094,10 +1090,10 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co } if (content) { - hash_string(content->s, content->l, content_hash); + hash_string(content->s, content->l, content_hash, sizeof(content_hash)); } else { // empty hash - hash_string("", 0, content_hash); + hash_string("", 0, content_hash, sizeof(content_hash)); } ad->canonical_query_string.l = 0; @@ -1166,7 +1162,7 @@ static int v4_auth_header_callback(void *ctx, char ***hdrs) { return copy_auth_headers(ad, hdrs); } - hash_string("", 0, content_hash); // empty hash + hash_string("", 0, content_hash, sizeof(content_hash)); // empty hash ad->canonical_query_string.l = 0; diff --git a/kstring.c b/kstring.c index 9b2d60c1f..71facf975 100644 --- a/kstring.c +++ b/kstring.c @@ -57,7 +57,7 @@ int kputd(double d, kstring_t *s) { if (ks_resize(s, s->l + 50) < 0) return EOF; // We let stdio handle the exponent cases - int s2 = sprintf(s->s + s->l, "%g", d); + int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); len += s2; s->l += s2; return len; diff --git a/plugin.c b/plugin.c index cec5beefd..670081f84 100644 --- a/plugin.c +++ b/plugin.c @@ -210,7 +210,7 @@ const char *hts_plugin_path(void) { } static char s_path[1024]; - sprintf(s_path, "%.1023s", ks.s ? ks.s : ""); + snprintf(s_path, sizeof(s_path), "%s", ks.s ? 
ks.s : ""); free(ks.s); return s_path; diff --git a/sam.c b/sam.c index 9f415dbc6..c8daa9683 100644 --- a/sam.c +++ b/sam.c @@ -5392,20 +5392,22 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, for (j = 0; j < nm; j++) { char qual[20]; if (mod[j].qual >= 0) - sprintf(qual, "%d", mod[j].qual); + snprintf(qual, sizeof(qual), "%d", mod[j].qual); else *qual=0; if (mod[j].modified_base < 0) // ChEBI - indel += sprintf(&ins->s[indel], "%c(%d)%s", - "+-"[mod[j].strand], - -mod[j].modified_base, - qual); + indel += snprintf(&ins->s[indel], ins->m - indel, + "%c(%d)%s", + "+-"[mod[j].strand], + -mod[j].modified_base, + qual); else - indel += sprintf(&ins->s[indel], "%c%c%s", - "+-"[mod[j].strand], - mod[j].modified_base, - qual); + indel += snprintf(&ins->s[indel], ins->m - indel, + "%c%c%s", + "+-"[mod[j].strand], + mod[j].modified_base, + qual); } ins->s[indel++] = ']'; ins->l += indel - o_indel; // grow by amount we used diff --git a/test/hfile.c b/test/hfile.c index f6ba0d7cf..8f06a971f 100644 --- a/test/hfile.c +++ b/test/hfile.c @@ -176,7 +176,7 @@ int main(void) original = slurp("vcf.c"); for (i = 1; i <= 6; i++) { char *text; - sprintf(buffer, "test/hfile%d.tmp", i); + snprintf(buffer, sizeof(buffer), "test/hfile%d.tmp", i); text = slurp(buffer); if (strcmp(original, text) != 0) { fprintf(stderr, "%s differs from vcf.c\n", buffer); diff --git a/test/sam.c b/test/sam.c index 28ca1bc5f..eb404bd65 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1504,7 +1504,7 @@ static void faidx1(const char *filename) fin = fopen(filename, "rb"); if (fin == NULL) fail("can't open %s", filename); - sprintf(tmpfilename, "%s.tmp", filename); + snprintf(tmpfilename, sizeof(tmpfilename), "%s.tmp", filename); fout = fopen(tmpfilename, "wb"); if (fout == NULL) fail("can't create temporary %s", tmpfilename); while (fgets(line, sizeof line, fin)) { diff --git a/test/test-regidx.c b/test/test-regidx.c index 90e7244d1..4cad440c7 100644 --- a/test/test-regidx.c +++ b/test/test-regidx.c @@ 
-304,20 +304,20 @@ void test_explicit(char *tgt, char *qry, char *exp) regidx_destroy(idx); } -void create_line_bed(char *line, char *chr, int start, int end) +void create_line_bed(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s\t%d\t%d\n",chr,start-1,end); + snprintf(line,size,"%s\t%d\t%d\n",chr,start-1,end); } -void create_line_tab(char *line, char *chr, int start, int end) +void create_line_tab(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s\t%d\t%d\n",chr,start,end); + snprintf(line,size,"%s\t%d\t%d\n",chr,start,end); } -void create_line_reg(char *line, char *chr, int start, int end) +void create_line_reg(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s:%d-%d\n",chr,start,end); + snprintf(line,size,"%s:%d-%d\n",chr,start,end); } -typedef void (*set_line_f)(char *line, char *chr, int start, int end); +typedef void (*set_line_f)(char *line, size_t size, char *chr, int start, int end); void test(set_line_f set_line, regidx_parse_f parse) { @@ -329,17 +329,17 @@ void test(set_line_f set_line, regidx_parse_f parse) for (i=1; icore.l_qseq; i++) { - char line[8192], *lp = line; + char line[8192], *lp = line, *ep = line + sizeof(line); n = bam_mods_at_next_pos(b, m, mods, 5); - lp += sprintf(lp, "%d\t%c\t", - i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); + lp += snprintf(lp, ep - lp, "%d\t%c\t", + i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); for (j = 0; j < n && j < 5; j++) { if (extended) { int m_strand, m_implicit; @@ -134,18 +134,18 @@ int main(int argc, char **argv) { m_canonical != mods[j].canonical_base || m_strand != mods[j].strand) goto err; - lp += sprintf(lp, "%c%c%s%c%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - "?."[m_implicit], - mods[j].qual); + lp += snprintf(lp, ep - lp, "%c%c%s%c%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + mods[j].qual); } else { - lp += 
sprintf(lp, "%c%c%s%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - mods[j].qual); + lp += snprintf(lp, ep - lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); } } *lp++ = '\n'; @@ -172,15 +172,15 @@ int main(int argc, char **argv) { int pos; while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { - char line[8192]={0}, *lp = line; - lp += sprintf(lp, "%d\t%c\t", pos, - seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); + char line[8192]={0}, *lp = line, *ep = line + sizeof(line); + lp += snprintf(lp, ep - lp, "%d\t%c\t", pos, + seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); for (j = 0; j < n && j < 5; j++) { - lp += sprintf(lp, "%c%c%s%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - mods[j].qual); + lp += snprintf(lp, ep - lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); } *lp++ = '\n'; *lp++ = 0; diff --git a/test/test_view.c b/test/test_view.c index f33c1cdf0..02d109297 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -362,7 +362,8 @@ int main(int argc, char *argv[]) } strcpy(modew, "w"); - if (opts.clevel >= 0 && opts.clevel <= 9) sprintf(modew + 1, "%d", opts.clevel); + if (opts.clevel >= 0 && opts.clevel <= 9) + snprintf(modew + 1, sizeof(modew) - 1, "%d", opts.clevel); if (opts.flag & WRITE_CRAM) strcat(modew, "c"); else if (opts.flag & WRITE_BINARY_COMP) strcat(modew, "b"); else if (opts.flag & WRITE_COMPRESSED) strcat(modew, "z"); diff --git a/textutils.c b/textutils.c index 53a3b252d..0cc2af818 100644 --- a/textutils.c +++ b/textutils.c @@ -453,7 +453,7 @@ const char * hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len) { const char *slim = (len < SIZE_MAX)? &s[len] : NULL; - char *t = buf; + char *t = buf, *bufend = buf + buflen; size_t qlen = quote? 
1 : 0; if (quote) *t++ = quote; @@ -482,7 +482,7 @@ hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len) } if (clen == 4) { - sprintf(t, "\\x%02X", (unsigned char) c); + snprintf(t, bufend - t, "\\x%02X", (unsigned char) c); t += clen; } else { From 7ed911e0ba88bb85688ece3dfb4b5a43186ede9b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 30 Mar 2023 09:40:01 +0100 Subject: [PATCH 12/70] Trivial fix to expr, removing "^". Fixes #1592 --- version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sh b/version.sh index 65d1ccae6..ba900bdf1 100755 --- a/version.sh +++ b/version.sh @@ -48,7 +48,7 @@ then v1=`expr "$VERSION" : '\([0-9]*\)'` v2=`expr "$VERSION" : '[0-9]*.\([0-9]*\)'` v3=`expr "$VERSION" : '[0-9]*.[0-9]*.\([0-9]*\)'` - if [ -z "`expr "$VERSION" : '^\([0-9.]*\)$'`" ] + if [ -z "`expr "$VERSION" : '\([0-9.]*\)$'`" ] then VERSION="$v1.$v2.255" else From 26f4d4465c9452ca698be79809d7a00d4c377d54 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 15 Mar 2023 11:45:49 +0000 Subject: [PATCH 13/70] Make SIMD tests work when building multiarch binaries MacOS multiarch binaries compile source code for each architecture and then join them together using 'lipo'. This means architecture specific code both in the actual source and configure tests need to be compilable on both architectures. Switch the configure tests and hts_probe_cc.sh so that they check if a given flag is needed to compile the test code instead of just testing to see if the flag works. By adding #ifdef __x86_64__ guards around the test code, compilation will work on non-x86_64 returning the result that no special compiler flag is needed. Similar #ifdef guards are added to the source files so that the SIMD-specific code only gets compiled for x86_64. The htscodecs submodule is updated to pull in these source file changes. The SIMD parts of built-in htscodecs are now compiled unconditionally. 
Tests for NEON have also been removed as they weren't really doing anything. The configure and hts_probe_cc.sh are adjusted to exactly match those used by htscodecs' configure, for ease of maintenance. --- Makefile | 16 +++- configure.ac | 122 ++++++++++++++++----------- hts_probe_cc.sh | 103 ++++++++++++++-------- htscodecs | 2 +- htscodecs_bundled.mk | 8 +- m4/ax_check_compile_flag.m4 | 53 ------------ m4/hts_check_compile_flags_needed.m4 | 63 ++++++++++++++ 7 files changed, 218 insertions(+), 149 deletions(-) delete mode 100644 m4/ax_check_compile_flag.m4 create mode 100644 m4/hts_check_compile_flags_needed.m4 diff --git a/Makefile b/Makefile index 3e95a0bef..9b7f7f2f4 100644 --- a/Makefile +++ b/Makefile @@ -126,10 +126,18 @@ srcdir = . srcprefix = HTSPREFIX = +# Flags for SIMD code HTS_CFLAGS_AVX2 = HTS_CFLAGS_AVX512 = HTS_CFLAGS_SSE4 = +# Control building of SIMD code. Not used if configure has been run. +HTS_BUILD_AVX2 = +HTS_BUILD_AVX512 = +HTS_BUILD_SSSE3 = +HTS_BUILD_POPCNT = +HTS_BUILD_SSE4_1 = + include htslib_vars.mk include htscodecs.mk @@ -274,7 +282,9 @@ config.h: echo '#endif' >> $@ echo '#define HAVE_DRAND48 1' >> $@ echo '#define HAVE_LIBCURL 1' >> $@ - if [ "x$(HTS_CFLAGS_SSE4)" != "x" ] ; then \ + if [ "x$(HTS_BUILD_POPCNT)" != "x" ] && \ + [ "x$(HTS_BUILD_SSE4_1)" != "x" ] && \ + [ "x$(HTS_BUILD_SSSE3)" != "x" ]; then \ echo '#define HAVE_POPCNT 1' >> $@ ; \ echo '#define HAVE_SSE4_1 1' >> $@ ; \ echo '#define HAVE_SSSE3 1' >> $@ ; \ @@ -282,10 +292,10 @@ config.h: echo '#define UBSAN 1' >> $@ ; \ echo '#endif' >> $@ ; \ fi - if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \ + if [ "x$(HTS_BUILD_AVX2)" != "x" ] ; then \ echo '#define HAVE_AVX2 1' >> $@ ; \ fi - if [ "x$(HTS_CFLAGS_AVX512)" != "x" ] ; then \ + if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \ echo '#define HAVE_AVX512 1' >> $@ ; \ fi diff --git a/configure.ac b/configure.ac index 98b0a44c7..ff2367c1b 100644 --- a/configure.ac +++ b/configure.ac @@ -30,7 +30,7 @@ AC_CONFIG_SRCDIR(hts.c) 
AC_CONFIG_HEADERS(config.h) m4_include([m4/hts_prog_cc_warnings.m4]) -m4_include([m4/ax_check_compile_flag.m4]) +m4_include([m4/hts_check_compile_flags_needed.m4]) m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) @@ -71,16 +71,53 @@ dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) HTS_PROG_CC_WERROR(hts_late_cflags) dnl Check for various compiler flags to enable SIMD features -dnl Options for rANS32x16 sse4.1 version -AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [ - hts_cflags_sse4="-mssse3 -mpopcnt -msse4.1" - AC_SUBST([hts_cflags_sse4]) - AC_DEFINE([HAVE_SSSE3],1, - [Defined to 1 if the compiler can issue SSSE3 instructions.]) - AC_DEFINE([HAVE_POPCNT],1, - [Defined to 1 if the compiler can issue popcnt instructions.]) - AC_DEFINE([HAVE_SSE4_1],1, - [Defined to 1 if the compiler can issue SSE4.1 instructions.]) +dnl Options for rANS32x16 sse4.1 version - ssse3 +hts_cflags_sse4="" +HTS_CHECK_COMPILE_FLAGS_NEEDED([ssse3], [-mssse3], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_shuffle_epi8(a, b); + return *((char *) &c); + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_SSSE3],1,[Defined to 1 if rANS source using SSSE3 can be compiled.]) +]) + +dnl Options for rANS32x16 sse4.1 version - popcnt +HTS_CHECK_COMPILE_FLAGS_NEEDED([popcnt], [-mpopcnt], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + unsigned int i = _mm_popcnt_u32(1); + return i != 1; + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.]) +]) + +dnl Options for rANS32x16 sse4.1 version - sse4.1 +HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + 
#endif + ]],[[ + #ifdef __x86_64__ + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + return *((char *) &c); + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled. +]) dnl Propagate HTSlib's unaligned access preference to htscodecs AH_VERBATIM([UBSAN],[ /* Prevent unaligned access in htscodecs SSE4 rANS codec */ @@ -89,60 +126,43 @@ dnl Propagate HTSlib's unaligned access preference to htscodecs #endif ]) AC_DEFINE([UBSAN],1,[]) - ], [], [], [AC_LANG_PROGRAM([[ - #include "x86intrin.h" - ]],[[ - unsigned int i = _mm_popcnt_u32(1); - __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); - __m128i c = _mm_max_epu32(a, b); - b = _mm_shuffle_epi8(a, c); - return *((char *) &b); - ]])]) +]) +AC_SUBST([hts_cflags_sse4]) dnl Options for rANS32x16 avx2 version -AX_CHECK_COMPILE_FLAG([-mavx2], [ - hts_cflags_avx2="-mavx2" - AC_SUBST([hts_cflags_avx2]) - AC_DEFINE([HAVE_AVX2],1, - [Defined to 1 if the compiler can issue AVX2 instructions.]) - ], [], [], [AC_LANG_PROGRAM([[ - #include "x86intrin.h" +HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif ]],[[ + #ifdef __x86_64__ __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); __m256i b = _mm256_add_epi32(a, a); long long c = _mm256_extract_epi64(b, 0); return (int) c; - ]])]) + #endif + ]])], [ + hts_cflags_avx2="$flags_needed" + AC_SUBST([hts_cflags_avx2]) + AC_DEFINE([HAVE_AVX2],1,[Defined to 1 if rANS source using AVX2 can be compiled.]) +]) dnl Options for rANS32x16 avx512 version -AX_CHECK_COMPILE_FLAG([-mavx512f], [ - hts_cflags_avx512="-mavx512f" - AC_SUBST([hts_cflags_avx512]) - AC_DEFINE([HAVE_AVX512],1, - [Defined to 1 if the compiler can issue AVX512 instructions.]) - ], [], [], [AC_LANG_PROGRAM([[ +HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], 
[AC_LANG_PROGRAM([[ + #ifdef __x86_64__ #include "x86intrin.h" + #endif ]],[[ + #ifdef __x86_64__ __m512i a = _mm512_set1_epi32(1); __m512i b = _mm512_add_epi32(a, a); return *((char *) &b); - ]])]) - -dnl Detect ARM Neon availability -AC_CACHE_CHECK([whether C compiler supports ARM Neon], [hts_cv_have_neon], [ - AC_COMPILE_IFELSE([ - AC_LANG_PROGRAM([[ - #include "arm_neon.h" - ]], [[ - int32x4_t a = vdupq_n_s32(1); - int32x4_t b = vaddq_s32(a, a); - return *((char *) &b); - ]])], [hts_cv_have_neon=yes], [hts_cv_have_neon=no])]) -if test "$hts_cv_have_neon" = yes; then - hts_have_neon=yes - AC_SUBST([hts_have_neon]) -fi - + #endif + ]])], [ + hts_cflags_avx512="$flags_needed" + AC_SUBST([hts_cflags_avx512]) + AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.]) +]) dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh index 37d6bae7e..5e5ddec1e 100755 --- a/hts_probe_cc.sh +++ b/hts_probe_cc.sh @@ -43,30 +43,76 @@ run_compiler () return $retval } +# Run a test. $1 is the flag to try, $2 is the Makefile variable to set +# with the flag probe result, $3 is a Makefile variable which will be +# set to 1 if the code was built successfully. The code to test should +# be passed in via fd 0. +# First try compiling conftest.c without the flag. If that fails, try +# again with it to see if the flag is needed. +run_test () +{ + rm -f conftest conftest.err conftest.c + cat - > conftest.c + if run_compiler ; then + echo "$2 =" + echo "$3 = 1" + elif run_compiler "$1" ; then + echo "$2 = $1" + echo "$3 = 1" + else + echo "$3 =" + fi +} + echo "# Compiler probe results, generated by $0" -# Check for sse4.1 etc. 
support +# Check for ssse3 +run_test "-mssse3" HTS_CFLAGS_SSSE3 HTS_BUILD_SSSE3 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_shuffle_epi8(a, b); + return *((char *) &c); +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF -rm -f conftest conftest.err conftest.c -cat - <<'EOF' > conftest.c +# Check for popcnt +run_test "-mpopcnt" HTS_CFLAGS_POPCNT HTS_BUILD_POPCNT <<'EOF' +#ifdef __x86_64__ #include "x86intrin.h" int main(int argc, char **argv) { unsigned int i = _mm_popcnt_u32(1); - __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); + return i != 1; +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF + +# Check for sse4.1 etc. support +run_test "-msse4.1" HTS_CFLAGS_SSE4_1 HTS_BUILD_SSE4_1 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); __m128i c = _mm_max_epu32(a, b); - b = _mm_shuffle_epi8(a, c); - return *((char *) &b); + return *((char *) &c); } +#else +int main(int argc, char **argv) { return 0; } +#endif EOF -FLAGS="-mpopcnt -msse4.1 -mssse3" -if run_compiler "$FLAGS" ; then - echo "HTS_CFLAGS_SSE4 = $FLAGS" -fi + +echo 'HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSSE3) $(HTS_CFLAGS_POPCNT) $(HTS_CFLAGS_SSE4_1)' # Check for avx2 -rm -f conftest.c -cat - <<'EOF' > conftest.c +run_test -mavx2 HTS_CFLAGS_AVX2 HTS_BUILD_AVX2 <<'EOF' +#ifdef __x86_64__ #include "x86intrin.h" int main(int argc, char **argv) { __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); @@ -74,41 +120,24 @@ int main(int argc, char **argv) { long long c = _mm256_extract_epi64(b, 0); return (int) c; } +#else +int main(int argc, char **argv) { return 0; } +#endif EOF -FLAGS="-mavx2" -if run_compiler "$FLAGS" ; then - echo "HTS_CFLAGS_AVX2 = $FLAGS" -fi # Check for avx512 -rm -f conftest.c -cat - <<'EOF' > 
conftest.c +run_test -mavx512f HTS_CFLAGS_AVX512 HTS_BUILD_AVX512 <<'EOF' +#ifdef __x86_64__ #include "x86intrin.h" int main(int argc, char **argv) { __m512i a = _mm512_set1_epi32(1); __m512i b = _mm512_add_epi32(a, a); return *((char *) &b); } +#else +int main(int argc, char **argv) { return 0; } +#endif EOF -FLAGS="-mavx512f" -if run_compiler "$FLAGS" ; then - echo "HTS_CFLAGS_AVX512 = $FLAGS" -fi - -# Check for neon - -rm -f conftest.c -cat - <<'EOF' > conftest.c -#include "arm_neon.h" -int main(int argc, char **argv) { - int32x4_t a = vdupq_n_s32(1); - int32x4_t b = vaddq_s32(a, a); - return *((char *) &b); -} -EOF -if run_compiler "" ; then - echo "HTS_HAVE_NEON = yes" -fi rm -f conftest.c diff --git a/htscodecs b/htscodecs index cd0737fff..d4aed5859 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit cd0737fff5893b0842b047da5aa3209e5f65442c +Subproject commit d4aed585929e2dab9dd8e6a2b74484dfc347c0f2 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 91a9c39e9..6274350f5 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -28,10 +28,10 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ - $(if $(HTS_CFLAGS_AVX2),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c) \ - $(if $(HTS_CFLAGS_AVX512),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c) \ - $(if $(HTS_CFLAGS_SSE4),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c) \ - $(if $(HTS_HAVE_NEON),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c) \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \ 
$(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ $(HTSPREFIX)htscodecs/htscodecs/rle.c \ diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4 deleted file mode 100644 index 16bb46495..000000000 --- a/m4/ax_check_compile_flag.m4 +++ /dev/null @@ -1,53 +0,0 @@ -# =========================================================================== -# https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) -# -# DESCRIPTION -# -# Check whether the given FLAG works with the current language's compiler -# or gives an error. (Warnings, however, are ignored) -# -# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on -# success/failure. -# -# If EXTRA-FLAGS is defined, it is added to the current language's default -# flags (e.g. CFLAGS) when the check is done. The check is thus made with -# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to -# force the compiler to issue an error when a bad flag is given. -# -# INPUT gives an alternative input source to AC_COMPILE_IFELSE. -# -# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this -# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. -# -# LICENSE -# -# Copyright (c) 2008 Guido U. Draheim -# Copyright (c) 2011 Maarten Bosmans -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. 
- -#serial 6 - -AC_DEFUN([AX_CHECK_COMPILE_FLAG], -[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF -AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl -AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ - ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS - _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" - AC_LINK_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], - [AS_VAR_SET(CACHEVAR,[yes])], - [AS_VAR_SET(CACHEVAR,[no])]) - _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) -AS_VAR_IF(CACHEVAR,yes, - [m4_default([$2], :)], - [m4_default([$3], :)]) -AS_VAR_POPDEF([CACHEVAR])dnl -])dnl AX_CHECK_COMPILE_FLAGS diff --git a/m4/hts_check_compile_flags_needed.m4 b/m4/hts_check_compile_flags_needed.m4 new file mode 100644 index 000000000..fb668e86f --- /dev/null +++ b/m4/hts_check_compile_flags_needed.m4 @@ -0,0 +1,63 @@ +# hts_check_compile_flags_needed.m4 +# +# SYNOPSIS +# +# HTS_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAGS, [INPUT], [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# Check whether the given FLAGS are required to build and link INPUT with +# the current language's compiler. Compilation and linking are first +# tried without FLAGS. If that fails it then tries to compile and +# link again with FLAGS. +# +# FEATURE describes the feature being tested, and is used when printing +# messages and to name the cache entry (along with the tested flags). +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. In ACTION-SUCCESS, $flags_needed will be set to +# either an empty string or FLAGS depending on the test results. +# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given.
+# +# If omitted, INPUT defaults to AC_LANG_PROGRAM(), although that probably +# isn't very useful. +# +# NOTE: Implementation based on AX_CHECK_COMPILE_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim +# Copyright (c) 2011 Maarten Bosmans +# Copyright (c) 2023 Robert Davies +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +# HTS_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAGS, [INPUT], [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) + +AC_DEFUN([HTS_CHECK_COMPILE_FLAGS_NEEDED], +[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF +AS_VAR_PUSHDEF([CACHEVAR],[hts_cv_check_[]_AC_LANG_ABBREV[]flags_needed_$1_$6_$2])dnl +AC_CACHE_CHECK([_AC_LANG compiler flags needed for $1], CACHEVAR, [ + AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[none])], + [ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $6 $2" + AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[$2])], + [AS_VAR_SET(CACHEVAR,[unsupported])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])]) +AS_VAR_IF(CACHEVAR,unsupported, [ + m4_default([$5], :) +], [ + AS_VAR_IF(CACHEVAR,none,[flags_needed=""], [flags_needed="$CACHEVAR"]) + m4_default([$4], :) +]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl HTS_CHECK_COMPILE_FLAGS_NEEDED From 3e0fd29c75fcabaa01fa5f0f34adb262c1825371 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 22 Mar 2023 17:22:49 +0000 Subject: [PATCH 14/70] Make MacOS tests build a multiarch version of the library --- .cirrus.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index dc93b071d..fc4405b08 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,7 @@ libdeflate_template: &LIBDEFLATE pushd "$HOME" git clone --depth 1
https://github.com/ebiggers/libdeflate.git pushd libdeflate - cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS='-g -O3 -fPIC' + cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS="-g -O3 -fPIC $LIBDEFLATE_CFLAGS" cmake --build build --verbose popd popd @@ -186,6 +186,9 @@ macosx_task: environment: CC: clang + CFLAGS: "-Wall -arch arm64 -arch x86_64" + LDFLAGS: "-arch arm64 -arch x86_64" + LIBDEFLATE_CFLAGS: "-arch arm64 -arch x86_64" LC_ALL: C CIRRUS_CLONE_DEPTH: 1 From 93434e041c53860398044ab6c0735230b389278b Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Mar 2023 14:36:29 +0000 Subject: [PATCH 15/70] Fix bug where bin number could overflow when looking for max_off When searching for `max_off`, hts_itr_query() and hts_itr_multi_bam() look for a bin to the right of the end of the region. For whole chromosomes, this would be HTS_POS_MAX, which is far beyond the maximum bin position supported. The `bin` calculation overflowed leading to either a negative bin number or an incorrect positive one, depending on the number of levels in the index. Negative bin numbers simply caused time to be wasted as the search loop eventually counted up to zero, but incorrect positive ones could cause the iterator to finish too early. Fix by catching the out-of-bounds case and setting max_off to UINT64_MAX, which should be used for bins beyond the end of the indexable range.
--- hts.c | 62 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/hts.c b/hts.c index 86b5bb877..0d5abf53a 100644 --- a/hts.c +++ b/hts.c @@ -3145,17 +3145,24 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin >= idx->n_bins) bin = 0; - while (1) { - // search for an extant bin by moving right, but moving up to the - // parent whenever we get to a first child (which also covers falling - // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } - bin++; + // First check if end lies within the range of the index (it won't + // if it's HTS_POS_MAX) + if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + // search for an extant bin by moving right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = UINT64_MAX; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } + bin++; + } + } else { + // Searching to end of reference + max_off = UINT64_MAX; } // retrieve bins @@ -3314,20 +3321,27 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) } // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin 
>= idx->n_bins) bin = 0; - while (1) { - // search for an extant bin by moving right, but moving up to the - // parent whenever we get to a first child (which also covers falling - // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { - max_off = kh_val(bidx, k).list[0].u; - break; + // First check if end lies within the range of the index (it + // won't if it's HTS_POS_MAX) + if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + // search for an extant bin by moving right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = UINT64_MAX; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { + max_off = kh_val(bidx, k).list[0].u; + break; + } + bin++; } - bin++; + } else { + // Searching to end of reference + max_off = UINT64_MAX; } //convert coordinates to file offsets From a616e851373a73cdd30ae69ad92351465d1419ea Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 6 Apr 2023 11:56:33 +0100 Subject: [PATCH 16/70] Add MZ:i tag as a check for base modification validity. (#1590) If a sequence is hard-clipped after calling the base modifications, then the tool may, or may not, update the MM and ML tags accordingly. We have no way of distinguishing these two cases. While the base modification parsing code already detects overflows where the coordinates go beyond the sequence end, this isn't fool proof, especially if the clipping is short. 
So instead we have an (as yet unwritten) proposal of MZ:i tag holding the sequence length, to be written at the same time as the MM and ML tags. This can then be used as a sanity check later on, to detect cases where the sequence has changed length via a tool that is unaware of base modifications. TODO: as a separate PR, we should add a new API that can trim bases off the start/end of MM/ML strings to make it trivial for tools that are doing hard clipping via htslib. (Indeed we don't even have an API for SEQ/QUAL either, so it can do all together). This would make it far easier for people to keep everything in sync, and this code could then also update MZ while it's at it. That's new API though so it can arrive as a separate commit. See https://github.com/samtools/hts-specs/issues/646 --- sam.c | 30 ++++++++++++++++++++++-------- test/base_mods/MM-MZf1.sam | 5 +++++ test/base_mods/MM-MZf2.sam | 5 +++++ test/base_mods/MM-MZp.sam | 5 +++++ test/base_mods/MM-multi.sam | 2 +- test/base_mods/base-mods.sh | 1 + test/base_mods/base-mods.tst | 6 ++++++ test/pileup_mod.c | 7 ++++--- 8 files changed, 49 insertions(+), 12 deletions(-) create mode 100644 test/base_mods/MM-MZf1.sam create mode 100644 test/base_mods/MM-MZf2.sam create mode 100644 test/base_mods/MM-MZp.sam diff --git a/sam.c b/sam.c index c8daa9683..8f135c333 100644 --- a/sam.c +++ b/sam.c @@ -6221,14 +6221,24 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { if (!mm) return 0; if (mm[0] != 'Z') { - hts_log_error("MM tag is not of type Z"); + hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); + return -1; + } + + uint8_t *mi = bam_aux_get(b, "MZ"); + if (mi && bam_aux2i(mi) != b->core.l_qseq) { + // bam_aux2i will set errno = EINVAL and return 0 if the tag + // isn't integer, but 0 will be a seq-length mismatch anyway so + // triggers an error here too.
+ hts_log_error("%s: MM/MZ data length is incompatible with" + " SEQ length", bam_get_qname(b)); return -1; } uint8_t *ml = bam_aux_get(b, "ML"); if (!ml) ml = bam_aux_get(b, "Ml"); if (ml && (ml[0] != 'B' || ml[1] != 'C')) { - hts_log_error("ML tag is not of type B,C"); + hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); return -1; } uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; @@ -6314,7 +6324,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { delta = strtol(cp, &cp_end, 10); if (cp_end == cp) { - hts_log_error("Hit end of MM tag. Missing semicolon?"); + hts_log_error("%s: Hit end of MM tag. Missing " + "semicolon?", bam_get_qname(b)); return -1; } @@ -6343,8 +6354,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { state->implicit [mod_num] = implicit; if (delta < 0) { - hts_log_error("MM tag refers to bases beyond sequence " - "length"); + hts_log_error("%s: MM tag refers to bases beyond sequence " + "length", bam_get_qname(b)); return -1; } state->MMcount [mod_num] = delta; @@ -6359,7 +6370,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { } if (++mod_num >= MAX_BASE_MOD) { - hts_log_error("Too many base modification types"); + hts_log_error("%s: Too many base modification types", + bam_get_qname(b)); return -1; } ms++; n++; @@ -6377,7 +6389,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { } } if (ml > ml_end) { - hts_log_error("Insufficient number of entries in ML tag"); + hts_log_error("%s: Insufficient number of entries in ML " + "tag", bam_get_qname(b)); return -1; } } else { @@ -6389,7 +6402,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { cp++; } if (!*cp) { - hts_log_error("Hit end of MM tag. Missing semicolon?"); + hts_log_error("%s: Hit end of MM tag. 
Missing semicolon?", + bam_get_qname(b)); return -1; } } diff --git a/test/base_mods/MM-MZf1.sam b/test/base_mods/MM-MZf1.sam new file mode 100644 index 000000000..35074fd05 --- /dev/null +++ b/test/base_mods/MM-MZf1.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:37 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 diff --git a/test/base_mods/MM-MZf2.sam b/test/base_mods/MM-MZf2.sam new file mode 100644 index 000000000..843f93a1b --- /dev/null +++ b/test/base_mods/MM-MZf2.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:f:36 diff --git a/test/base_mods/MM-MZp.sam b/test/base_mods/MM-MZp.sam new file mode 100644 index 
000000000..836a09725 --- /dev/null +++ b/test/base_mods/MM-MZp.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 diff --git a/test/base_mods/MM-multi.sam b/test/base_mods/MM-multi.sam index b2259a09e..1c7288f50 100644 --- a/test/base_mods/MM-multi.sam +++ b/test/base_mods/MM-multi.sam @@ -3,5 +3,5 @@ @CO r2 has them combined together, for example as produced by @CO a joint basecaller which assigns probabilities to all @CO trained events simultaneously. -r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/base-mods.sh b/test/base_mods/base-mods.sh index f3f3ca4b7..388ff369e 100755 --- a/test/base_mods/base-mods.sh +++ b/test/base_mods/base-mods.sh @@ -31,5 +31,6 @@ test_mod="../test_mod" pileup_mod="../pileup_mod" test_driver $@ +rm _err.tmp _out.tmp exit $? 
diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 3809c0e6e..d246223c8 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -42,3 +42,9 @@ P MM-explicit-x.out $test_mod -x MM-explicit.sam # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam P MM-pileup2.out $pileup_mod < MM-pileup2.sam + +# Validation testing. We just care about exit status here, but the +# test data is a copy of MM-pileup.sam so that suffices too. +P MM-pileup.out $pileup_mod < MM-MZp.sam +F MM-pileup.out $pileup_mod < MM-MZf1.sam +F MM-pileup.out $pileup_mod < MM-MZf2.sam diff --git a/test/pileup_mod.c b/test/pileup_mod.c index 95c353771..323c0c6c2 100644 --- a/test/pileup_mod.c +++ b/test/pileup_mod.c @@ -73,7 +73,8 @@ void process_pileup(sam_hdr_t *h, const bam_pileup1_t *p, // as each new read is added or removed from the pileups. int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { hts_base_mod_state *m = hts_base_mod_state_alloc(); - bam_parse_basemod(b, m); + if (bam_parse_basemod(b, m) < 0) + return -1; cd->p = m; return 0; } @@ -201,7 +202,7 @@ int main(int argc, char **argv) { bam_plp_destructor(iter, pileup_cd_destroy); const bam_pileup1_t *p; - int tid, pos, n; + int tid, pos, n = 0; while ((p = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { switch (compact) { case 0: @@ -221,5 +222,5 @@ int main(int argc, char **argv) { bam_destroy1(b); sam_hdr_destroy(h); - return 0; + return n != 0; } From dba4bdf1becefd4e5632b682da017e3effc7dc17 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 28 Mar 2023 10:03:53 +0100 Subject: [PATCH 17/70] Make reg2bins faster on whole-chromosome queries It's faster to iterate through the index hash table when requesting whole chromosomes, compared to the old behaviour of iterating through all the bins that could exist and looking them up in the hash table to see if they're present. 
The latter method works better for narrow ranges though, so we choose which to use based on the number of bins covering the range compared to the number in the index. --- hts.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/hts.c b/hts.c index 0d5abf53a..89d12f66b 100644 --- a/hts.c +++ b/hts.c @@ -2903,25 +2903,80 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) ****************/ // Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<= end) return 0; - if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; - if (itr->bins.n + n > itr->bins.m) { - itr->bins.m = itr->bins.n + n; - kroundup32(itr->bins.m); - itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m); + int i; + b = t + (beg>>s); e = t + (end>>s); + for (i = b; i <= e; ++i) { + if (kh_get(bin, bidx, i) != kh_end(bidx)) { + assert(itr->bins.n < itr->bins.m); + itr->bins.a[itr->bins.n++] = i; + } } - for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i; } return itr->bins.n; } +static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) +{ + khint_t i; + hts_pos_t max_shift = 3 * n_lvls + min_shift; + --end; + if (beg < 0) beg = 0; + for (i = kh_begin(bidx); i != kh_end(bidx); i++) { + if (!kh_exist(bidx, i)) continue; + hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); + int level = hts_bin_level(bin); + if (level > n_lvls) continue; // Dodgy index? 
+ hts_pos_t first = hts_bin_first(level); + hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); + hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); + if (beg_at_level <= bin && bin <= end_at_level) { + assert(itr->bins.n < itr->bins.m); + itr->bins.a[itr->bins.n++] = bin; + } + } + return itr->bins.n; +} + +static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) +{ + int l, t, s = min_shift + (n_lvls<<1) + n_lvls; + size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins; + hts_pos_t end1; + if (end >= 1LL<= end) return 0; + end1 = end - 1; + + // Count bins to see if it's faster to iterate through the hash table + // or the set of bins covering the region + for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + reg_bin_count += (end1 >> s) - (beg >> s) + 1; + } + max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx); + if (itr->bins.m - itr->bins.n < max_bins) { + // Worst-case memory usage. May be wasteful on very sparse + // data, but the bin list usually won't be too big anyway. 
+ size_t new_m = max_bins + itr->bins.n; + if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) { + errno = ENOMEM; + return -1; + } + int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a)); + if (!new_a) return -1; + itr->bins.a = new_a; + itr->bins.m = new_m; + } + if (reg_bin_count < hash_bin_count) { + return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx); + } else { + return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx); + } +} + static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; @@ -3166,7 +3221,10 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } // retrieve bins - reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); + if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) { + hts_itr_destroy(iter); + return NULL; + } for (i = n_off = 0; i < iter->bins.n; ++i) if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) From 07638e1cac22e76c2f59c9988feabd467a15c340 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 28 Mar 2023 13:33:42 +0100 Subject: [PATCH 18/70] Make reg2intervals() faster on whole-chromosome queries As for reg2bins(), it may be faster to iterate through the hash table entries, depending on the number and how wide the region being searched is. 
--- hts.c | 137 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 105 insertions(+), 32 deletions(-) diff --git a/hts.c b/hts.c index 89d12f66b..b3f6b9a08 100644 --- a/hts.c +++ b/hts.c @@ -2977,54 +2977,127 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi } } +static inline int add_to_interval(hts_itr_t *iter, bins_t *bin, + int tid, uint32_t interval, + uint64_t min_off, uint64_t max_off) +{ + hts_pair64_max_t *off; + int j; + + if (!bin->n) + return 0; + off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off)); + if (!off) + return -2; + + iter->off = off; + for (j = 0; j < bin->n; ++j) { + if (bin->list[j].v > min_off && bin->list[j].u < max_off) { + iter->off[iter->n_off].u = min_off > bin->list[j].u + ? min_off : bin->list[j].u; + iter->off[iter->n_off].v = max_off < bin->list[j].v + ? max_off : bin->list[j].v; + // hts_pair64_max_t::max is now used to link + // file offsets to region list entries. + // The iterator can use this to decide if it + // can skip some file regions. 
+ iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval; + iter->n_off++; + } + } + return 0; +} + +static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx, + int tid, int64_t beg, int64_t end, + uint32_t interval, + uint64_t min_off, uint64_t max_off, + int min_shift, int n_lvls) +{ + int l, t, s = min_shift + n_lvls * 3; + hts_pos_t b, e, i; + + for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + b = t + (beg>>s); e = t + (end>>s); + for (i = b; i <= e; ++i) { + khint_t k = kh_get(bin, bidx, i); + if (k != kh_end(bidx)) { + bins_t *bin = &kh_value(bidx, k); + int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); + if (res < 0) + return res; + } + } + } + return 0; +} + +static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx, + int tid, int64_t beg, int64_t end, + uint32_t interval, + uint64_t min_off, uint64_t max_off, + int min_shift, int n_lvls) +{ + khint_t i; + hts_pos_t max_shift = 3 * n_lvls + min_shift; + --end; + if (beg < 0) beg = 0; + for (i = kh_begin(bidx); i != kh_end(bidx); i++) { + if (!kh_exist(bidx, i)) continue; + hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); + int level = hts_bin_level(bin); + if (level > n_lvls) continue; // Dodgy index? 
+ hts_pos_t first = hts_bin_first(level); + hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); + hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); + if (beg_at_level <= bin && bin <= end_at_level) { + bins_t *bin = &kh_value(bidx, i); + int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); + if (res < 0) + return res; + } + } + return 0; +} + static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; int i, j; - hts_pos_t b, e; - hts_pair64_max_t *off; + hts_pos_t end1; bidx_t *bidx; - khint_t k; - int start_n_off = iter->n_off; + int start_n_off; + size_t reg_bin_count = 0, hash_bin_count; + int res; if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end) return -1; + hash_bin_count = kh_n_buckets(bidx); + s = min_shift + (n_lvls<<1) + n_lvls; if (end >= 1LL<<s) end = 1LL<<s; - for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { - b = t + (beg>>s); e = t + (end>>s); - - for (i = b; i <= e; ++i) { - if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) { - bins_t *p = &kh_value(bidx, k); + end1 = end - 1; + // Count bins to see if it's faster to iterate through the hash table + // or the set of bins covering the region + for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + reg_bin_count += (end1 >> s) - (beg >> s) + 1; + } - if (p->n) { - off = realloc(iter->off, (iter->n_off + p->n) * sizeof(*off)); - if (!off) - return -2; + start_n_off = iter->n_off; - iter->off = off; - for (j = 0; j < p->n; ++j) { - if (p->list[j].v > min_off && p->list[j].u < max_off) { - iter->off[iter->n_off].u = min_off > p->list[j].u - ? min_off : p->list[j].u; - iter->off[iter->n_off].v = max_off < p->list[j].v - ? max_off : p->list[j].v; - // hts_pair64_max_t::max is now used to link - // file offsets to region list entries. - // The iterator can use this to decide if it - // can skip some file regions.
- iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval; - iter->n_off++; - } - } - } - } - } + // Populate iter->off with the intervals for this region + if (reg_bin_count < hash_bin_count) { + res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval, + min_off, max_off, min_shift, n_lvls); + } else { + res = reg2intervals_wide(iter, bidx, tid, beg, end, interval, + min_off, max_off, min_shift, n_lvls); } + if (res < 0) + return res; if (iter->n_off - start_n_off > 1) { ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off); From ca51cc55e8ea1ead5d26aa4ede90b58c07e8d893 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 6 Apr 2023 08:36:18 +0100 Subject: [PATCH 19/70] Fix typo in kh_int_hash_func2 macro. This was fixed upstream in attractivechaos/klib@384277a Fixes #1598 --- htslib/khash.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/khash.h b/htslib/khash.h index 7cd0c9b62..4cea91020 100644 --- a/htslib/khash.h +++ b/htslib/khash.h @@ -447,7 +447,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) key ^= (key >> 16); return key; } -#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* --- END OF HASH FUNCTIONS --- */ From fc2448c27362837758f5b777f25fc7d2fe420c82 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 27 Apr 2023 09:27:38 +0100 Subject: [PATCH 20/70] Switched back to openssl for Alpine. 
--- INSTALL | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/INSTALL b/INSTALL index dd2c3ec90..804593443 100644 --- a/INSTALL +++ b/INSTALL @@ -266,7 +266,9 @@ Alpine Linux ------------ doas apk update # Ensure the package list is up to date -doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev +doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev + +Note: some older Alpine versions use libressl-dev rather than openssl-dev. OpenSUSE -------- From 279cc9ec8d3fd5ad94aa0df730dac6e9ec0ead8f Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 2 May 2023 10:24:06 +1200 Subject: [PATCH 21/70] Mention in INSTALL that using plugins may need -rdynamic --- INSTALL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/INSTALL b/INSTALL index 804593443..e0fddd9d7 100644 --- a/INSTALL +++ b/INSTALL @@ -129,6 +129,9 @@ various features and specify further optional external requirements: any enabled pluggable facilities (such as libcurl file access) are built directly within HTSlib. + Programs that are statically linked to a libhts.a with plugins enabled + need to be linked using -rdynamic or a similar linker option. + The repository contains several additional plugins, including the iRODS () file access plugin previously distributed with HTSlib. From 415d2cea57feecdd904f84d8aa25390d96843630 Mon Sep 17 00:00:00 2001 From: kojix2 <2xijok@gmail.com> Date: Tue, 9 May 2023 00:07:46 +0900 Subject: [PATCH 22/70] Fix example in docs for sam_hdr_add_line --- htslib/sam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/sam.h b/htslib/sam.h index 514a6be04..a0b8438aa 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -485,7 +485,7 @@ int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len); /// Adds a single line to an existing header. /*! * Specify type and one or more key,value pairs, ending with the NULL key. - * Eg. 
sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL). + * Eg. sam_hdr_add_line(h, "SQ", "SN", "foo", "LN", "100", NULL). * * @param type Type of the added line. Eg. "SQ" * @return 0 on success, -1 on failure From 6125f1c56934745f3a4b81f6e512c11720849b55 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 2 May 2023 12:04:05 +0100 Subject: [PATCH 23/70] Rename aux tag MZ to MN. See https://github.com/samtools/hts-specs/pull/714 --- sam.c | 4 ++-- test/base_mods/{MM-MZf1.sam => MM-MNf1.sam} | 4 ++-- test/base_mods/{MM-MZf2.sam => MM-MNf2.sam} | 4 ++-- test/base_mods/{MM-MZp.sam => MM-MNp.sam} | 4 ++-- test/base_mods/MM-multi.sam | 2 +- test/base_mods/base-mods.tst | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) rename test/base_mods/{MM-MZf1.sam => MM-MNf1.sam} (84%) rename test/base_mods/{MM-MZf2.sam => MM-MNf2.sam} (84%) rename test/base_mods/{MM-MZp.sam => MM-MNp.sam} (84%) diff --git a/sam.c b/sam.c index 8f135c333..05910cf8a 100644 --- a/sam.c +++ b/sam.c @@ -6225,12 +6225,12 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { return -1; } - uint8_t *mi = bam_aux_get(b, "MZ"); + uint8_t *mi = bam_aux_get(b, "MN"); if (mi && bam_aux2i(mi) != b->core.l_qseq) { // bam_aux2i with set errno = EINVAL and return 0 if the tag // isn't integer, but 0 will be a seq-length mismatch anyway so // triggers an error here too. 
- hts_log_error("%s: MM/MZ data length is incompatible with" + hts_log_error("%s: MM/MN data length is incompatible with" " SEQ length", bam_get_qname(b)); return -1; } diff --git a/test/base_mods/MM-MZf1.sam b/test/base_mods/MM-MNf1.sam similarity index 84% rename from test/base_mods/MM-MZf1.sam rename to test/base_mods/MM-MNf1.sam index 35074fd05..f973d274c 100644 --- a/test/base_mods/MM-MZf1.sam +++ b/test/base_mods/MM-MNf1.sam @@ -1,5 +1,5 @@ @SQ SN:I LN:999 -r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:37 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:37 r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 -r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:i:36 diff --git a/test/base_mods/MM-MZf2.sam b/test/base_mods/MM-MNf2.sam similarity index 84% rename from test/base_mods/MM-MZf2.sam rename to test/base_mods/MM-MNf2.sam index 843f93a1b..a88924122 100644 --- a/test/base_mods/MM-MZf2.sam +++ b/test/base_mods/MM-MNf2.sam @@ -1,5 +1,5 @@ @SQ SN:I LN:999 -r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; 
Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 -r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:f:36 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:f:36 diff --git a/test/base_mods/MM-MZp.sam b/test/base_mods/MM-MNp.sam similarity index 84% rename from test/base_mods/MM-MZp.sam rename to test/base_mods/MM-MNp.sam index 836a09725..7bdca0f31 100644 --- a/test/base_mods/MM-MZp.sam +++ b/test/base_mods/MM-MNp.sam @@ -1,5 +1,5 @@ @SQ SN:I LN:999 -r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 -r3 0 I 11 0 10S20M6S * 0 
0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:i:36 diff --git a/test/base_mods/MM-multi.sam b/test/base_mods/MM-multi.sam index 1c7288f50..ac2831bc1 100644 --- a/test/base_mods/MM-multi.sam +++ b/test/base_mods/MM-multi.sam @@ -3,5 +3,5 @@ @CO r2 has them combined together, for example as produced by @CO a joint basecaller which assigns probabilities to all @CO trained events simultaneously. -r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index d246223c8..237f7906c 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -45,6 +45,6 @@ P MM-pileup2.out $pileup_mod < MM-pileup2.sam # Validation testing. We just care about exit status here, but the # test data is a copy of MM-pileup.sam so that suffices too. -P MM-pileup.out $pileup_mod < MM-MZp.sam -F MM-pileup.out $pileup_mod < MM-MZf1.sam -F MM-pileup.out $pileup_mod < MM-MZf2.sam +P MM-pileup.out $pileup_mod < MM-MNp.sam +F MM-pileup.out $pileup_mod < MM-MNf1.sam +F MM-pileup.out $pileup_mod < MM-MNf2.sam From f2d17a7d2c96494337839d257b0f08f28ddbc6fa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 May 2023 15:58:46 +0100 Subject: [PATCH 24/70] Protect against overly large containers. 
It's possible to construct CRAM containers that are extremely large such that building a block of BAM records representing the container overflows due to the size of the combined aux fields. We could change how we construct blocks of data, and work at a more individual read level, but realistically it's just not good form to be handling arbitrarily large containers as they may cause excessive memory issues which brings its own denial attacks. POTENTIAL SECURITY ISSUE: Note the previous could overflow cr->aux, which then went negative and caused negative offsets to be passed to memcpy. This would lead to a crash. I cannot see a way to get this to not crash and hence leak data, but it could form a denial of service on a remote server using htslib. . --- cram/cram_decode.c | 7 +++++++ cram/cram_structs.h | 18 +++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 39869cbdd..47b7ed076 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2059,6 +2059,13 @@ static int cram_decode_aux(cram_fd *fd, *has_NM = 1; } } + + // We could go to 2^32 fine, but we shouldn't be hitting this anyway, + // and it's protecting against memory hogs too. 
+ if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) { + hts_log_error("CRAM->BAM aux block size overflow"); + goto block_err; + } } return r; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 0a66d51b9..8b21d29c0 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -500,8 +500,8 @@ typedef struct cram_record { // Auxiliary data int32_t ntags; // TC - int32_t aux; // idx to s->aux_blk - int32_t aux_size; // total size of packed ntags in aux_blk + uint32_t aux; // idx to s->aux_blk + uint32_t aux_size; // total size of packed ntags in aux_blk #ifndef TN_external int32_t TN_idx; // TN; idx to s->TN; #else @@ -509,15 +509,15 @@ typedef struct cram_record { #endif int TL; - int32_t seq; // idx to s->seqs_blk - int32_t qual; // idx to s->qual_blk - int32_t cigar; // idx to s->cigar + uint32_t seq; // idx to s->seqs_blk + uint32_t qual; // idx to s->qual_blk + uint32_t cigar; // idx to s->cigar int32_t ncigar; int64_t aend; // alignment end int32_t mqual; // MQ - int32_t feature; // idx to s->feature - int32_t nfeature; // number of features + uint32_t feature; // idx to s->feature + uint32_t nfeature; // number of features int32_t mate_flags; // MF } cram_record; @@ -623,8 +623,8 @@ struct cram_slice { uint32_t ncigar; cram_feature *features; - int nfeatures; - int afeatures; // allocated size of features + uint32_t nfeatures; + uint32_t afeatures; // allocated size of features #ifndef TN_external // TN field (Tag Name) From f3ad960fa36e263684fd0822a14cb72e8f1b2d5c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 May 2023 16:02:52 +0100 Subject: [PATCH 25/70] Don't create overly large CRAM blocks. Currently CRAM containers can in some circumstance become huge. To prevent this we currently have a limit of the number of sequences (default 10,000) and also by number of bases (default 500 * number of seqs) so long-read technologies don't put too much in a container. 
However if we have 10k of reads with jointly under 5Mb of sequence that also have over 2GB worth of aux data, then we can trigger the overflow fixed in the previous commit. How do we get >430 bytes worth of aux for every base and >214Kb of aux for every read, in real world data rather than in deliberate stress testing? One possibility is with SEQ "*" (eg secondary alignments from minimap2) on very long-read data with heavy aux tag usage, as this doesn't increase base count at all. The same issue occurs to a lesser extent with supplementaries and hard-clipping. We now create new containers when seq+aux goes beyond the specified limit instead of just seq. In normal circumstances this will have a limited effect. Thanks to Martin Pollard for triggering and reporting this corner case. --- cram/cram_encode.c | 5 +++-- cram/cram_structs.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 5b56aedd5..9797fa7a8 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3852,7 +3852,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (!c->slice || c->curr_rec == c->max_rec || (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) || - (c->s_num_bases >= fd->bases_per_slice)) { + (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) { int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; int curr_ref = c->slice ?
c->curr_ref : bam_ref(b); @@ -3885,7 +3885,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (CRAM_MAJOR_VERS(fd->version) == 1 || c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice || - c->s_num_bases >= fd->bases_per_slice) { + c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) { if (NULL == (c = cram_next_container(fd, b))) { if (fd->ctr) { // prevent cram_close attempting to flush @@ -3997,6 +3997,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->curr_rec++; c->curr_c_rec++; c->s_num_bases += bam_seq_len(b); + c->s_aux_bytes += bam_get_l_aux(b); c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1; fd->record_counter++; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 8b21d29c0..15b7f145b 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -473,6 +473,7 @@ struct cram_container { uint32_t crc32; // CRC32 uint64_t s_num_bases; // number of bases in this slice + uint64_t s_aux_bytes; // number of bytes of aux in BAM uint32_t n_mapped; // Number of mapped reads int ref_free; // whether 'ref' is owned by us and must be freed. From c04f61e9b5747763dcf31c4e26eb3739c16e58a5 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 15 May 2023 14:21:28 +0100 Subject: [PATCH 26/70] Add a missing break statement in cram_codec_to_id. (#1614) Note: This code is currently not used except in the experimental CRAMv4.0. --- cram/cram_codecs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 21240c141..cc5e52b2c 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3910,7 +3910,8 @@ int cram_codec_to_id(cram_codec *c, int *id2) { switch (c->codec) { case E_CONST_INT: case E_CONST_BYTE: - bnum1 = -2; // no blocks used + bnum1 = -2; // no blocks used + break; case E_HUFFMAN: bnum1 = c->u.huffman.ncodes == 1 ? 
-2 : -1; From e13611a942095dfea0944fe28836934c17c2ef6c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 11 Apr 2023 12:04:21 +0100 Subject: [PATCH 27/70] Fix fd_seek on pipes on modern MinGW releases. MinGW 12.x started returning non-zero values from lseek when the fd is a pipe. This is unhelpful and it breaks bgzf_check_EOF as seeking to the end is actually seeking to the end of the pipe memory buffer, causing invalid EOFs. (This breaks bcftools CI tests.) Fixes samtools/bcftools#1901 Co-authored-by: John Marshall --- hfile.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hfile.c b/hfile.c index ebb5b2244..f8d42e49a 100644 --- a/hfile.c +++ b/hfile.c @@ -564,6 +564,16 @@ static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes) static off_t fd_seek(hFILE *fpv, off_t offset, int whence) { hFILE_fd *fp = (hFILE_fd *) fpv; +#ifdef _WIN32 + // On windows lseek can return non-zero values even on a pipe. Instead + // it's likely to seek somewhere within the pipe memory buffer. + // This breaks bgzf_check_EOF among other things. + if (GetFileType((HANDLE)_get_osfhandle(fp->fd)) == FILE_TYPE_PIPE) { + errno = ESPIPE; + return -1; + } +#endif + return lseek(fp->fd, offset, whence); } From 878cff4a443fe5b3b49aae61953de1bbddeddc8a Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 23 May 2023 15:36:17 +0100 Subject: [PATCH 28/70] Amalgamate multiple CIGAR ops into single entry. (#1607) Amalgamate multiple CIGAR ops into single entry. Multiple matching (or sequence (mis)matching)) ops (e.g. 10M40M) give a different VCF using BAQ than a single operation of the same length (e.g. 50M). This change compresses the multiple operations into one. 
--- realn.c | 22 +++++++++++++++++++++- sam.c | 2 +- test/realn03.fa | 2 ++ test/realn03.fa.fai | 1 + test/realn03.sam | 4 ++++ test/realn03_exp.sam | 4 ++++ test/test.pl | 3 +++ 7 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 test/realn03.fa create mode 100644 test/realn03.fa.fai create mode 100644 test/realn03.sam create mode 100644 test/realn03_exp.sam diff --git a/realn.c b/realn.c index 5354dee78..d7e8255f8 100644 --- a/realn.c +++ b/realn.c @@ -1,6 +1,6 @@ /* realn.c -- BAQ calculation and realignment. - Copyright (C) 2009-2011, 2014-2016, 2018, 2021 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2016, 2018, 2021, 2023 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li @@ -268,8 +268,28 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) { // tseq,tref are no longer needed, so we can steal them to avoid mallocs uint8_t *left = tseq; uint8_t *rght = tref; + int len = 0; + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; + + // concatenate alignment matches (including sequence (mis)matches) + // otherwise 50M50M gives a different result to 100M + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if ((k + 1) < c->n_cigar) { + int next_op = bam_cigar_op(cigar[k + 1]); + + if (next_op == BAM_CMATCH || next_op == BAM_CEQUAL || next_op == BAM_CDIFF) { + len += l; + continue; + } + } + + // last of M/X/= ops + l += len; + len = 0; + } + if (l == 0) continue; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { // Sanity check running off the end of the sequence diff --git a/sam.c b/sam.c index 05910cf8a..e1cc2988a 100644 --- a/sam.c +++ b/sam.c @@ -5227,7 +5227,7 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) uint32_t *cigar = bam_get_cigar(b); int k; // determine the current CIGAR operation - //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, 
s->end, s->k, s->x, s->y); + //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); if (s->k == -1) { // never processed p->qpos = 0; if (c->n_cigar == 1) { // just one operation, save a loop diff --git a/test/realn03.fa b/test/realn03.fa new file mode 100644 index 000000000..9ac86c11e --- /dev/null +++ b/test/realn03.fa @@ -0,0 +1,2 @@ +>MX +CGTCTACTACG diff --git a/test/realn03.fa.fai b/test/realn03.fa.fai new file mode 100644 index 000000000..dfaa59355 --- /dev/null +++ b/test/realn03.fa.fai @@ -0,0 +1 @@ +MX 11 4 11 12 diff --git a/test/realn03.sam b/test/realn03.sam new file mode 100644 index 000000000..50266242d --- /dev/null +++ b/test/realn03.sam @@ -0,0 +1,4 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MX LN:11 +M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII +X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII diff --git a/test/realn03_exp.sam b/test/realn03_exp.sam new file mode 100644 index 000000000..3b608d0c6 --- /dev/null +++ b/test/realn03_exp.sam @@ -0,0 +1,4 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MX LN:11 +M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D +X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D diff --git a/test/test.pl b/test/test.pl index 1595557a2..368bd4f18 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1099,6 +1099,9 @@ sub test_realn { # Revert quality values (using data in ZQ tags) test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn02.fa -i $$opts{path}/realn02_exp-a.sam -o -", out => "realn02_exp.sam"); + + # Make sure multiple matches are treated the same way as a single match of the same length. 
+ test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn03.fa -e -i $$opts{path}/realn03.sam -o -", out => "realn03_exp.sam"); } sub test_bcf_set_variant_type From 20072b3ed83e961da8a66120368ac591b569bc0e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 26 May 2023 15:26:00 +0100 Subject: [PATCH 29/70] Update to latest htscodecs - Speed up fqz_qual_stats function - Speed up fqcomp encoding through memory prefetching - Optimise fqzcomp decoder - Remove prefetching from c_simple_mode.h - Autoconf improvements - Updates for 1.5.0 release - Only use the ARM NEON 32-way unrolled rANS on AArch64. - Add include of config.h to test programs. - Add FreeBSD to the CI tests - On AMD, don't always use AVX512 in preference to AVX2. - Add a -b option to change block size for rans4x16 test. --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index d4aed5859..109f06949 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit d4aed585929e2dab9dd8e6a2b74484dfc347c0f2 +Subproject commit 109f069490fca15d85e2d261822c15bc3080db8a From 334c76adaaa2c6c916b9326f3da6ab4981a07c7d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 26 May 2023 15:53:57 +0100 Subject: [PATCH 30/70] Don't set _POSIX_C_SOURCE for htscodecs tests This caused problems on freebsd, and is no longer needed now the test sources include config.h --- Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 9b7f7f2f4..0cfec6e6b 100644 --- a/Makefile +++ b/Makefile @@ -732,17 +732,17 @@ htscodecs/tests/tokenise_name3: htscodecs/tests/tokenise_name3_test.o $(HTSCODEC htscodecs/tests/varint: htscodecs/tests/varint_test.o $(HTSCODECS_OBJS) $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread -htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/arith_dynamic_test.o: 
htscodecs/tests/arith_dynamic_test.c $(htscodecs_arith_dynamic_h) -htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) -htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/rANS_static4x16pr_test.o: htscodecs/tests/rANS_static4x16pr_test.c $(htscodecs_rANS_static4x16_h) -htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c $(htscodecs_rANS_static_h) -htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c $(htscodecs_tokenise_name3_h) -htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c $(htscodecs_varint_h) test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) From abb4c730c6c283018f42500842c734d67deaf619 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 18 Apr 2023 07:15:00 +0100 Subject: [PATCH 31/70] Set _XOPEN_SOURCE in configure if it's not already set HTSlib uses POSIX and XSI functions, so to be strictly conforming POSIX says it should define _XOPEN_SOURCE before any standard headers are included. Some system headers may use this to enable or disable declarations for these functions. Define it unconditionally in configure, unless it has already been set by the user via CPPFLAGS etc. 
This mirrors what happens if you build HTSlib by typing "make" without running configure first. As it is now set by default, other locations where _XOPEN_SOURCE may be altered are removed. Co-authored-by: Rob Davies --- configure.ac | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index ff2367c1b..b18760da5 100644 --- a/configure.ac +++ b/configure.ac @@ -70,6 +70,19 @@ dnl Flags to treat warnings as errors. These need to be applied to CFLAGS dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) HTS_PROG_CC_WERROR(hts_late_cflags) +# HTSlib uses X/Open-only facilities (M_SQRT2 etc, drand48() etc), and +# various POSIX functions that are provided by various _POSIX_C_SOURCE values +# or by _XOPEN_SOURCE >= 500. It also uses usleep(), which is removed when +# _XOPEN_SOURCE >= 700. Additionally, some definitions may require +# _XOPEN_SOURCE >= 600 on some platforms (snprintf on MinGW, +# PTHREAD_MUTEX_RECURSIVE on some Linux distributions). Hence we set it to 600. + +# Define _XOPEN_SOURCE unless the user has already done so via $CPPFLAGS etc. +AC_CHECK_DECL([_XOPEN_SOURCE], [], + [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])], + []) + + dnl Check for various compiler flags to enable SIMD features dnl Options for rANS32x16 sse4.1 version - ssse3 hts_cflags_sse4="" @@ -263,7 +276,9 @@ case $basic_host in # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" + + # Now set by default, so no need to do it here. + # CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" ;; *) host_result="plain .so" @@ -583,7 +598,8 @@ AC_SEARCH_LIBS(regcomp, regex, [libregex=needed], []) dnl Look for PTHREAD_MUTEX_RECURSIVE. dnl This is normally in pthread.h except on some broken glibc implementations. 
-AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include ]]) +dnl Now set by default +dnl AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include ]]) if test "$s3" = enabled ; then AC_DEFINE([ENABLE_S3], 1, [Define if HTSlib should enable S3 support.]) From ccf7c990c8297ca7089619b629990c79bc32b8ef Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 31 May 2023 14:35:03 +0100 Subject: [PATCH 32/70] Fix trailing space in config.h made by configure To stop check_spaces.pl from tripping up on it. --- configure.ac | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index b18760da5..3230f3935 100644 --- a/configure.ac +++ b/configure.ac @@ -136,8 +136,7 @@ dnl Propagate HTSlib's unaligned access preference to htscodecs /* Prevent unaligned access in htscodecs SSE4 rANS codec */ #if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0 #undef UBSAN -#endif - ]) +#endif]) AC_DEFINE([UBSAN],1,[]) ]) AC_SUBST([hts_cflags_sse4]) From e8f773b0e3f82157107ac6eb8d1fb17b2aae5d17 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 2 Jun 2023 11:58:25 +0100 Subject: [PATCH 33/70] Ignore generated config_vars.h file in copyright check --- test/maintainer/check_copyright.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl index 43fb5be4e..608182085 100755 --- a/test/maintainer/check_copyright.pl +++ b/test/maintainer/check_copyright.pl @@ -49,6 +49,7 @@ sub check { # Exclusions: my %exclude = map { ("$root/$_", 1) } ( 'config.h', # Auto-generated +'config_vars.h', # Auto-generated 'version.h', # Auto-generated 'cram/rANS_byte.h', # "Public domain" 'os/lzma_stub.h', # "Public domain" From 9936a58154e145f2761db5c6816d700cfb4f3d31 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 31 May 2023 14:48:59 +0100 Subject: [PATCH 
34/70] Switch to `/usr/bin/env perl` for all perl scripts Some of them were using it already, and it's slightly more portable on platforms (mainly BSDs) that don't put perl in /usr/bin/. Happily most of them do put env in the same place these days... --- test/compare_sam.pl | 3 ++- test/maintainer/check_copyright.pl | 2 +- test/maintainer/check_spaces.pl | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/compare_sam.pl b/test/compare_sam.pl index 23b67d3a4..499cb2390 100755 --- a/test/compare_sam.pl +++ b/test/compare_sam.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # Copyright (C) 2013-2018 Genome Research Ltd. # @@ -26,6 +26,7 @@ # Optionally can skip header or ignore specific types of diff. use strict; +use warnings; use Getopt::Long; my %opts; diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl index 608182085..22556df01 100755 --- a/test/maintainer/check_copyright.pl +++ b/test/maintainer/check_copyright.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # check_copyright.pl : Basic source file checks for copyright boilerplate # # Author : Rob Davies diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl index 81b4ededc..e48518f25 100755 --- a/test/maintainer/check_spaces.pl +++ b/test/maintainer/check_spaces.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # check_spaces.pl : Check source files for tabs and trailing spaces # # Author : Rob Davies From 90af5ee56f84779d27a03fa2aaa59a6bd4ee90cf Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 1 Jun 2023 10:31:07 +0100 Subject: [PATCH 35/70] Stop the overwriting of the end value. If the end column appeared before the begin column then the end value was being overwritten in begin value initialisation. 
--- tbx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tbx.c b/tbx.c index d897a21f1..154e2a81b 100644 --- a/tbx.c +++ b/tbx.c @@ -103,10 +103,18 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) intv->ss = line + b; intv->se = line + i; } else if (id == conf->bc) { // here ->beg is 0-based. - intv->beg = intv->end = strtoll(line + b, &s, 0); + intv->beg = strtoll(line + b, &s, 0); + + if (conf->bc <= conf->ec) // don't overwrite an already set end point + intv->end = intv->beg; + if ( s==line+b ) return -1; // expected int - if (!(conf->preset&TBX_UCSC)) --intv->beg; - else ++intv->end; + + if (!(conf->preset&TBX_UCSC)) + --intv->beg; + else if (conf->bc <= conf->ec) + ++intv->end; + if (intv->beg < 0) { hts_log_warning("Coordinate <= 0 detected. " "Did you forget to use the -0 option?"); From f613a93feb5f323f08ccfe53549dad8784580e17 Mon Sep 17 00:00:00 2001 From: vasudeva8 <113358286+vasudeva8@users.noreply.github.com> Date: Fri, 9 Jun 2023 11:29:26 +0100 Subject: [PATCH 36/70] Ensure NUL termination of Z/H data in sam_format_aux1; fix base mod state reuse Ensure NUL termination of Z/H data in sam_format_aux1 Avoid failure to get base modifications when reusing the hts_base_mod_state struct. Minor comment updates --- htslib/sam.h | 6 ++++-- sam.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index a0b8438aa..fe5b1ebdc 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2022 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2023 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. 
Author: Heng Li @@ -1414,7 +1414,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, /** @param fp Pointer to the destination file * @param h Pointer to the header structure previously read * @param b Pointer to the record to be written - * @return >= 0 on successfully writing the record, -1 on error + * @return >= 0 on successfully writing the record, -ve on error */ HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; @@ -1519,6 +1519,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, r |= kputc_(type, ks) < 0; r |= kputc_(':', ks) < 0; while (s < end && *s) r |= kputc_(*s++, ks) < 0; + r |= kputsn("", 0, ks) < 0; //ensures NUL termination if (s >= end) goto bad_aux; ++s; @@ -2286,6 +2287,7 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, * @param state The base modification state pointer. * @param mods A supplied array for returning base modifications * @param n_mods The size of the mods array + * @param pos Pointer holding position of modification in sequence * @return The number of modifications found on success, * 0 if no more modifications are present, * -1 on failure. diff --git a/sam.c b/sam.c index e1cc2988a..d77ac7f9d 100644 --- a/sam.c +++ b/sam.c @@ -6215,6 +6215,10 @@ static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; * */ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + + //reset position, else upcoming calls may fail on seq pos - length comparison + state->seq_pos = 0; + // Read MM and ML tags uint8_t *mm = bam_aux_get(b, "MM"); if (!mm) mm = bam_aux_get(b, "Mm"); @@ -6244,8 +6248,6 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { uint8_t *ml_end = ml ? 
ml+6 + le_to_u32(ml+2) : NULL; if (ml) ml += 6; - state->seq_pos = 0; - // Aggregate freqs of ACGTN if reversed, to get final-delta (later) int freq[16]; if (b->core.flag & BAM_FREVERSE) From 5e0ccef5f3c2864dbd7a966b55ca06282fa3b2c8 Mon Sep 17 00:00:00 2001 From: vasudeva8 <113358286+vasudeva8@users.noreply.github.com> Date: Fri, 9 Jun 2023 11:43:05 +0100 Subject: [PATCH 37/70] Changes to avoid segfault with uncompressed bam (PR #1632) Avoids segfault when writing bam/bcf with mode "wbu" by changing "wbu" to "wb0". This ensures the output file will be properly wrapped in BGZF blocks, even though it's not been compressed. Fixes #1617 --- hts.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/hts.c b/hts.c index b3f6b9a08..f2bc5fcb6 100644 --- a/hts.c +++ b/hts.c @@ -835,7 +835,7 @@ char *hts_format_description(const htsFormat *format) htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) { - char smode[101], *cp, *cp2, *mode_c; + char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL; htsFile *fp = NULL; hFILE *hfile = NULL; char fmt_code = '\0'; @@ -853,8 +853,13 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) fmt_code = 'b'; else if (*cp == 'c') fmt_code = 'c'; - else + else { *cp2++ = *cp; + // Cache the uncompress flag 'u' pos if present + if (!uncomp && (*cp == 'u')) { + uncomp = cp2 - 1; + } + } } mode_c = cp2; *cp2++ = fmt_code; @@ -866,6 +871,11 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) *mode_c = format_to_mode[fmt->format]; } + // Uncompressed bam/bcf is not supported, change 'u' to '0' on write + if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) { + *uncomp = '0'; + } + // If we really asked for a compressed text format then mode_c above will // point to nul. We set to 'z' to enable bgzf.
if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) { From 7f69840c2fbf73dc7601c17a3ade4db676858cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Sun, 11 Jun 2023 18:50:59 +0200 Subject: [PATCH 38/70] cram/cram_external.c: fix external htscodecs include This patch fixes a duplicated file extension, which otherwise causes the following build failure when building htslib against an externally built htscodecs: gcc -g -O2 -ffile-prefix-map=/<>=. -fstack-protector-strong -Wformat -Werror=format-security -ffat-lto-objects -ffat-lto-objects -I. -I. -DSAMTOOLS=1 -Wdate-time -D_FORTIFY_SOURCE=2 -c -o cram/cram_external.o cram/cram_external.c cram/cram_external.c:46:10: fatal error: htscodecs/rANS_static4x16.h.h: No such file or directory 46 | #include <htscodecs/rANS_static4x16.h.h> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- cram/cram_external.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_external.c b/cram/cram_external.c index 26ef3d7d3..7455185ad 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -43,7 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(HAVE_EXTERNAL_LIBHTSCODECS) -#include +#include #else #include "../htscodecs/htscodecs/rANS_static4x16.h" #endif From 79b3a42db9059537318d85876623860d212ab159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Sun, 11 Jun 2023 19:49:42 +0200 Subject: [PATCH 39/70] htslib-s3-plugin.7: fix whatis entry When preparing the htslib 1.17 upload in Debian, lintian caught a bad-whatis-entry issue. Looking closer, whatis(1) and apropos(1) commands fail to locate htslib-s3-plugin(7) manual and the parsing of the page fails with: $ lexgrog htslib-s3-plugin.7 htslib-s3-plugin.7: parse failed It seems to stem from the two words "s3 plugin" in the name of the manual, instead of having a single word as needed.
This change names the manual page "htslib-s3-plugin" instead, making the mandb, the lexgrog parser, and lintian happy: $ lexgrog htslib-s3-plugin.7 htslib-s3-plugin.7: "htslib-s3-plugin - htslib AWS S3 plugin" The name could also simply be "s3-plugin", but having htslib in the name felt more apropos. :) --- htslib-s3-plugin.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index b37eacac3..eee6fb27c 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -1,6 +1,6 @@ .TH htslib-s3-plugin 7 "21 February 2023" "htslib-1.17" "Bioinformatics tools" .SH NAME -s3 plugin \- htslib AWS S3 plugin +htslib-s3-plugin \- htslib AWS S3 plugin .\" .\" Copyright (C) 2021-2022 Genome Research Ltd. .\" From c11aebe327d4a850fb9cb0af82655539ec768a68 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 21 Jun 2023 21:26:33 +0200 Subject: [PATCH 40/70] Allow repeated calls of bcf_sr_set_regions (PR #1624) and make repeated bcf_sr_seek()+next_line() calls consistent. Resolves #1623 and https://github.com/samtools/bcftools/issues/1918 --- htslib/synced_bcf_reader.h | 8 +++++--- synced_bcf_reader.c | 14 ++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 78e9a0b4a..bbe5ea2ba 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019-2021 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2023 Genome Research Ltd. Author: Petr Danecek @@ -306,8 +306,10 @@ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); * Targets (but not regions) can be prefixed with "^" to request logical complement, * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. 
* - * API note: bcf_sr_set_regions/bcf_sr_set_targets MUST be called before the - * first call to bcf_sr_add_reader(). + * API notes: + * - bcf_sr_set_targets MUST be called before the first call to bcf_sr_add_reader() + * - calling bcf_sr_set_regions AFTER readers have been initialized will + * reposition the readers and discard all previous regions. */ HTSLIB_EXPORT int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 23e0ecaef..702f260ee 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1,6 +1,6 @@ /* synced_bcf_reader.c -- stream through multiple VCF files. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -76,6 +76,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); static void _regions_sort_and_merge(bcf_sr_regions_t *reg); static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler); +static void bcf_sr_seek_start(bcf_srs_t *readers); char *bcf_sr_strerror(int errnum) { @@ -187,8 +188,10 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) { if ( readers->nreaders || readers->regions ) { - hts_log_error("Must call bcf_sr_set_regions() before bcf_sr_add_reader()"); - return -1; + if ( readers->regions ) bcf_sr_regions_destroy(readers->regions); + readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); + bcf_sr_seek_start(readers); + return 0; } readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); @@ -676,7 +679,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); exit(1); } - if ( beg <= files->regions->prev_end || end < 
files->regions->start || beg > files->regions->end ) continue; } @@ -843,7 +845,11 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) for (i=0; inseqs; i++) reg->regs[i].creg = -1; reg->iseq = 0; + reg->start = -1; + reg->end = -1; reg->prev_seq = -1; + reg->prev_start = -1; + reg->prev_end = -1; } From 7de2df29cb1c9f6b807b8e4b8bc09a1313303db3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 4 May 2023 12:22:12 +0100 Subject: [PATCH 41/70] Change bounds checking in probaln_glocal In 3 places when filling out forwards and backwards arrays, the "u" array index has bounds checks of "u < 3 || u >= i_dim-3". Understanding this code is tricky, however! My hypothesis is that the upper bounds check here is because we use u, u+1 and u+2 in array indices, and we iterate with "k <= l_ref" so we can access one beyond the end of the array. However the arrays are allocated to be dimension (l_query+1)*i_dim, so (assuming correctness of l_ref vs l_query in bw/i_dim calculation) we have compensated for this over-step already. This has been validated with address sanitiser. The effect of the i_dim-3 limit is that having band width equal to query length causes the final state element to be incorrectly labelled as an insertion. This hypothesis may however be incorrect, as the lower bound "u < 3" also seems redundant, yet changing this to "u < 0" does give different quality scores in about 1 in 4000 sequences (tested on 10 million illumina short read BAQ calculations). Hence for now this is left unchanged. In normal behaviour using a band, tested using "samtools calmd -r -E" to generate BQ tags, this commit does not change output.
Fixes #1605 --- probaln.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/probaln.c b/probaln.c index 192f4b751..8a60372b3 100644 --- a/probaln.c +++ b/probaln.c @@ -245,10 +245,24 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, { // f[l_query+1] double sum; double M = 1./s[l_query]; + // Note this goes up to <= l_ref, meaning we are accessing 1 beyond + // the end of the sequence. However we allocated above with + // (l_query+1)*i_dim (plus appropriate l_ref vs l_query in band width) + // so this should be sufficient. + // + // This fixes Issue #1605 where band width equal to sequence length + // gives incorrect alignments, due to the last value not being filled + // out correctly. + // + // I am unsure why the limit was previously set at u >= i_dim - 3, but + // can only conjecture it was due to forgetting the l_query+1 alloc. + // I am also unsure why "u < 3" is used instead of "u < 0", however + // changing that does change behaviour for common usage (unlike + // "idim - 3" to "idim"). for (k = 1, sum = 0.; k <= l_ref; ++k) { int u; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += M*f[l_query*i_dim + u+0] * sM + M*f[l_query*i_dim + u+1] * sI; } s[l_query+1] = sum; // the last scaling factor @@ -272,7 +286,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int u; double *bi = &b[l_query*i_dim]; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; } // b[l_query-1..1] @@ -350,7 +364,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int u; double e = (ref[k - 1] > 3 || query[0] > 3)? 1. : ref[k - 1] == query[0]? 1. 
- qual[0] : qual[0] * EM; set_u(u, bw, 1, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += e * b[1*i_dim + u+0] * bM + EI * b[1*i_dim + u+1] * bI; } set_u(k, bw, 0, 0); From b52f3fad9f0340ce29f098d048f77e2d8f991e30 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 13 Jun 2023 17:37:10 +0100 Subject: [PATCH 42/70] Adjust comments in probaln_glocal() Adds a comment explaining that the f[] and b[] arrays count positions from 1, allowing 0 to be used to more easily handle the edges of the alignment matrix. Changes the comment explaining the line: if (u < 3 || u >= i_dim) continue; used in some of the loops over f[] and b[]. While it does prevent overstepping the array boundaries, its main function is to select the parts over which the scores have previously been calculated. A change in 5d7a7823 to fix excess memory usage got the high end slightly wrong (using i_dim - 3). When the query sequence length was less than the band width, this could lead to the last column being incorrectly missed out from parts of the calculation. --- probaln.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/probaln.c b/probaln.c index 8a60372b3..c841c7522 100644 --- a/probaln.c +++ b/probaln.c @@ -140,6 +140,13 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, bM = (1 - c->d) / l_ref; // (bM+bI)*l_ref==1 bI = c->d / l_ref; + // f[] and b[] are 2-d arrays of three scores, with rows along the + // query and columns across the band. The first query base and + // first band position appear at index 1 allowing edge conditions + // to be stored in index 0. Hence the loops below appear to use + // 1-based indexing instead of 0-based as you'd normally expect in C, + // and the sequences are accessed using query[i - 1] and ref[k - 1]. 
+ /*** forward ***/ // f[0] set_u(k, bw, 0, 0); @@ -245,20 +252,19 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, { // f[l_query+1] double sum; double M = 1./s[l_query]; - // Note this goes up to <= l_ref, meaning we are accessing 1 beyond - // the end of the sequence. However we allocated above with - // (l_query+1)*i_dim (plus appropriate l_ref vs l_query in band width) - // so this should be sufficient. - // - // This fixes Issue #1605 where band width equal to sequence length - // gives incorrect alignments, due to the last value not being filled - // out correctly. - // - // I am unsure why the limit was previously set at u >= i_dim - 3, but - // can only conjecture it was due to forgetting the l_query+1 alloc. - // I am also unsure why "u < 3" is used instead of "u < 0", however - // changing that does change behaviour for common usage (unlike - // "idim - 3" to "idim"). + // Note that this goes from 1 to l_ref inclusive, but as the + // alignment is banded not all of the values will have been + // calculated (the rest are taken as 0), so the summation + // actually goes over the values set in the last iteration of + // the previous loop (when i = l_query). For some reason lost to + // time this is done by looking for valid values of 'u' instead of + // working out 'beg' and 'end'. + + // From HTSlib 1.8 to 1.17, the endpoint was incorrectly set + // to i_dim - 3. When l_query <= bandwidth, this caused the last + // column to be missed, and if l_ref == l_query then a match at the end + // could incorrectly be reported as an insertion. See #1605. 
+ for (k = 1, sum = 0.; k <= l_ref; ++k) { int u; set_u(u, bw, l_query, k); From c3a6fcdafecf320f65fd80acb1616f2fe81f1273 Mon Sep 17 00:00:00 2001 From: pd3 Date: Mon, 5 Jun 2023 15:19:04 +0200 Subject: [PATCH 43/70] Add support for non-standard chromosome names containing [:-] characters Note hts_parse_region() cannot be used because it requires the header and without the header the caller does not learn the contig name. Resolves #1620 --- htslib/synced_bcf_reader.h | 2 ++ synced_bcf_reader.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index bbe5ea2ba..9a6b48438 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -338,6 +338,8 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); * supply 'from' in place of 'to'. When 'to' is negative, first * abs(to) will be attempted and if that fails, 'from' will be used * instead. + * If chromosome name contains the characters ':' or '-', it should + * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" * * The bcf_sr_regions_t struct returned by a successful call should be freed * via bcf_sr_regions_destroy() when it is no longer needed. diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 702f260ee..d24e5f444 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1032,6 +1032,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) } // File name or a list of genomic locations. If file name, NULL is returned. +// Recognises regions in the form chr, chr:pos, chr:beg-end, chr:beg-, {weird-chr-name}:pos. +// Cannot use hts_parse_region() as that requires the header and if header is not present, +// wouldn't learn the chromosome name. 
static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); @@ -1043,9 +1046,23 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) hts_pos_t from, to; while ( 1 ) { - while ( *ep && *ep!=',' && *ep!=':' ) ep++; tmp.l = 0; - kputsn(sp,ep-sp,&tmp); + if ( *ep=='{' ) + { + while ( *ep && *ep!='}' ) ep++; + if ( !*ep ) + { + hts_log_error("Could not parse the region, mismatching braces in: \"%s\"", str); + goto exit_nicely; + } + ep++; + kputsn(sp+1,ep-sp-2,&tmp); + } + else + { + while ( *ep && *ep!=',' && *ep!=':' ) ep++; + kputsn(sp,ep-sp,&tmp); + } if ( *ep==':' ) { sp = ep+1; @@ -1053,7 +1070,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( sp==ep ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( !*ep || *ep==',' ) { @@ -1064,7 +1081,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep!='-' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } ep++; sp = ep; @@ -1072,7 +1089,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep && *ep!=',' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( sp==ep ) to = MAX_CSI_COOR-1; _regions_add(reg, tmp.s, from, to); @@ -1088,6 +1105,11 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } free(tmp.s); return reg; + +exit_nicely: + bcf_sr_regions_destroy(reg); + free(tmp.s); + return NULL; } // ichr,ifrom,ito are 0-based; From 3c36c9b9562966ddc00d487447a42c622323ec8b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Jun 2023 10:18:45 +0100 Subject: [PATCH 44/70] An attempt to parse malformatted region such as {1:1}-2 should fail --- synced_bcf_reader.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff 
--git a/synced_bcf_reader.c b/synced_bcf_reader.c index d24e5f444..a43ab15ae 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1096,12 +1096,17 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( !*ep ) break; sp = ep; } - else + else if ( !*ep || *ep==',' ) { if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); if ( !*ep ) break; sp = ++ep; } + else + { + hts_log_error("Could not parse the region(s): %s", str); + goto exit_nicely; + } } free(tmp.s); return reg; From 28a8082c096b8ecdc79ddbc33a032c63854c7186 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 19 Jun 2023 16:25:54 +0100 Subject: [PATCH 45/70] Expand test-bcf-sr.c capabilities Add -O,--output-fmt option so it can write vcf or bcf as well as its original summary format. Add -o,--output option so it's possible to write to a file without shell redirection. Add --args option so input files can be listed directly on the command line instead of via a fofn, to make basic tests easier. Add -r,--regions and -t,--targets options, which behave the same as the equivalents in `bcftools view`. Add the --no-index option to the usage text. Simplify writing the original format. Everything can be sent directly to the output file without going via a kstring. The output writing parts are also moved into separate functions to keep main() from getting too big. Add a few extra error checks. Call exit(EXIT_FAILURE) on failure, not exit(-1). Make the -h option return success. --- test/test-bcf-sr.c | 201 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 159 insertions(+), 42 deletions(-) diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index ee64afe41..e91af7f28 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -28,12 +28,17 @@ #include +#include #include #include #include #include +#include +#include #include "../htslib/synced_bcf_reader.h" +#include "../htslib/hts.h" +#include "../htslib/vcf.h" void error(const char *format, ...) 
{ @@ -41,16 +46,78 @@ void error(const char *format, ...) va_start(ap, format); vfprintf(stderr, format, ap); va_end(ap); - exit(-1); + exit(EXIT_FAILURE); } -void usage(void) +void usage(int exit_code) { fprintf(stderr, "Usage: test-bcf-sr [OPTIONS] vcf-list.txt\n"); + fprintf(stderr, " test-bcf-sr [OPTIONS] -args file1.bcf [...]\n"); fprintf(stderr, "Options:\n"); + fprintf(stderr, " --args pass filenames directly in argument list\n"); + fprintf(stderr, " --no-index allow streaming\n"); + fprintf(stderr, " -o, --output output file (stdout if not set)\n"); + fprintf(stderr, " -O, --output-fmt fmt: vcf,bcf,summary\n"); fprintf(stderr, " -p, --pair logic: snps,indels,both,snps+ref,indels+ref,both+ref,exact,some,all\n"); + fprintf(stderr, " -r, --regions comma-separated list of regions\n"); + fprintf(stderr, " -t, --targets comma-separated list of targets\n"); fprintf(stderr, "\n"); - exit(-1); + exit(exit_code); +} + +void write_summary_format(bcf_srs_t *sr, FILE *out) +{ + int n, i, j; + while ((n = bcf_sr_next_line(sr)) > 0) { + for (i=0; inreaders; i++) + { + if ( !bcf_sr_has_line(sr,i) ) continue; + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + fprintf(out, "%s:%"PRIhts_pos, + bcf_seqname_safe(bcf_sr_get_header(sr,i),rec),rec->pos+1); + break; + } + + for (i=0; inreaders; i++) + { + fprintf(out, "\t"); + + if ( !bcf_sr_has_line(sr,i) ) + { + fprintf(out, "%s","-"); + continue; + } + + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + fprintf(out, "%s", rec->n_allele > 1 ? 
rec->d.allele[1] : "."); + for (j=2; jn_allele; j++) + { + fprintf(out, ",%s", rec->d.allele[j]); + } + } + fprintf(out, "\n"); + } +} + +void write_vcf_bcf_format(bcf_srs_t *sr, bcf_hdr_t *hdr, vcfFile *vcf_out, + const char *fmt_type) +{ + int i, n; + if (bcf_hdr_write(vcf_out, hdr) != 0) + error("Couldn't write %s header\n", fmt_type); + + while ((n = bcf_sr_next_line(sr)) > 0) { + for (i=0; inreaders; i++) + { + if ( !bcf_sr_has_line(sr,i) ) continue; + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + if (vcf_write(vcf_out, hdr, rec) < 0) + error("vcf_write() failed\n"); + } + } } int main(int argc, char *argv[]) @@ -58,16 +125,31 @@ int main(int argc, char *argv[]) static struct option loptions[] = { {"help",no_argument,NULL,'h'}, + {"output-fmt",required_argument,NULL,'O'}, {"pair",required_argument,NULL,'p'}, + {"regions",required_argument,NULL,'r'}, + {"targets",required_argument,NULL,'t'}, {"no-index",no_argument,NULL,1000}, + {"args",no_argument,NULL,1001}, {NULL,0,NULL,0} }; - int c, pair = 0, use_index = 1; - while ((c = getopt_long(argc, argv, "p:h", loptions, NULL)) >= 0) + int c, pair = 0, use_index = 1, use_fofn = 1; + enum htsExactFormat out_fmt = text_format; // for original pos + alleles + const char *out_fn = NULL, *regions = NULL, *targets = NULL; + while ((c = getopt_long(argc, argv, "o:O:p:r:t:h", loptions, NULL)) >= 0) { switch (c) { + case 'o': + out_fn = optarg; + break; + case 'O': + if (!strcasecmp(optarg, "vcf")) out_fmt = vcf; + else if (!strcasecmp(optarg, "bcf")) out_fmt = bcf; + else if (!strcasecmp(optarg, "summary")) out_fmt = text_format; + else error("Unknown output format \"%s\"\n", optarg); + break; case 'p': if ( !strcmp(optarg,"snps") ) pair |= BCF_SR_PAIR_SNPS; else if ( !strcmp(optarg,"snp+ref") ) pair |= BCF_SR_PAIR_SNPS|BCF_SR_PAIR_SNP_REF; @@ -83,68 +165,103 @@ int main(int argc, char *argv[]) else if ( !strcmp(optarg,"exact") ) pair = BCF_SR_PAIR_EXACT; else 
error("The --pair logic \"%s\" not recognised.\n", optarg); break; + case 'r': + regions = optarg; + break; + case 't': + targets = optarg; + break; case 1000: use_index = 0; break; - default: usage(); + case 1001: + use_fofn = 0; + break; + case 'h': + usage(EXIT_SUCCESS); + default: usage(EXIT_FAILURE); } } if ( !pair ) pair = BCF_SR_PAIR_EXACT; - if ( optind == argc ) usage(); + if ( optind == argc ) usage(EXIT_FAILURE); - int i, j, n, nvcf; - char **vcf = hts_readlist(argv[optind], 1, &nvcf); - if ( !vcf ) error("Could not parse %s\n", argv[optind]); + int i, nvcf; + char **vcfs = NULL; + if (use_fofn) { + vcfs = hts_readlist(argv[optind], 1, &nvcf); + if ( !vcfs ) error("Could not parse %s\n", argv[optind]); + } else { + vcfs = &argv[optind]; + nvcf = argc - optind; + } bcf_srs_t *sr = bcf_sr_init(); + if (!sr) error("bcf_sr_init() failed\n"); bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC, pair); if (use_index) { bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX); } else { bcf_sr_set_opt(sr, BCF_SR_ALLOW_NO_IDX); } - for (i=0; ierrnum)); - kstring_t str = {0,0,0}; - while ( (n=bcf_sr_next_line(sr)) ) + if (regions) { - for (i=0; inreaders; i++) + if (bcf_sr_set_regions(sr, regions, 0) != 0) + error("Failed to set regions\n"); + } + + if (targets) + { + if (bcf_sr_set_targets(sr, targets, 0, 0) != 0) + error("Failed to set targets\n"); + } + + for (i=0; ierrnum)); + + if (!sr->readers || sr->nreaders < 1) + error("No readers set, even though one was added\n"); + + if (out_fmt == text_format) { + FILE *out = stdout; + if (out_fn) { - if ( !bcf_sr_has_line(sr,i) ) continue; - bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%"PRIhts_pos, bcf_seqname_safe(bcf_sr_get_header(sr,i),rec),rec->pos+1); - break; + out = fopen(out_fn, "w"); + if (!out) error("Couldn't open \"%s\" for writing: %s\n", + out_fn, strerror(errno)); } - - for (i=0; inreaders; i++) + write_summary_format(sr, out); + if (out_fn) { - printf("\t"); + if (fclose(out) != 0) + error("Error on closing %s : %s\n", + 
out_fn, strerror(errno)); + } + } else { + const char *fmt_type = out_fmt == vcf ? "VCF" : "BCF"; - if ( !bcf_sr_has_line(sr,i) ) - { - printf("%s","-"); - continue; - } + bcf_hdr_t *hdr = bcf_sr_get_header(sr, 0); + if (!hdr) error("%s output, but don't have a header\n", fmt_type); - str.l = 0; - bcf1_t *rec = bcf_sr_get_line(sr, i); - kputs(rec->n_allele > 1 ? rec->d.allele[1] : ".", &str); - for (j=2; jn_allele; j++) - { - kputc(',', &str); - kputs(rec->d.allele[j], &str); - } - printf("%s",str.s); - } - printf("\n"); + if (!out_fn) { out_fn = "-"; } + vcfFile *vcf_out = vcf_open(out_fn, out_fmt == vcf ? "w" : "wb"); + if (!vcf_out) error("Couldn't open \"%s\" for writing: %s\n", + out_fn, strerror(errno)); + write_vcf_bcf_format(sr, hdr, vcf_out, fmt_type); + if (vcf_close(vcf_out) != 0) + error("Error on closing \"%s\"\n", out_fn); } - free(str.s); + if (sr->errnum) error("Synced reader error: %s\n", + bcf_sr_strerror(sr->errnum)); + bcf_sr_destroy(sr); - for (i=0; i Date: Tue, 20 Jun 2023 12:25:05 +0100 Subject: [PATCH 46/70] Add synced reader region tests, and move no-index tests Add some tests to exercise the --regions / --targets synced reader options. Currently this only includes tests for the chromosomes with [:-] characters in the name, but it could be expanded easily to do others. Test files have been borrowed from pull request samtools/bcftools#1938. Move the synced reader no-index tests from test-bcf-sr.pl to test.pl. The former isn't a good place for them as it gets called 10 times, but the no-index test only needs to run once. It also allows the code running the test to be simplified a bit. Also fix the exit code on test-bcf-sr.pl failure from -1 to 1. 
Co-authored-by: Petr Danecek --- test/bcf-sr/weird-chr-names.1.out | 9 +++ test/bcf-sr/weird-chr-names.2.out | 8 +++ test/bcf-sr/weird-chr-names.3.out | 9 +++ test/bcf-sr/weird-chr-names.4.out | 8 +++ test/bcf-sr/weird-chr-names.5.out | 9 +++ test/bcf-sr/weird-chr-names.6.out | 8 +++ test/bcf-sr/weird-chr-names.vcf | 12 ++++ test/test-bcf-sr.pl | 60 +--------------- test/test.pl | 109 ++++++++++++++++++++++++++++++ 9 files changed, 173 insertions(+), 59 deletions(-) create mode 100644 test/bcf-sr/weird-chr-names.1.out create mode 100644 test/bcf-sr/weird-chr-names.2.out create mode 100644 test/bcf-sr/weird-chr-names.3.out create mode 100644 test/bcf-sr/weird-chr-names.4.out create mode 100644 test/bcf-sr/weird-chr-names.5.out create mode 100644 test/bcf-sr/weird-chr-names.6.out create mode 100644 test/bcf-sr/weird-chr-names.vcf diff --git a/test/bcf-sr/weird-chr-names.1.out b/test/bcf-sr/weird-chr-names.1.out new file mode 100644 index 000000000..5705c7575 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . +1 2 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.2.out b/test/bcf-sr/weird-chr-names.2.out new file mode 100644 index 000000000..980818a71 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.2.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.3.out b/test/bcf-sr/weird-chr-names.3.out new file mode 100644 index 000000000..5b3ac8e18 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.3.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . +1:1 2 . C T . . . 
diff --git a/test/bcf-sr/weird-chr-names.4.out b/test/bcf-sr/weird-chr-names.4.out new file mode 100644 index 000000000..0d9e274ab --- /dev/null +++ b/test/bcf-sr/weird-chr-names.4.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.5.out b/test/bcf-sr/weird-chr-names.5.out new file mode 100644 index 000000000..6cb41e14f --- /dev/null +++ b/test/bcf-sr/weird-chr-names.5.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . C T . . . +1:1-1 2 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.6.out b/test/bcf-sr/weird-chr-names.6.out new file mode 100644 index 000000000..a707ed85c --- /dev/null +++ b/test/bcf-sr/weird-chr-names.6.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.vcf b/test/bcf-sr/weird-chr-names.vcf new file mode 100644 index 000000000..c367be477 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.3 +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . +1 2 . C T . . . +1:1 1 . C T . . . +1:1 2 . C T . . . +1:1-1 1 . C T . . . +1:1-1 2 . C T . . . 
diff --git a/test/test-bcf-sr.pl b/test/test-bcf-sr.pl index cd9859c14..2e290cb3a 100755 --- a/test/test-bcf-sr.pl +++ b/test/test-bcf-sr.pl @@ -34,7 +34,6 @@ my $opts = parse_params(); run_test($opts); -test_no_index($opts); exit; @@ -58,7 +57,7 @@ sub error " -v, --verbose \n", " -h, -?, --help This help message\n", "\n"; - exit -1; + exit 1; } sub parse_params { @@ -572,60 +571,3 @@ sub pairing_score } return (1<<(28+$min)) + $cnt; } - -sub test_no_index { - my ($opts) = @_; - - my $vcfdir = "$FindBin::Bin/bcf-sr"; - if ($^O =~ /^msys/) { - $vcfdir = `cygpath -w $vcfdir`; - $vcfdir =~ s/\r?\n//; - $vcfdir =~ s/\\/\\\\/g; - } - - # Positive test - open(my $fh, '>', "$$opts{tmp}/no_index_1.txt") - || error("$$opts{tmp}/no_index_1.txt : $!"); - print $fh "$vcfdir/merge.noidx.a.vcf\n"; - print $fh "$vcfdir/merge.noidx.b.vcf\n"; - print $fh "$vcfdir/merge.noidx.c.vcf\n"; - close($fh) || error("$$opts{tmp}/no_index_1.txt : $!"); - - my $cmd = "$FindBin::Bin/test-bcf-sr --no-index -p all $$opts{tmp}/no_index_1.txt > $$opts{tmp}/no_index_1.out 2> $$opts{tmp}/no_index_1.err"; - my ($ret) = _cmd($cmd); - if ($ret) { - error("The command failed [$ret]: $cmd\n"); - } - - if ($^O =~ /^msys/) { - cmd("diff --strip-trailing-cr $vcfdir/merge.noidx.abc.expected.out $$opts{tmp}/no_index_1.out"); - } else { - cmd("cmp $vcfdir/merge.noidx.abc.expected.out $$opts{tmp}/no_index_1.out"); - } - - # Check bad input detection - - my @bad_file_tests = (["out-of-order header", - ["merge.noidx.a.vcf", "merge.noidx.hdr_order.vcf"]], - ["out-of-order records", - ["merge.noidx.a.vcf", "merge.noidx.rec_order.vcf"]], - ["out-of-order records", - ["merge.noidx.rec_order.vcf", "merge.noidx.a.vcf"]]); - my $count = 2; - foreach my $test_params (@bad_file_tests) { - my ($badness, $inputs) = @$test_params; - open($fh, '>', "$$opts{tmp}/no_index_$count.txt") - || error("$$opts{tmp}/no_index_$count.txt : $!"); - foreach my $input (@$inputs) { - print $fh "$vcfdir/$input\n"; - } - close($fh) || 
error("$$opts{tmp}/no_index_$count.txt : $!"); - - $cmd = "$FindBin::Bin/test-bcf-sr --no-index -p all $$opts{tmp}/no_index_$count.txt > $$opts{tmp}/no_index_$count.out 2> $$opts{tmp}/no_index_$count.err"; - my ($ret) = _cmd($cmd); - if ($ret == 0) { - error("Failed to detect $badness: $cmd\n"); - } - $count++; - } -} diff --git a/test/test.pl b/test/test.pl index 368bd4f18..f8e94faa3 100755 --- a/test/test.pl +++ b/test/test.pl @@ -54,6 +54,8 @@ test_vcf_sweep($opts,out=>'test-vcf-sweep.out'); test_vcf_various($opts); test_bcf_sr_sort($opts); +test_bcf_sr_no_index($opts); +test_bcf_sr_range($opts); test_command($opts,cmd=>'test-bcf-translate -',out=>'test-bcf-translate.out'); test_convert_padded_header($opts); test_rebgzip($opts); @@ -1033,6 +1035,113 @@ sub test_bcf_sr_sort } } +sub test_bcf_sr_no_index { + my ($opts) = @_; + + my $test = "test_bcf_sr_no_index"; + + my $vcfdir = "$$opts{path}/bcf-sr"; + + # Positive test + test_cmd($opts, out => "bcf-sr/merge.noidx.abc.expected.out", + cmd => "$$opts{path}/test-bcf-sr --no-index -p all --args $vcfdir/merge.noidx.a.vcf $vcfdir/merge.noidx.b.vcf $vcfdir/merge.noidx.c.vcf 2> $$opts{tmp}/no_index_1.err"); + + # Check bad input detection + + my @bad_file_tests = (["out-of-order header", + ["merge.noidx.a.vcf", "merge.noidx.hdr_order.vcf"]], + ["out-of-order records", + ["merge.noidx.a.vcf", "merge.noidx.rec_order.vcf"]], + ["out-of-order records", + ["merge.noidx.rec_order.vcf", "merge.noidx.a.vcf"]]); + my $count = 2; + foreach my $test_params (@bad_file_tests) { + my ($badness, $inputs) = @$test_params; + my @ins = map { "$vcfdir/$_" } @$inputs; + + my $cmd = "$$opts{path}/test-bcf-sr --no-index -p all --args @ins > $$opts{tmp}/no_index_$count.out 2> $$opts{tmp}/no_index_$count.err"; + print "$test:\n\t$cmd (expected fail)\n"; + my ($ret) = _cmd($cmd); + if ($ret == 0) { + failed($opts, $test, "Failed to detect $badness: $cmd\n"); + } else { + passed($opts, $test); + } + $count++; + } +} + +sub test_bcf_sr_range { + 
my ($opts) = @_; + + my $test = "test_bcf_sr_range"; + + my $vcfdir = "$$opts{path}/bcf-sr"; + + my @tests = (['r', '1', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1-2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1,1:2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1-1', 'weird-chr-names.vcf', 'weird-chr-names.2.out'], + ['r', '{1:1}', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1,{1:1}:2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.4.out'], + ['r', '{1:1-1}', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1,{1:1-1}:2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.6.out'], + ['r', '{1:1-1}-2', 'weird-chr-names.vcf', undef], # Expected failure + ['t', '1', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1-2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1,1:2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1-1', 'weird-chr-names.vcf', 'weird-chr-names.2.out'], + ['t', '{1:1}', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1,{1:1}:2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.4.out'], + ['t', '{1:1-1}', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1,{1:1-1}:2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.6.out'], + ['t', '{1:1-1}-2', 'weird-chr-names.vcf', undef] # Expected failure + ); + + my $count = 0; + my %converted; + foreach my $tst (@tests) { + my 
($option, $range, $in, $exp_out) = @$tst; + $count++; + if (!$converted{$in}) { + my $cmd = "$$opts{path}/test_view -b -p $$opts{tmp}/$in.bcf -x $$opts{tmp}/$in.bcf.csi $vcfdir/$in"; + print "$test:\n\t$cmd\n"; + my ($ret) = _cmd($cmd); + if ($ret) { + failed($opts, $test); + $converted{$in} = 'fail'; + next; + } else { + passed($opts, $test); + $converted{$in} = "$$opts{tmp}/$in.bcf"; + } + } + next if ($converted{$in} eq 'fail'); + my $cmd = "$$opts{path}/test-bcf-sr -O vcf -o $$opts{tmp}/range_test_$count.out.vcf -$option '$range' --args $converted{$in}"; + if ($exp_out) { + test_compare($opts, $cmd, "$vcfdir/$exp_out", + "$$opts{tmp}/range_test_$count.out.vcf", + fix_newlines => 1); + } else { + print "$test:\n\t$cmd (expected fail)\n"; + my ($ret) = _cmd($cmd); + if ($ret) { + passed($opts, $test); + } else { + failed($opts, $test); + } + } + } +} + sub test_command { my ($opts, %args) = @_; From f4a3b994be8f7904caeb5d58eaa14952ad39f2ea Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 6 Jul 2023 09:38:17 +0100 Subject: [PATCH 47/70] Fix a containment bug in cram_index_last. The index is loaded into a nested containment list, so the last entry in the index array is not necessarily the last slice, as the last slice may be entirely contained within a previous one. Fixes #1639 --- cram/cram_index.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 0cc606f42..39bc7cae0 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -511,7 +511,13 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { slice = fd->index[refid+1].nslice - 1; - return &from->e[slice]; + // e is the last entry in the nested containment list, but it may + // contain further slices within it. 
+ cram_index *e = &from->e[slice]; + while (e->e_next) + e = e->e_next; + + return e; } /* From 84bf64be293d15dc50ec84d060681e19954b7265 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 28 Jun 2023 12:15:10 +0100 Subject: [PATCH 48/70] Migrate base modification code out of sam.c Sam.c is already a behemoth over 6,000 lines long. It's hard to find things in it, and splitting by domain (with a sam prefix) makes life easier as a developer. Note: when sorted by file size, the split off code is already close to the median, so this isn't creating pointlessly small files. The functions are still declared in sam.h, so there is no API change. This commit changes no functionality as it's simply code migration. --- Makefile | 2 + sam.c | 514 -------------------------------------------------- sam_mods.c | 543 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 545 insertions(+), 514 deletions(-) create mode 100644 sam_mods.c diff --git a/Makefile b/Makefile index 0cfec6e6b..83b49609d 100644 --- a/Makefile +++ b/Makefile @@ -214,6 +214,7 @@ LIBHTS_OBJS = \ regidx.o \ region.o \ sam.o \ + sam_mods.o \ synced_bcf_reader.o \ vcf_sweep.o \ tbx.o \ @@ -457,6 +458,7 @@ hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_ hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) +sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) 
$(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) diff --git a/sam.c b/sam.c index d77ac7f9d..fc4e677df 100644 --- a/sam.c +++ b/sam.c @@ -6105,517 +6105,3 @@ void bam_mplp_destructor(bam_mplp_t iter, } #endif // ~!defined(BAM_NO_PILEUP) - -// --------------------------- -// Base Modification retrieval -// -// These operate by recording state in an opaque type, allocated and freed -// via the functions below. -// -// Initially we call bam_parse_basemod to process the tags and record the -// modifications in the state structure, and then functions such as -// bam_next_basemod can iterate over this cached state. - -/* - * Base modification are stored in MM/Mm tags as defined as - * - * ::= | "" - * ::= - * - * ::= "A" | "C" | "G" | "T" | "N". - * - * ::= "+" | "-". - * - * ::= | - * ::= | - * ::= - * ::= - * - * ::= "," | ";" - * - * We do not allocate additional memory other than the fixed size - * state, thus we track up to 256 pointers to different locations - * within the MM and ML tags. Each pointer is for a distinct - * modification code (simple or ChEBI), meaning some may point to the - * same delta-list when multiple codes are combined together - * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. - * - * Each numeric in the delta-list is tracked in MMcount[], counted - * down until it hits zero in which case the next delta is fetched. - * - * ML array similarly holds the locations in the quality (ML) tag per - * type, but these are interleaved so C+mhfc,10,15 will have 4 types - * all pointing to the same delta position, but in ML we store - * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). 
This ML - * also has MLstride indicating how many positions along ML to jump - * each time we consume a base. (4 in our above example, but usually 1 - * for the simple case). - * - * One complexity of the base modification system is that mods are - * always stored in the original DNA orientation. This is so that - * tools that may reverse-complement a sequence (eg "samtools fastq -T - * MM,ML") can pass through these modification tags irrespective of - * whether they have any knowledge of their internal workings. - * - * Because we don't wish to allocate extra memory, we cannot simply - * reverse the MM and ML tags. Sadly this means we have to manage the - * reverse complementing ourselves on-the-fly. - * For reversed reads we start at the right end of MM and no longer - * stop at the semicolon. Instead we use MMend[] array to mark the - * termination point. - */ -#define MAX_BASE_MOD 256 -struct hts_base_mod_state { - int type[MAX_BASE_MOD]; // char or minus-CHEBI - int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) - char strand[MAX_BASE_MOD]; // strand of modification; + or - - int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod - char *MM[MAX_BASE_MOD]; // next pos delta (string) - char *MMend[MAX_BASE_MOD]; // end of pos-delta string - uint8_t *ML[MAX_BASE_MOD]; // next qual - int MLstride[MAX_BASE_MOD]; // bytes between quals for this type - int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? - int seq_pos; // current position along sequence - int nmods; // used array size (0 to MAX_BASE_MOD-1). 
-}; - -hts_base_mod_state *hts_base_mod_state_alloc(void) { - return calloc(1, sizeof(hts_base_mod_state)); -} - -void hts_base_mod_state_free(hts_base_mod_state *state) { - free(state); -} - -/* - * Count frequency of A, C, G, T and N canonical bases in the sequence - */ -static void seq_freq(const bam1_t *b, int freq[16]) { - int i; - - memset(freq, 0, 16*sizeof(*freq)); - uint8_t *seq = bam_get_seq(b); - for (i = 0; i < b->core.l_qseq; i++) - freq[bam_seqi(seq, i)]++; - freq[15] = b->core.l_qseq; // all bases count as N for base mods -} - -//0123456789ABCDEF -//=ACMGRSVTWYHKDBN aka seq_nt16_str[] -//=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str -//084C2A6E195D3B7F -static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; - -/* - * Parse the MM and ML tags to populate the base mod state. - * This structure will have been previously allocated via - * hts_base_mod_state_alloc, but it does not need to be repeatedly - * freed and allocated for each new bam record. (Although obviously - * it requires a new call to this function.) - * - */ -int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { - - //reset position, else upcoming calls may fail on seq pos - length comparison - state->seq_pos = 0; - - // Read MM and ML tags - uint8_t *mm = bam_aux_get(b, "MM"); - if (!mm) mm = bam_aux_get(b, "Mm"); - if (!mm) - return 0; - if (mm[0] != 'Z') { - hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); - return -1; - } - - uint8_t *mi = bam_aux_get(b, "MN"); - if (mi && bam_aux2i(mi) != b->core.l_qseq) { - // bam_aux2i with set errno = EINVAL and return 0 if the tag - // isn't integer, but 0 will be a seq-length mismatch anyway so - // triggers an error here too. 
- hts_log_error("%s: MM/MN data length is incompatible with" - " SEQ length", bam_get_qname(b)); - return -1; - } - - uint8_t *ml = bam_aux_get(b, "ML"); - if (!ml) ml = bam_aux_get(b, "Ml"); - if (ml && (ml[0] != 'B' || ml[1] != 'C')) { - hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); - return -1; - } - uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; - if (ml) ml += 6; - - // Aggregate freqs of ACGTN if reversed, to get final-delta (later) - int freq[16]; - if (b->core.flag & BAM_FREVERSE) - seq_freq(b, freq); - - char *cp = (char *)mm+1; - int mod_num = 0; - int implicit = 1; - while (*cp) { - for (; *cp; cp++) { - // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; - unsigned char btype = *cp++; - - if (btype != 'A' && btype != 'C' && - btype != 'G' && btype != 'T' && - btype != 'U' && btype != 'N') - return -1; - if (btype == 'U') btype = 'T'; - - btype = seq_nt16_table[btype]; - - // Strand - if (*cp != '+' && *cp != '-') - return -1; // malformed - char strand = *cp++; - - // List of modification types - char *ms = cp, *me; // mod code start and end - char *cp_end = NULL; - int chebi = 0; - if (isdigit_c(*cp)) { - chebi = strtol(cp, &cp_end, 10); - cp = cp_end; - ms = cp-1; - } else { - while (*cp && isalpha_c(*cp)) - cp++; - if (*cp == '\0') - return -1; - } - - me = cp; - - // Optional explicit vs implicit marker - if (*cp == '.') { - // default is implicit = 1; - cp++; - } else if (*cp == '?') { - implicit = 0; - cp++; - } else if (*cp != ',' && *cp != ';') { - // parse error - return -1; - } - - long delta; - int n = 0; // nth symbol in a multi-mod string - int stride = me-ms; - int ndelta = 0; - - if (b->core.flag & BAM_FREVERSE) { - // We process the sequence in left to right order, - // but delta is successive count of bases to skip - // counting right to left. This also means the number - // of bases to skip at left edge is unrecorded (as it's - // the remainder). 
- // - // To output mods in left to right, we step through the - // MM list in reverse and need to identify the left-end - // "remainder" delta. - int total_seq = 0; - for (;;) { - cp += (*cp == ','); - if (*cp == 0 || *cp == ';') - break; - - delta = strtol(cp, &cp_end, 10); - if (cp_end == cp) { - hts_log_error("%s: Hit end of MM tag. Missing " - "semicolon?", bam_get_qname(b)); - return -1; - } - - cp = cp_end; - total_seq += delta+1; - ndelta++; - } - delta = freq[seqi_rc[btype]] - total_seq; // remainder - } else { - delta = *cp == ',' - ? strtol(cp+1, &cp_end, 10) - : 0; - if (!cp_end) { - // empty list - delta = INT_MAX; - cp_end = cp+1; - } - } - // Now delta is first in list or computed remainder, - // and cp_end is either start or end of the MM list. - while (ms < me) { - state->type [mod_num] = chebi ? -chebi : *ms; - state->strand [mod_num] = (strand == '-'); - state->canonical[mod_num] = btype; - state->MLstride [mod_num] = stride; - state->implicit [mod_num] = implicit; - - if (delta < 0) { - hts_log_error("%s: MM tag refers to bases beyond sequence " - "length", bam_get_qname(b)); - return -1; - } - state->MMcount [mod_num] = delta; - if (b->core.flag & BAM_FREVERSE) { - state->MM [mod_num] = cp+1; - state->MMend[mod_num] = cp_end; - state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; - } else { - state->MM [mod_num] = cp_end; - state->MMend[mod_num] = NULL; - state->ML [mod_num] = ml ? 
ml+n : NULL; - } - - if (++mod_num >= MAX_BASE_MOD) { - hts_log_error("%s: Too many base modification types", - bam_get_qname(b)); - return -1; - } - ms++; n++; - } - - // Skip modification deltas - if (ml) { - if (b->core.flag & BAM_FREVERSE) { - ml += ndelta*stride; - } else { - while (*cp && *cp != ';') { - if (*cp == ',') - ml+=stride; - cp++; - } - } - if (ml > ml_end) { - hts_log_error("%s: Insufficient number of entries in ML " - "tag", bam_get_qname(b)); - return -1; - } - } else { - // cp_end already known if FREVERSE - if (cp_end && (b->core.flag & BAM_FREVERSE)) - cp = cp_end; - else - while (*cp && *cp != ';') - cp++; - } - if (!*cp) { - hts_log_error("%s: Hit end of MM tag. Missing semicolon?", - bam_get_qname(b)); - return -1; - } - } - } - - state->nmods = mod_num; - - return 0; -} - -/* - * Fills out mods[] with the base modifications found. - * Returns the number found (0 if none), which may be more than - * the size of n_mods if more were found than reported. - * Returns <= -1 on error. - * - * This always marches left to right along sequence, irrespective of - * reverse flag or modification strand. - */ -int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods) { - if (b->core.flag & BAM_FREVERSE) { - if (state->seq_pos < 0) - return -1; - } else { - if (state->seq_pos >= b->core.l_qseq) - return -1; - } - - int i, j, n = 0; - unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); - state->seq_pos++; - if (b->core.flag & BAM_FREVERSE) - base = seqi_rc[base]; - - for (i = 0; i < state->nmods; i++) { - if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) - continue; - - if (state->MMcount[i]-- > 0) - continue; - - char *MMptr = state->MM[i]; - if (n < n_mods) { - mods[n].modified_base = state->type[i]; - mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; - mods[n].strand = state->strand[i]; - mods[n].qual = state->ML[i] ? 
*state->ML[i] : -1; - } - n++; - if (state->ML[i]) - state->ML[i] += (b->core.flag & BAM_FREVERSE) - ? -state->MLstride[i] - : +state->MLstride[i]; - - if (b->core.flag & BAM_FREVERSE) { - // process MM list backwards - char *cp; - for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) - if (*cp == ',') - break; - state->MMend[i] = cp; - if (cp != state->MM[i]) - state->MMcount[i] = strtol(cp+1, NULL, 10); - else - state->MMcount[i] = INT_MAX; - } else { - if (*state->MM[i] == ',') - state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); - else - state->MMcount[i] = INT_MAX; - } - - // Multiple mods at the same coords. - for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { - if (n < n_mods) { - mods[n].modified_base = state->type[j]; - mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; - mods[n].strand = state->strand[j]; - mods[n].qual = state->ML[j] ? *state->ML[j] : -1; - } - n++; - state->MMcount[j] = state->MMcount[i]; - state->MM[j] = state->MM[i]; - if (state->ML[j]) - state->ML[j] += (b->core.flag & BAM_FREVERSE) - ? -state->MLstride[j] - : +state->MLstride[j]; - } - i = j-1; - } - - return n; -} - -/* - * Looks for the next location with a base modification. - */ -int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods, int *pos) { - if (state->seq_pos >= b->core.l_qseq) - return 0; - - // Look through state->MMcount arrays to see when the next lowest is - // per base type; - int next[16], freq[16] = {0}, i; - memset(next, 0x7f, 16*sizeof(*next)); - if (b->core.flag & BAM_FREVERSE) { - for (i = 0; i < state->nmods; i++) { - if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) - next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; - } - } else { - for (i = 0; i < state->nmods; i++) { - if (next[state->canonical[i]] > state->MMcount[i]) - next[state->canonical[i]] = state->MMcount[i]; - } - } - - // Now step through the sequence counting off base types. 
- for (i = state->seq_pos; i < b->core.l_qseq; i++) { - unsigned char bc = bam_seqi(bam_get_seq(b), i); - if (next[bc] <= freq[bc] || next[15] <= freq[15]) - break; - freq[bc]++; - if (bc != 15) // N - freq[15]++; - } - *pos = state->seq_pos = i; - - if (i >= b->core.l_qseq) { - // Check for more MM elements than bases present. - for (i = 0; i < state->nmods; i++) { - if (!(b->core.flag & BAM_FREVERSE) && - state->MMcount[i] < 0x7f000000) { - hts_log_warning("MM tag refers to bases beyond sequence length"); - return -1; - } - } - return 0; - } - - if (b->core.flag & BAM_FREVERSE) { - for (i = 0; i < state->nmods; i++) - state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; - } else { - for (i = 0; i < state->nmods; i++) - state->MMcount[i] -= freq[state->canonical[i]]; - } - - int r = bam_mods_at_next_pos(b, state, mods, n_mods); - return r > 0 ? r : 0; -} - -/* - * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. - * This can only march forwards along the read, but can do so by more than - * one base-pair. - * - * This makes it useful for calling from pileup iterators where qpos may - * start part way through a read for the first occurrence of that record. - */ -int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods) { - // FIXME: for now this is inefficient in implementation. - int r = 0; - while (state->seq_pos <= qpos) - if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) - break; - - return r; -} - -/* - * Returns the list of base modification codes provided for this - * alignment record as an array of character codes (+ve) or ChEBI numbers - * (negative). - * - * Returns the array, with *ntype filled out with the size. - * The array returned should not be freed. - * It is a valid pointer until the state is freed using - * hts_base_mod_free(). 
- */ -int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { - *ntype = state->nmods; - return state->type; -} - -/* - * Returns data about a specific modification type for the alignment record. - * Code is either positive (eg 'm') or negative for ChEBI numbers. - * - * Return 0 on success or -1 if not found. The strand, implicit and canonical - * fields are filled out if passed in as non-NULL pointers. - */ -int bam_mods_query_type(hts_base_mod_state *state, int code, - int *strand, int *implicit, char *canonical) { - // Find code entry - int i; - for (i = 0; i < state->nmods; i++) { - if (state->type[i] == code) - break; - } - if (i == state->nmods) - return -1; - - // Return data - if (strand) *strand = state->strand[i]; - if (implicit) *implicit = state->implicit[i]; - if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; - - return 0; -} diff --git a/sam_mods.c b/sam_mods.c new file mode 100644 index 000000000..68259a2ea --- /dev/null +++ b/sam_mods.c @@ -0,0 +1,543 @@ +/* sam_mods.c -- Base modification handling in SAM and BAM. + + Copyright (C) 2020-2023 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +#include "htslib/sam.h" +#include "textutils_internal.h" + +// --------------------------- +// Base Modification retrieval +// +// These operate by recording state in an opaque type, allocated and freed +// via the functions below. +// +// Initially we call bam_parse_basemod to process the tags and record the +// modifications in the state structure, and then functions such as +// bam_next_basemod can iterate over this cached state. + +/* + * Base modification are stored in MM/Mm tags as defined as + * + * ::= | "" + * ::= + * + * ::= "A" | "C" | "G" | "T" | "N". + * + * ::= "+" | "-". + * + * ::= | + * ::= | + * ::= + * ::= + * + * ::= "," | ";" + * + * We do not allocate additional memory other than the fixed size + * state, thus we track up to 256 pointers to different locations + * within the MM and ML tags. Each pointer is for a distinct + * modification code (simple or ChEBI), meaning some may point to the + * same delta-list when multiple codes are combined together + * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. + * + * Each numeric in the delta-list is tracked in MMcount[], counted + * down until it hits zero in which case the next delta is fetched. + * + * ML array similarly holds the locations in the quality (ML) tag per + * type, but these are interleaved so C+mhfc,10,15 will have 4 types + * all pointing to the same delta position, but in ML we store + * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). This ML + * also has MLstride indicating how many positions along ML to jump + * each time we consume a base. (4 in our above example, but usually 1 + * for the simple case). 
+ * + * One complexity of the base modification system is that mods are + * always stored in the original DNA orientation. This is so that + * tools that may reverse-complement a sequence (eg "samtools fastq -T + * MM,ML") can pass through these modification tags irrespective of + * whether they have any knowledge of their internal workings. + * + * Because we don't wish to allocate extra memory, we cannot simply + * reverse the MM and ML tags. Sadly this means we have to manage the + * reverse complementing ourselves on-the-fly. + * For reversed reads we start at the right end of MM and no longer + * stop at the semicolon. Instead we use MMend[] array to mark the + * termination point. + */ +#define MAX_BASE_MOD 256 +struct hts_base_mod_state { + int type[MAX_BASE_MOD]; // char or minus-CHEBI + int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) + char strand[MAX_BASE_MOD]; // strand of modification; + or - + int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod + char *MM[MAX_BASE_MOD]; // next pos delta (string) + char *MMend[MAX_BASE_MOD]; // end of pos-delta string + uint8_t *ML[MAX_BASE_MOD]; // next qual + int MLstride[MAX_BASE_MOD]; // bytes between quals for this type + int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? + int seq_pos; // current position along sequence + int nmods; // used array size (0 to MAX_BASE_MOD-1). 
+}; + +hts_base_mod_state *hts_base_mod_state_alloc(void) { + return calloc(1, sizeof(hts_base_mod_state)); +} + +void hts_base_mod_state_free(hts_base_mod_state *state) { + free(state); +} + +/* + * Count frequency of A, C, G, T and N canonical bases in the sequence + */ +static void seq_freq(const bam1_t *b, int freq[16]) { + int i; + + memset(freq, 0, 16*sizeof(*freq)); + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + freq[bam_seqi(seq, i)]++; + freq[15] = b->core.l_qseq; // all bases count as N for base mods +} + +//0123456789ABCDEF +//=ACMGRSVTWYHKDBN aka seq_nt16_str[] +//=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str +//084C2A6E195D3B7F +static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; + +/* + * Parse the MM and ML tags to populate the base mod state. + * This structure will have been previously allocated via + * hts_base_mod_state_alloc, but it does not need to be repeatedly + * freed and allocated for each new bam record. (Although obviously + * it requires a new call to this function.) + * + */ +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + // Reset position, else upcoming calls may fail on + // seq pos - length comparison + state->seq_pos = 0; + + // Read MM and ML tags + uint8_t *mm = bam_aux_get(b, "MM"); + if (!mm) mm = bam_aux_get(b, "Mm"); + if (!mm) + return 0; + if (mm[0] != 'Z') { + hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); + return -1; + } + + uint8_t *mi = bam_aux_get(b, "MN"); + if (mi && bam_aux2i(mi) != b->core.l_qseq) { + // bam_aux2i with set errno = EINVAL and return 0 if the tag + // isn't integer, but 0 will be a seq-length mismatch anyway so + // triggers an error here too. 
+ hts_log_error("%s: MM/MN data length is incompatible with" + " SEQ length", bam_get_qname(b)); + return -1; + } + + uint8_t *ml = bam_aux_get(b, "ML"); + if (!ml) ml = bam_aux_get(b, "Ml"); + if (ml && (ml[0] != 'B' || ml[1] != 'C')) { + hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); + return -1; + } + uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; + if (ml) ml += 6; + + // Aggregate freqs of ACGTN if reversed, to get final-delta (later) + int freq[16]; + if (b->core.flag & BAM_FREVERSE) + seq_freq(b, freq); + + char *cp = (char *)mm+1; + int mod_num = 0; + int implicit = 1; + while (*cp) { + for (; *cp; cp++) { + // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; + unsigned char btype = *cp++; + + if (btype != 'A' && btype != 'C' && + btype != 'G' && btype != 'T' && + btype != 'U' && btype != 'N') + return -1; + if (btype == 'U') btype = 'T'; + + btype = seq_nt16_table[btype]; + + // Strand + if (*cp != '+' && *cp != '-') + return -1; // malformed + char strand = *cp++; + + // List of modification types + char *ms = cp, *me; // mod code start and end + char *cp_end = NULL; + int chebi = 0; + if (isdigit_c(*cp)) { + chebi = strtol(cp, &cp_end, 10); + cp = cp_end; + ms = cp-1; + } else { + while (*cp && isalpha_c(*cp)) + cp++; + if (*cp == '\0') + return -1; + } + + me = cp; + + // Optional explicit vs implicit marker + if (*cp == '.') { + // default is implicit = 1; + cp++; + } else if (*cp == '?') { + implicit = 0; + cp++; + } else if (*cp != ',' && *cp != ';') { + // parse error + return -1; + } + + long delta; + int n = 0; // nth symbol in a multi-mod string + int stride = me-ms; + int ndelta = 0; + + if (b->core.flag & BAM_FREVERSE) { + // We process the sequence in left to right order, + // but delta is successive count of bases to skip + // counting right to left. This also means the number + // of bases to skip at left edge is unrecorded (as it's + // the remainder). 
+ // + // To output mods in left to right, we step through the + // MM list in reverse and need to identify the left-end + // "remainder" delta. + int total_seq = 0; + for (;;) { + cp += (*cp == ','); + if (*cp == 0 || *cp == ';') + break; + + delta = strtol(cp, &cp_end, 10); + if (cp_end == cp) { + hts_log_error("%s: Hit end of MM tag. Missing " + "semicolon?", bam_get_qname(b)); + return -1; + } + + cp = cp_end; + total_seq += delta+1; + ndelta++; + } + delta = freq[seqi_rc[btype]] - total_seq; // remainder + } else { + delta = *cp == ',' + ? strtol(cp+1, &cp_end, 10) + : 0; + if (!cp_end) { + // empty list + delta = INT_MAX; + cp_end = cp+1; + } + } + // Now delta is first in list or computed remainder, + // and cp_end is either start or end of the MM list. + while (ms < me) { + state->type [mod_num] = chebi ? -chebi : *ms; + state->strand [mod_num] = (strand == '-'); + state->canonical[mod_num] = btype; + state->MLstride [mod_num] = stride; + state->implicit [mod_num] = implicit; + + if (delta < 0) { + hts_log_error("%s: MM tag refers to bases beyond sequence " + "length", bam_get_qname(b)); + return -1; + } + state->MMcount [mod_num] = delta; + if (b->core.flag & BAM_FREVERSE) { + state->MM [mod_num] = cp+1; + state->MMend[mod_num] = cp_end; + state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; + } else { + state->MM [mod_num] = cp_end; + state->MMend[mod_num] = NULL; + state->ML [mod_num] = ml ? 
ml+n : NULL; + } + + if (++mod_num >= MAX_BASE_MOD) { + hts_log_error("%s: Too many base modification types", + bam_get_qname(b)); + return -1; + } + ms++; n++; + } + + // Skip modification deltas + if (ml) { + if (b->core.flag & BAM_FREVERSE) { + ml += ndelta*stride; + } else { + while (*cp && *cp != ';') { + if (*cp == ',') + ml+=stride; + cp++; + } + } + if (ml > ml_end) { + hts_log_error("%s: Insufficient number of entries in ML " + "tag", bam_get_qname(b)); + return -1; + } + } else { + // cp_end already known if FREVERSE + if (cp_end && (b->core.flag & BAM_FREVERSE)) + cp = cp_end; + else + while (*cp && *cp != ';') + cp++; + } + if (!*cp) { + hts_log_error("%s: Hit end of MM tag. Missing semicolon?", + bam_get_qname(b)); + return -1; + } + } + } + + state->nmods = mod_num; + + return 0; +} + +/* + * Fills out mods[] with the base modifications found. + * Returns the number found (0 if none), which may be more than + * the size of n_mods if more were found than reported. + * Returns <= -1 on error. + * + * This always marches left to right along sequence, irrespective of + * reverse flag or modification strand. + */ +int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + if (b->core.flag & BAM_FREVERSE) { + if (state->seq_pos < 0) + return -1; + } else { + if (state->seq_pos >= b->core.l_qseq) + return -1; + } + + int i, j, n = 0; + unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); + state->seq_pos++; + if (b->core.flag & BAM_FREVERSE) + base = seqi_rc[base]; + + for (i = 0; i < state->nmods; i++) { + if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) + continue; + + if (state->MMcount[i]-- > 0) + continue; + + char *MMptr = state->MM[i]; + if (n < n_mods) { + mods[n].modified_base = state->type[i]; + mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; + mods[n].strand = state->strand[i]; + mods[n].qual = state->ML[i] ? 
*state->ML[i] : -1; + } + n++; + if (state->ML[i]) + state->ML[i] += (b->core.flag & BAM_FREVERSE) + ? -state->MLstride[i] + : +state->MLstride[i]; + + if (b->core.flag & BAM_FREVERSE) { + // process MM list backwards + char *cp; + for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) + if (*cp == ',') + break; + state->MMend[i] = cp; + if (cp != state->MM[i]) + state->MMcount[i] = strtol(cp+1, NULL, 10); + else + state->MMcount[i] = INT_MAX; + } else { + if (*state->MM[i] == ',') + state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); + else + state->MMcount[i] = INT_MAX; + } + + // Multiple mods at the same coords. + for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { + if (n < n_mods) { + mods[n].modified_base = state->type[j]; + mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; + mods[n].strand = state->strand[j]; + mods[n].qual = state->ML[j] ? *state->ML[j] : -1; + } + n++; + state->MMcount[j] = state->MMcount[i]; + state->MM[j] = state->MM[i]; + if (state->ML[j]) + state->ML[j] += (b->core.flag & BAM_FREVERSE) + ? -state->MLstride[j] + : +state->MLstride[j]; + } + i = j-1; + } + + return n; +} + +/* + * Looks for the next location with a base modification. + */ +int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods, int *pos) { + if (state->seq_pos >= b->core.l_qseq) + return 0; + + // Look through state->MMcount arrays to see when the next lowest is + // per base type; + int next[16], freq[16] = {0}, i; + memset(next, 0x7f, 16*sizeof(*next)); + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) { + if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) + next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; + } + } else { + for (i = 0; i < state->nmods; i++) { + if (next[state->canonical[i]] > state->MMcount[i]) + next[state->canonical[i]] = state->MMcount[i]; + } + } + + // Now step through the sequence counting off base types. 
+ for (i = state->seq_pos; i < b->core.l_qseq; i++) { + unsigned char bc = bam_seqi(bam_get_seq(b), i); + if (next[bc] <= freq[bc] || next[15] <= freq[15]) + break; + freq[bc]++; + if (bc != 15) // N + freq[15]++; + } + *pos = state->seq_pos = i; + + if (i >= b->core.l_qseq) { + // Check for more MM elements than bases present. + for (i = 0; i < state->nmods; i++) { + if (!(b->core.flag & BAM_FREVERSE) && + state->MMcount[i] < 0x7f000000) { + hts_log_warning("MM tag refers to bases beyond sequence length"); + return -1; + } + } + return 0; + } + + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; + } else { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[state->canonical[i]]; + } + + int r = bam_mods_at_next_pos(b, state, mods, n_mods); + return r > 0 ? r : 0; +} + +/* + * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. + * This can only march forwards along the read, but can do so by more than + * one base-pair. + * + * This makes it useful for calling from pileup iterators where qpos may + * start part way through a read for the first occurrence of that record. + */ +int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + // FIXME: for now this is inefficient in implementation. + int r = 0; + while (state->seq_pos <= qpos) + if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) + break; + + return r; +} + +/* + * Returns the list of base modification codes provided for this + * alignment record as an array of character codes (+ve) or ChEBI numbers + * (negative). + * + * Returns the array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). 
+ */ +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { + *ntype = state->nmods; + return state->type; +} + +/* + * Returns data about a specific modification type for the alignment record. + * Code is either positive (eg 'm') or negative for ChEBI numbers. + * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical) { + // Find code entry + int i; + for (i = 0; i < state->nmods; i++) { + if (state->type[i] == code) + break; + } + if (i == state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} From 60ea4e06f01efa7b4c89c91904ab16eeda6557eb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Jun 2023 11:49:37 +0100 Subject: [PATCH 49/70] Correct base modification implicit / explicit status when mixed together. We didn't reset back to implicit after an explicit mod, so "C+m?,4;G+o,2;" would set "m" to be explicit (?) and also leave "o" as explicit. --- sam_mods.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sam_mods.c b/sam_mods.c index 68259a2ea..4b6172e16 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -214,6 +214,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { me = cp; // Optional explicit vs implicit marker + implicit = 1; if (*cp == '.') { // default is implicit = 1; cp++; From 27e813cfa3ad1cf866ffb83fc2d9eb41daafe257 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Jun 2023 11:57:33 +0100 Subject: [PATCH 50/70] Add a bam_mods_queryi interface. This allows querying by the i^{th} modification type rather than by code. This is useful when we have multiple mods with differing meta-data, such as "C+m.,4;G-m?,1;". 
The previous bam_mods_query_type isn't sufficient as it's the same code "m" being used. Fixes #1550 --- htslib/sam.h | 20 ++++++++++++++++++++ sam_mods.c | 19 +++++++++++++++++++ test/base_mods/MM-chebi.out | 2 +- test/base_mods/MM-double.out | 2 +- test/base_mods/MM-explicit-x.out | 6 +++--- test/base_mods/MM-explicit.out | 6 +++--- test/base_mods/MM-multi.out | 4 ++-- test/test_mod.c | 7 ++++++- 8 files changed, 55 insertions(+), 11 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index fe5b1ebdc..3ba29f001 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2349,6 +2349,26 @@ HTSLIB_EXPORT int bam_mods_query_type(hts_base_mod_state *state, int code, int *strand, int *implicit, char *canonical); +/// Returns data about the i^th modification type for the alignment record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param i Modification index, from 0 to ntype-1 + * @param strand Boolean for top (0) or bottom (1) strand + * @param implicit Boolean for whether unlisted positions should be + * implicitly assumed to be unmodified, or require an + * explicit score and should be considered as unknown. + * Returned. + * @param canonical Canonical base type associated with this modification + * Returned. + * + * @return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +HTSLIB_EXPORT +int bam_mods_queryi(hts_base_mod_state *state, int i, + int *strand, int *implicit, char *canonical); + /// Returns the list of base modification codes provided for this /// alignment record as an array of character codes (+ve) or ChEBI numbers /// (negative). diff --git a/sam_mods.c b/sam_mods.c index 4b6172e16..e5bdb1b6c 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -542,3 +542,22 @@ int bam_mods_query_type(hts_base_mod_state *state, int code, return 0; } + +/* + * Returns data about the ith modification type for the alignment record. 
+ * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_queryi(hts_base_mod_state *state, int i, + int *strand, int *implicit, char *canonical) { + if (i < 0 || i >= state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index a6e7654cf..8df8130df 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -35,7 +35,7 @@ 34 C C+m204 C+(76792)33 35 A --- -Present: m #-76792 n +Present: m. #-76792. n. 6 C C+m102 15 N N+n212 17 C C+m128 diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out index e21ae314e..e346192b8 100644 --- a/test/base_mods/MM-double.out +++ b/test/base_mods/MM-double.out @@ -35,7 +35,7 @@ 34 A 35 T --- -Present: m m o +Present: m. m. o. 1 G G-m115 7 C C+m128 12 G G-m141 diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out index 4abedc719..4078543ca 100644 --- a/test/base_mods/MM-explicit-x.out +++ b/test/base_mods/MM-explicit-x.out @@ -24,7 +24,7 @@ 23 C 24 T --- -Present: m h +Present: m. h. 9 C C+m200 C+h10 10 C C+m50 C+h170 14 C C+m160 C+h20 @@ -57,7 +57,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m? h? 9 C C+m200 C+h10 10 C C+m50 C+h170 13 C C+m10 C+h5 @@ -92,7 +92,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m. h? 9 C C+m200 C+h10 10 C C+h170 13 C C+h5 diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out index f28b25f83..186e790ae 100644 --- a/test/base_mods/MM-explicit.out +++ b/test/base_mods/MM-explicit.out @@ -24,7 +24,7 @@ 23 C 24 T --- -Present: m h +Present: m. h. 
9 C C+m200 C+h10 10 C C+m50 C+h170 14 C C+m160 C+h20 @@ -57,7 +57,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m? h? 9 C C+m200 C+h10 10 C C+m50 C+h170 13 C C+m10 C+h5 @@ -92,7 +92,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m. h? 9 C C+m200 C+h10 10 C C+h170 13 C C+h5 diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out index e411a81ee..73b480b80 100644 --- a/test/base_mods/MM-multi.out +++ b/test/base_mods/MM-multi.out @@ -35,7 +35,7 @@ 34 C C+m230 C+h6 35 A --- -Present: m h n +Present: m. h. n. 6 C C+m128 15 N N+n215 17 C C+m153 @@ -84,7 +84,7 @@ Present: m h n 34 C C+m204 C+h31 35 A --- -Present: m h n +Present: m. h. n. 6 C C+m77 C+h159 15 N N+n240 17 C C+m103 C+h133 diff --git a/test/test_mod.c b/test/test_mod.c index 42768341d..1b53b9e91 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -166,8 +166,13 @@ int main(int argc, char **argv) { int all_mods_n = 0; all_mods = bam_mods_recorded(m, &all_mods_n); printf("Present:"); - for (i = 0; i < all_mods_n; i++) + for (i = 0; i < all_mods_n; i++) { + int m_strand, m_implicit; + char m_canonical; + bam_mods_queryi(m, i, &m_strand, &m_implicit, &m_canonical); printf(all_mods[i] > 0 ? " %c" : " #%d", all_mods[i]); + putchar("?."[m_implicit]); + } putchar('\n'); int pos; From fea4ef94bda646f17079a8a28e318ff9a46096c8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 28 Jun 2023 17:20:34 +0100 Subject: [PATCH 51/70] Add bam_parse_basemod2 API with additional flags argument. The only flag at the moment is HTS_MOD_REPORT_UNCHECKED. This changes bam_mods_at_next_pos to report modified bases with qual=HTS_MOD_UNCHECKED for explicit modification types that do not have coverage for this specific position. For consistency, also set the unknown qual from -1 to HTS_MOD_UNKNOWN (#defined to -1). This is used when ML is absent. Arguably this could be 255 to match things like unknown MAPQ, but this ship has sailed. 
The test/test_mod tool has a -f INT argument to specify the basemod2 flags. TODO: modify other functions too. TODO: add tests, based on MM-explicit.sam --- htslib/sam.h | 26 ++++++++++++++++++++++++++ sam_mods.c | 31 +++++++++++++++++++++++++++---- test/test_mod.c | 25 ++++++++++++++++++++----- 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 3ba29f001..483cc67d3 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2224,6 +2224,12 @@ typedef struct hts_base_mod { int qual; } hts_base_mod; +#define HTS_MOD_UNKNOWN -1 // In MM but no ML +#define HTS_MOD_UNCHECKED -2 // Not in MM and in explicit mode + +// Flags for hts_parse_basemod2 +#define HTS_MOD_REPORT_UNCHECKED 1 + /// Allocates an hts_base_mode_state. /** * @return An hts_base_mode_state pointer on success, @@ -2260,6 +2266,22 @@ void hts_base_mod_state_free(hts_base_mod_state *state); HTSLIB_EXPORT int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state); +/// Parses the Mm and Ml tags out of a bam record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param flags A bit-field controlling base modification processing + * + * @return 0 on success, + * -1 on failure. + * + * This fills out the contents of the modification state, resetting the + * iterator location to the first sequence base. + */ +HTSLIB_EXPORT +int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, + uint32_t flags); + /// Returns modification status for the next base position in the query seq. 
/** * @param b BAM alignment record @@ -2281,6 +2303,10 @@ HTSLIB_EXPORT int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods); +HTSLIB_EXPORT +int bam_mods_at_next_pos2(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods); + /// Finds the next location containing base modifications and returns them /** * @param b BAM alignment record diff --git a/sam_mods.c b/sam_mods.c index e5bdb1b6c..245e96e1c 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -99,6 +99,7 @@ struct hts_base_mod_state { int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? int seq_pos; // current position along sequence int nmods; // used array size (0 to MAX_BASE_MOD-1). + uint32_t flags; // Bit-field: see HTS_MOD_REPORT_UNCHECKED }; hts_base_mod_state *hts_base_mod_state_alloc(void) { @@ -135,11 +136,17 @@ static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; * freed and allocated for each new bam record. (Although obviously * it requires a new call to this function.) * + * Flags are copied into the state and used to control reporting functions. + * Currently the only flag is HTS_MOD_REPORT_UNCHECKED, to control whether + * explicit "C+m?" mods report quality HTS_MOD_UNCHECKED for the bases + * outside the explicitly reported region. */ -int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { +int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, + uint32_t flags) { // Reset position, else upcoming calls may fail on // seq pos - length comparison state->seq_pos = 0; + state->flags = flags; // Read MM and ML tags uint8_t *mm = bam_aux_get(b, "MM"); @@ -339,6 +346,10 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { return 0; } +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + return bam_parse_basemod2(b, state, 0); +} + /* * Fills out mods[] with the base modifications found. 
* Returns the number found (0 if none), which may be more than @@ -365,20 +376,32 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, base = seqi_rc[base]; for (i = 0; i < state->nmods; i++) { + int unchecked = 0; if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) continue; - if (state->MMcount[i]-- > 0) - continue; + if (state->MMcount[i]-- > 0) { + if (!state->implicit[i] && + (state->flags & HTS_MOD_REPORT_UNCHECKED)) + unchecked = 1; + else + continue; + } char *MMptr = state->MM[i]; if (n < n_mods) { mods[n].modified_base = state->type[i]; mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; mods[n].strand = state->strand[i]; - mods[n].qual = state->ML[i] ? *state->ML[i] : -1; + mods[n].qual = unchecked + ? HTS_MOD_UNCHECKED + : (state->ML[i] ? *state->ML[i] : HTS_MOD_UNKNOWN); } n++; + + if (unchecked) + continue; + if (state->ML[i]) state->ML[i] += (b->core.flag & BAM_FREVERSE) ? -state->MLstride[i] diff --git a/test/test_mod.c b/test/test_mod.c index 1b53b9e91..43447bb36 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -88,6 +88,7 @@ static char *code(int id) { int main(int argc, char **argv) { char out[1024] = {0}; int extended = 0; + uint32_t flags = 0; if (argc > 1 && strcmp(argv[1], "-x") == 0) { extended = 1; @@ -95,6 +96,12 @@ int main(int argc, char **argv) { argc--; } + if (argc > 2 && strcmp(argv[1], "-f") == 0) { + flags = atoi(argv[2]); + argv+=2; + argc-=2; + } + if (argc < 2) return 1; @@ -110,7 +117,7 @@ int main(int argc, char **argv) { int r; while ((r = sam_read1(in, h, b)) >= 0) { - if (bam_parse_basemod(b, m) < 0) { + if (bam_parse_basemod2(b, m, flags) < 0) { fprintf(stderr, "Failed to parse MM/ML aux tags\n"); goto err; } @@ -124,6 +131,14 @@ int main(int argc, char **argv) { lp += snprintf(lp, ep - lp, "%d\t%c\t", i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); for (j = 0; j < n && j < 5; j++) { + char qstr[10]; + if (mods[j].qual == HTS_MOD_UNCHECKED) + qstr[0] = '#', qstr[1] = 0; + else 
if (mods[j].qual == HTS_MOD_UNKNOWN) + qstr[0] = '.', qstr[1] = 0; + else + snprintf(qstr, 10, "%d", mods[j].qual); + if (extended) { int m_strand, m_implicit; char m_canonical; @@ -134,18 +149,18 @@ int main(int argc, char **argv) { m_canonical != mods[j].canonical_base || m_strand != mods[j].strand) goto err; - lp += snprintf(lp, ep - lp, "%c%c%s%c%d ", + lp += snprintf(lp, ep - lp, "%c%c%s%c%s ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), "?."[m_implicit], - mods[j].qual); + qstr); } else { - lp += snprintf(lp, ep - lp, "%c%c%s%d ", + lp += snprintf(lp, ep - lp, "%c%c%s%s ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), - mods[j].qual); + qstr); } } *lp++ = '\n'; From a09710cdbef3745933e4c5de26cb76467782acaa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Jun 2023 16:25:43 +0100 Subject: [PATCH 52/70] Add more internal sam_mods.c documentation --- sam_mods.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/sam_mods.c b/sam_mods.c index 245e96e1c..f22baca04 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -38,6 +38,92 @@ DEALINGS IN THE SOFTWARE. */ // modifications in the state structure, and then functions such as // bam_next_basemod can iterate over this cached state. +/* Overview of API. + +We start by allocating an hts_base_mod_state and parsing the MM, ML and MN +tags into it. This has optional flags controlling how we report base +modifications in "explicit" coordinates. See below + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod2(b, m, HTS_MOD_REPORT_UNCHECKED); + // Or: bam_parse_basemod(b, m), which is equiv to flags==0 + //... do something ... + hts_base_mod_state_free(m); + +In the default implicit MM coordinate system, any location not +reported is implicitly assumed to contain no modification. We only +report the places we think are likely modified. 
+ +Some tools however only look for base modifications in particular +contexts, eg CpG islands. Here we need to distinguish between +not-looked-for and looked-for-but-didn't-find. These calls have an +explicit coordinate system, where we only know information about the +coordinates explicitly listed and everything else is considered to be +unverified. + +By default we don't get reports on the other coordinates in an +explicit MM tag, but the HTS_MOD_REPORT_UNCHECKED flag will report +them (with quality HTS_MOD_UNCHECKED) meaning we can do consensus +modification analysis with accurate counting when dealing with a +mixture of explicit and implicit records. + + +We have different ways of processing the base modifications. We can +iterate either mod-by-mod or position-by-position, or we can simply +query a specific coordinate as may be done when processing a pileup. + +To check for base modifications as a specific location within a +sequence we can use bam_mods_at_qpos. This provides complete random +access within the MM string. However currently this is inefficiently +implemented so should only be used for occasional analysis or as a way +to start iterating at a specific location. It modifies the state +position, so after the first use we can then switch to +bam_mods_at_next_pos to iterate position by position from then on. + + hts_base_mod mods[10]; + int n = bam_mods_at_qpos(b, pos, m, mods, 10); + +For base by base, we have bam_mods_at_next_pos. This strictly starts +at the first base and reports entries one at a time. It's more +efficient than a loop repeatedly calling ...at-pos. + + hts_base_mod mods[10]; + int n = bam_mods_at_next_pos(b, m, mods, 10); + for (int i = 0; i < n; i++) { + // report mod i of n + } + +Iterating over modifications instead of coordinates is simpler and +more efficient as it skips reporting of unmodified bases. This is +done with bam_next_basemod. Note this does not yet honour the +HTS_MOD_REPORT_UNCHECKED flag. 
+ + hts_base_mod mods[10]; + while ((n=bam_next_basemod(b, m, mods, 10, &pos)) > 0) { + for (j = 0; j < n; j++) { + // Report 'n'th mod at sequence position 'pos' + } + } + +There are also functions that query meta-data about the MM line rather +than per-site information. + +bam_mods_recorded returns an array of ints holding the +ve code ('m') +or -ve CHEBI numeric values. + + int ntypes, *types = bam_mods_recorded(m, &ntype); + +We can then query a specific modification type to get further +information on the strand it is operating on, whether it has implicit +or explicit coordinates, and what it's corresponding canonical base it +is (The "C" in "C+m"). bam_mods_query_type does this by code name, +while bam_mods_queryi does this by numeric i^{th} type (from 0 to ntype-1). + + bam_mods_query_type(m, 'c', &strand, &implicit, &canonical); + bam_mods_queryi(m, 2, &strand, &implicit, &canonical); + +*/ + /* * Base modification are stored in MM/Mm tags as defined as * From 7822d8d0553e3d93516956b588f9db8086204499 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 30 Jun 2023 15:25:37 +0100 Subject: [PATCH 53/70] Update bam_next_basemod too to cope with HTS_MOD_REPORT_UNCHECKED. 
Also improve tests --- htslib/sam.h | 4 -- sam_mods.c | 22 ++++-- test/base_mods/MM-explicit-f.out | 111 +++++++++++++++++++++++++++++++ test/base_mods/base-mods.tst | 5 ++ test/test_mod.c | 14 +++- 5 files changed, 143 insertions(+), 13 deletions(-) create mode 100644 test/base_mods/MM-explicit-f.out diff --git a/htslib/sam.h b/htslib/sam.h index 483cc67d3..cffa04701 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2303,10 +2303,6 @@ HTSLIB_EXPORT int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods); -HTSLIB_EXPORT -int bam_mods_at_next_pos2(const bam1_t *b, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods); - /// Finds the next location containing base modifications and returns them /** * @param b BAM alignment record diff --git a/sam_mods.c b/sam_mods.c index f22baca04..3eb042328 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -95,8 +95,7 @@ efficient than a loop repeatedly calling ...at-pos. Iterating over modifications instead of coordinates is simpler and more efficient as it skips reporting of unmodified bases. This is -done with bam_next_basemod. Note this does not yet honour the -HTS_MOD_REPORT_UNCHECKED flag. +done with bam_next_basemod. hts_base_mod mods[10]; while ((n=bam_next_basemod(b, m, mods, 10, &pos)) > 0) { @@ -534,7 +533,13 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, } /* - * Looks for the next location with a base modification. + * Return data at the next modified location. + * + * bam_mods_at_next_pos does quite a bit of work, so we don't want to + * repeatedly call it for every location until we find a mod. Instead + * we check how many base types we can consume before the next mod, + * and scan through the sequence looking for them. Once we're at that + * site, we defer back to bam_mods_at_next_pos for the return values. 
*/ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods, int *pos) { @@ -545,16 +550,21 @@ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, // per base type; int next[16], freq[16] = {0}, i; memset(next, 0x7f, 16*sizeof(*next)); + const int unchecked = state->flags & HTS_MOD_REPORT_UNCHECKED; if (b->core.flag & BAM_FREVERSE) { for (i = 0; i < state->nmods; i++) { - if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) + if (unchecked && !state->implicit[i]) + next[seqi_rc[state->canonical[i]]] = 1; + else if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; } } else { for (i = 0; i < state->nmods; i++) { - if (next[state->canonical[i]] > state->MMcount[i]) + if (unchecked && !state->implicit[i]) + next[state->canonical[i]] = 0; + else if (next[state->canonical[i]] > state->MMcount[i]) next[state->canonical[i]] = state->MMcount[i]; - } + } } // Now step through the sequence counting off base types. diff --git a/test/base_mods/MM-explicit-f.out b/test/base_mods/MM-explicit-f.out new file mode 100644 index 000000000..6462c99b9 --- /dev/null +++ b/test/base_mods/MM-explicit-f.out @@ -0,0 +1,111 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m. h. +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C C+m# C+h# +3 A +4 T +5 C C+m# C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+m# C+h# +23 C C+m# C+h# +24 T +--- +Present: m? h? 
+2 C C+m# C+h# +5 C C+m# C+h# +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 +22 C C+m# C+h# +23 C C+m# C+h# + +=== + +0 A +1 T +2 C C+h# +3 A +4 T +5 C C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+h# +23 C C+h# +24 T +--- +Present: m. h? +2 C C+h# +5 C C+h# +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 +22 C C+h# +23 C C+h# + +=== + diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 237f7906c..ff1f7651a 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -39,6 +39,11 @@ P MM-multi.out $test_mod MM-multi.sam P MM-explicit.out $test_mod MM-explicit.sam P MM-explicit-x.out $test_mod -x MM-explicit.sam +# Report bases outside the explicitly called ranges, so we could exclude +# these in any depth based consensus analysis and only gather statistics +# for sites known to be have been scanned. +P MM-explicit-f.out $test_mod -f 1 MM-explicit.sam + # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam P MM-pileup2.out $pileup_mod < MM-pileup2.sam diff --git a/test/test_mod.c b/test/test_mod.c index 43447bb36..e59da4827 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) { if (argc > 1) puts("---"); - bam_parse_basemod(b, m); + bam_parse_basemod2(b, m, flags); // List possible mod choices. 
int *all_mods; @@ -196,11 +196,19 @@ int main(int argc, char **argv) { lp += snprintf(lp, ep - lp, "%d\t%c\t", pos, seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); for (j = 0; j < n && j < 5; j++) { - lp += snprintf(lp, ep - lp, "%c%c%s%d ", + char qstr[10]; + if (mods[j].qual == HTS_MOD_UNCHECKED) + qstr[0] = '#', qstr[1] = 0; + else if (mods[j].qual == HTS_MOD_UNKNOWN) + qstr[0] = '.', qstr[1] = 0; + else + snprintf(qstr, 10, "%d", mods[j].qual); + + lp += snprintf(lp, ep - lp, "%c%c%s%s ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), - mods[j].qual); + qstr); } *lp++ = '\n'; *lp++ = 0; From 6e3e8dafa3c5c4b016fc1f0ac8f56eefcd1dda60 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 30 Jun 2023 16:16:41 +0100 Subject: [PATCH 54/70] Fix possible double frees in bcf_hdr_add_hrec() error handling bcf_hdr_add_hrec() should neither call bcf_hrec_destroy(hrec) nor store any pointers to hrec when it returns -1, otherwise double frees or stale pointer dereferences may result. Remove bcf_hrec_destroy(hrec) call that was incorrectly made when handling a hash table insert failure, and move hdr->hrec reallocation so that all possible failures occur before hrec is added into the header. Add bcf_hdr_add_hrec() documentation, including a warning that the caller should not touch hrec after a successful return. --- htslib/vcf.h | 16 ++++++++++++++++ vcf.c | 16 ++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 0d9f812ce..8bbf480c8 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -691,6 +691,22 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
HTSLIB_EXPORT int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str); + /// Add a header record into a header + /** + * @param hdr Destination header + * @param hrec Header record + * @return 0 on success, -1 on failure + * + * If this function returns success, ownership of @p hrec will have + * been transferred to the header structure. It may also have been + * freed if it was a duplicate of a record already in the header. + * Therefore the @p hrec pointer should not be used after a successful + * return from this function. + * + * If this function returns failure, ownership will not have been taken + * and the caller is responsible for cleaning up @p hrec. + */ + HTSLIB_EXPORT int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec); diff --git a/vcf.c b/vcf.c index 392d9c932..6dc7aec32 100644 --- a/vcf.c +++ b/vcf.c @@ -974,23 +974,27 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 0; } } + + // New record, needs to be added + int n = hdr->nhrec + 1; + bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); + if (!new_hrec) { + free(str.s); + return -1; + } + hdr->hrec = new_hrec; + if ( str.s ) { khint_t k = kh_put(hdict, aux->gen, str.s, &res); if ( res<0 ) { - bcf_hrec_destroy(hrec); free(str.s); return -1; } kh_val(aux->gen,k) = hrec; } - // New record, needs to be added - int n = hdr->nhrec + 1; - bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); - if (!new_hrec) return -1; - hdr->hrec = new_hrec; hdr->hrec[hdr->nhrec] = hrec; hdr->dirty = 1; hdr->nhrec = n; From 6ad0fffc8c71d617c8433665b75071f09ab423a3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 6 Jul 2023 15:29:15 +0100 Subject: [PATCH 55/70] Prevent dangling hrec pointer after bcf_hdr_add_hrec() failure Borrow a bit of code from bcf_hdr_remove() that removes hrec pointers from the hdr->dict[] dictionaries, turn it into bcf_hdr_unregister_hrec() and use it to clean up the dictionary should bcf_hdr_add_hrec() fail. 
There's actually only one place in bcf_hdr_add_hrec() where this needs to be called. In all other paths returning an error, either the hrec type is not one that needs to be cleaned up, or hrec will not have been added to the dictionary. bcf_hdr_remove() is updated to call the new function when it's removing all lines of a given type. The code handling lines with a specific key is unchanged as in that case it already has the key to look up in the dictionary and so doesn't need to hunt for it in the header record. --- vcf.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/vcf.c b/vcf.c index 6dc7aec32..9d4029dfc 100644 --- a/vcf.c +++ b/vcf.c @@ -886,6 +886,24 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } +static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + if (hrec->type == BCF_HL_FLT || + hrec->type == BCF_HL_INFO || + hrec->type == BCF_HL_FMT || + hrec->type == BCF_HL_CTG) { + int id = bcf_hrec_find_key(hrec, "ID"); + if (id < 0 || !hrec->vals[id]) + return; + vdict_t *dict = (hrec->type == BCF_HL_CTG + ? (vdict_t*)hdr->dict[BCF_DT_CTG] + : (vdict_t*)hdr->dict[BCF_DT_ID]); + khint_t k = kh_get(vdict, dict, hrec->vals[id]); + if (k != kh_end(dict)) + kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ?
0 : hrec->type] = NULL; + } +} + int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) { // currently only for bcf_hdr_set_version @@ -980,6 +998,7 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); if (!new_hrec) { free(str.s); + bcf_hdr_unregister_hrec(hdr, hrec); return -1; } hdr->hrec = new_hrec; @@ -1184,18 +1203,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) { if ( hdr->hrec[i]->type!=type ) { i++; continue; } hrec = hdr->hrec[i]; - - if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) - { - int j = bcf_hrec_find_key(hdr->hrec[i], "ID"); - if ( j>=0 ) - { - vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; - khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]); - kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; - } - } - + bcf_hdr_unregister_hrec(hdr, hrec); hdr->dirty = 1; hdr->nhrec--; if ( i < hdr->nhrec ) From 10f1516b75a9bb98dbf4a25c8dcfd01ff2d40975 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 6 Jul 2023 17:31:58 +0100 Subject: [PATCH 56/70] Remove items from hdict in bcf_hdr_remove() As bcf_hdr_remove() deletes hrec structs it needs to ensure that the pointers to them in the bcf_hdr_aux_t::gen dictionary are removed as well, otherwise callers to bcf_hdr_get_hrec() may get a stale pointer to the deleted item. Unfortunately bcf_hdr_remove_from_hdict() needs to allocate some memory to find the items in the dictionary. If that fails it falls back to a search through the dictionary values to find the item, so we can be sure that it will always succeed. Enhance tests to ensure bcf_hdr_get_hrec() returns NULL for removed records. 
--- test/test-vcf-api.c | 33 ++++++++++++++++++++++++++++++++- vcf.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 51a9f41f6..f016f1b3f 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -125,6 +125,7 @@ void write_bcf(char *fname) check0(bcf_hdr_append(hdr, "##INFO=")); check0(bcf_hdr_append(hdr, "##FILTER=")); check0(bcf_hdr_append(hdr, "##unused=")); + check0(bcf_hdr_append(hdr, "##unused=")); check0(bcf_hdr_append(hdr, "##unused=unformatted text 1")); check0(bcf_hdr_append(hdr, "##unused=unformatted text 2")); check0(bcf_hdr_append(hdr, "##contig=")); @@ -297,12 +298,42 @@ void bcf_to_vcf(char *fname) if (!out) error("Couldn't open \"%s\" : %s\n", gz_fname, strerror(errno)); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); - bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused"); + if (!bcf_hdr_get_hrec(hdr_out, BCF_HL_STR,"ID","BB","unused")) + error("Missing header ##unused="); + bcf_hdr_remove(hdr_out,BCF_HL_STR,"BB"); + if (bcf_hdr_get_hrec(hdr_out, BCF_HL_STR,"ID","BB","unused")) + error("Got pointer to deleted header ##unused="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_GEN,"unused","unformatted text 1",NULL)) + error("Missing header ##unused=unformatted text 1"); bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_GEN,"unused","unformatted text 1",NULL)) + error("Got pointer to deleted header ##unused=unformatted text 1"); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_FLT,"ID","Flt",NULL)) + error("Missing header ##FILTER="); bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FLT,"ID","Flt",NULL)) + error("Got pointer to deleted header ##FILTER="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_INFO,"ID","UI",NULL)) + error("Missing header ##INFO="); bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_INFO,"ID","UI",NULL)) + error("Got pointer to deleted header ##INFO="); + + 
if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","UF",NULL)) + error("Missing header ##INFO="); bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","UF",NULL)) + error("Got pointer to deleted header ##INFO="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_CTG,"ID","Unused",NULL)) + error("Missing header ##contig="); bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","Unused",NULL)) + error("Got pointer to header ##contig="); + if ( bcf_hdr_write(out, hdr_out)!=0 ) error("Failed to write to %s\n", fname); int r; while ((r = bcf_read1(fp, hdr, rec)) >= 0) diff --git a/vcf.c b/vcf.c index 9d4029dfc..9e589f993 100644 --- a/vcf.c +++ b/vcf.c @@ -904,6 +904,47 @@ static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } } +static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + kstring_t str = KS_INITIALIZE; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k; + int id; + + switch (hrec->type) { + case BCF_HL_GEN: + if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0) + str.l = 0; + break; + case BCF_HL_STR: + id = bcf_hrec_find_key(hrec, "ID"); + if (id < 0) + return; + if (!hrec->vals[id] || + ksprintf(&str, "##%s=", hrec->key, hrec->vals[id]) < 0) + str.l = 0; + break; + default: + return; + } + if (str.l) { + k = kh_get(hdict, aux->gen, str.s); + } else { + // Couldn't get a string for some reason, so try the hard way... 
+ for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) { + if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec) + break; + } + } + if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) { + kh_val(aux->gen, k) = NULL; + free((char *) kh_key(aux->gen, k)); + kh_key(aux->gen, k) = NULL; + kh_del(hdict, aux->gen, k); + } + free(str.s); +} + int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) { // currently only for bcf_hdr_set_version @@ -1204,6 +1245,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) if ( hdr->hrec[i]->type!=type ) { i++; continue; } hrec = hdr->hrec[i]; bcf_hdr_unregister_hrec(hdr, hrec); + bcf_hdr_remove_from_hdict(hdr, hrec); hdr->dirty = 1; hdr->nhrec--; if ( i < hdr->nhrec ) @@ -1245,6 +1287,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) } if ( i==hdr->nhrec ) return; hrec = hdr->hrec[i]; + bcf_hdr_remove_from_hdict(hdr, hrec); } hdr->nhrec--; From 2e672f33a860e60e6ce42b77a07713558fb0507d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 10 Jul 2023 09:42:22 +0100 Subject: [PATCH 57/70] Fix decompress_peek_gz to cope with files starting on empty gzip blocks. 
--- hts.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index f2bc5fcb6..5ce425212 100644 --- a/hts.c +++ b/hts.c @@ -324,10 +324,24 @@ decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) zs.avail_out = destsize; if (inflateInit2(&zs, 31) != Z_OK) return -1; - while (zs.total_out < destsize) - if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break; + int ret; + unsigned char *last_in = buffer; + while (zs.total_out < destsize) { + ret = inflate(&zs, Z_SYNC_FLUSH); + if (ret == Z_STREAM_END && zs.avail_in && zs.total_out < destsize) { + if (last_in == zs.next_in) + break; // paranoia to avoid potential looping + else + last_in = zs.next_in; + inflateReset(&zs); + continue; + } + if (ret != Z_OK) + break; + } - destsize = zs.total_out; + // zs.total_out can sometimes be wrong as inflateReset resets it + destsize = zs.next_out - dest; inflateEnd(&zs); return destsize; From a809db76f4fa9e1425599c9a0ca4999a1fd88b44 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 13 Jul 2023 14:42:35 +0100 Subject: [PATCH 58/70] Ensure number of modifications is always set in bam_parse_basemod2() Set state->nmods to 0 at the start, in case the function returns early. This ensures calls to bam_mods_at_next_pos() won't try to replay the mods from an earlier line. 
--- sam_mods.c | 1 + test/base_mods/MM-not-all-modded.out | 180 +++++++++++++++++++++++++++ test/base_mods/MM-not-all-modded.sam | 5 + test/base_mods/base-mods.tst | 3 + 4 files changed, 189 insertions(+) create mode 100644 test/base_mods/MM-not-all-modded.out create mode 100644 test/base_mods/MM-not-all-modded.sam diff --git a/sam_mods.c b/sam_mods.c index 3eb042328..fe8db85f7 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -231,6 +231,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, // Reset position, else upcoming calls may fail on // seq pos - length comparison state->seq_pos = 0; + state->nmods = 0; state->flags = flags; // Read MM and ML tags diff --git a/test/base_mods/MM-not-all-modded.out b/test/base_mods/MM-not-all-modded.out new file mode 100644 index 000000000..f7e3906a5 --- /dev/null +++ b/test/base_mods/MM-not-all-modded.out @@ -0,0 +1,180 @@ +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +Present: m. h. n. +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A +--- +Present: + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +Present: m. h. n. 
+6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A +--- +Present: + +=== + diff --git a/test/base_mods/MM-not-all-modded.sam b/test/base_mods/MM-not-all-modded.sam new file mode 100644 index 000000000..0858c766d --- /dev/null +++ b/test/base_mods/MM-not-all-modded.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r1b 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r2b 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index ff1f7651a..aca4cdea1 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -44,6 +44,9 @@ P MM-explicit-x.out $test_mod -x MM-explicit.sam # for sites known to be have been scanned. P MM-explicit-f.out $test_mod -f 1 MM-explicit.sam +# Ensure state gets reset correctly between reads +P MM-not-all-modded.out $test_mod MM-not-all-modded.sam + # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam P MM-pileup2.out $pileup_mod < MM-pileup2.sam From 5dc826f0bbe3b462f1fdcefb8fcfa6830ee7ecc9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 13 Jul 2023 14:14:46 +0100 Subject: [PATCH 59/70] Fix to 2e672f33 decompress_peek_gz change. The "zs.total_out < destsize" should have been "zs.avail_out" to be more robust to total_out being reset by inflateReset. 
However looking again neither the avail_in nor avail_out checks are necessary, as once we hit the end of either input or output buffer the next cycle triggers ret == Z_BUF_ERROR and we drop out as normal. Thanks to John Marshall for the spot. --- hts.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hts.c b/hts.c index 5ce425212..9683948b0 100644 --- a/hts.c +++ b/hts.c @@ -326,21 +326,21 @@ decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) int ret; unsigned char *last_in = buffer; - while (zs.total_out < destsize) { + while (zs.avail_out > 0) { ret = inflate(&zs, Z_SYNC_FLUSH); - if (ret == Z_STREAM_END && zs.avail_in && zs.total_out < destsize) { + if (ret == Z_STREAM_END) { if (last_in == zs.next_in) - break; // paranoia to avoid potential looping + break; // Paranoia to avoid potential looping. Shouldn't happen else last_in = zs.next_in; inflateReset(&zs); - continue; - } - if (ret != Z_OK) + } else if (ret != Z_OK) { + // eg Z_BUF_ERROR due to avail_in/out becoming zero break; + } } - // zs.total_out can sometimes be wrong as inflateReset resets it + // NB: zs.total_out is changed by inflateReset, so use pointer diff instead destsize = zs.next_out - dest; inflateEnd(&zs); From 85d44d0c7f8edc4e0480654ef42790b7a20d59e8 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 14 Jul 2023 09:06:48 +0100 Subject: [PATCH 60/70] Ensure simple_test_driver.sh cleans up its temporary files --- test/simple_test_driver.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/simple_test_driver.sh b/test/simple_test_driver.sh index 9ab412511..5bc020eb6 100644 --- a/test/simple_test_driver.sh +++ b/test/simple_test_driver.sh @@ -51,6 +51,7 @@ run_test() { else # Expected non-zero exit code and got it r="P" + rm -f _out.tmp _err.tmp fi elif [ "$p" = "N" ] then @@ -69,11 +70,12 @@ run_test() { # Output differed r="F" y="output" + rm -f _out.tmp2 fi else # Expected zero exit code and got it.
r="P" - rm -f _out.tmp _out.tmp2 _err.tmp + rm -f _out.tmp _err.tmp fi if [ "$r" = "F" ] @@ -107,6 +109,7 @@ run_test() { ;; *) echo "XFAIL: $@" + rm -f _out.tmp _err.tmp nefail=`expr $nefail + 1` ;; esac From 64ae397c16c48274f25b4ea5a30ce65410ca32af Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 14 Jul 2023 09:12:21 +0100 Subject: [PATCH 61/70] Ensure base mod test result is noticed by the Makefile * Remove line from base-mods.sh which caused it to always return success. * Explicitly check for non-zero return from base mod tests that are expected to work that way. --- test/base_mods/base-mods.sh | 1 - test/base_mods/base-mods.tst | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/base_mods/base-mods.sh b/test/base_mods/base-mods.sh index 388ff369e..f3f3ca4b7 100755 --- a/test/base_mods/base-mods.sh +++ b/test/base_mods/base-mods.sh @@ -31,6 +31,5 @@ test_mod="../test_mod" pileup_mod="../pileup_mod" test_driver $@ -rm _err.tmp _out.tmp exit $? diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index aca4cdea1..55d2d0034 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -23,6 +23,7 @@ # First field: # INIT = initialisation, not counted in testing # P = expected to pass +# N = expected to return non-zero # F = expected to fail # Second field: @@ -54,5 +55,5 @@ P MM-pileup2.out $pileup_mod < MM-pileup2.sam # Validation testing. We just care about exit status here, but the # test data is a copy of MM-pileup.sam so that suffices too. 
P MM-pileup.out $pileup_mod < MM-MNp.sam -F MM-pileup.out $pileup_mod < MM-MNf1.sam -F MM-pileup.out $pileup_mod < MM-MNf2.sam +N MM-pileup.out $pileup_mod < MM-MNf1.sam +N MM-pileup.out $pileup_mod < MM-MNf2.sam From b9e33b7be467614bd4601d162292afdbf541d109 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 14 Jul 2023 09:15:59 +0100 Subject: [PATCH 62/70] Improve test/test_mod.c Stop test/test_mod from writing trailing whitespace, to make committing the outputs easier for developers using pre-commit checks for whitespace issues. Remove array `out`, as its contents were never used, or even set due to conditionals on argc. This also allows the output to be written directly to stdout instead of going through an intermediate buffer. --- test/base_mods/MM-chebi.out | 86 +++---- test/base_mods/MM-double.out | 86 +++---- test/base_mods/MM-explicit-f.out | 192 ++++++++-------- test/base_mods/MM-explicit-x.out | 176 +++++++-------- test/base_mods/MM-explicit.out | 176 +++++++-------- test/base_mods/MM-multi.out | 174 +++++++-------- test/base_mods/MM-not-all-modded.out | 320 +++++++++++++-------------- test/test_mod.c | 67 +++--- 8 files changed, 633 insertions(+), 644 deletions(-) diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index 8df8130df..89970ddf9 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -1,48 +1,48 @@ -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m102 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n212 -16 A -17 C C+m128 -18 G -19 C C+(76792)161 -20 C C+m153 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m179 -32 A -33 C -34 C C+m204 C+(76792)33 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m102 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n212 +16 A +17 C C+m128 +18 G +19 C C+(76792)161 +20 C C+m153 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 +32 A +33 C +34 C C+m204 C+(76792)33 +35 A --- Present: m. #-76792. n.
-6 C C+m102 -15 N N+n212 -17 C C+m128 -19 C C+(76792)161 -20 C C+m153 -31 C C+m179 -34 C C+m204 C+(76792)33 +6 C C+m102 +15 N N+n212 +17 C C+m128 +19 C C+(76792)161 +20 C C+m153 +31 C C+m179 +34 C C+m204 C+(76792)33 === diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out index e346192b8..431dfff07 100644 --- a/test/base_mods/MM-double.out +++ b/test/base_mods/MM-double.out @@ -1,48 +1,48 @@ -0 A -1 G G-m115 -2 G -3 A -4 T -5 C -6 T -7 C C+m128 -8 T -9 A -10 G -11 C -12 G G-m141 -13 G G-m166 G+o102 -14 A -15 T -16 C -17 G -18 G -19 C -20 G -21 G -22 G G-m192 -23 G -24 G -25 A -26 T -27 A -28 T -29 G -30 C C+m153 -31 C C+m179 -32 A -33 T -34 A -35 T +0 A +1 G G-m115 +2 G +3 A +4 T +5 C +6 T +7 C C+m128 +8 T +9 A +10 G +11 C +12 G G-m141 +13 G G-m166 G+o102 +14 A +15 T +16 C +17 G +18 G +19 C +20 G +21 G +22 G G-m192 +23 G +24 G +25 A +26 T +27 A +28 T +29 G +30 C C+m153 +31 C C+m179 +32 A +33 T +34 A +35 T --- Present: m. m. o. -1 G G-m115 -7 C C+m128 -12 G G-m141 -13 G G-m166 G+o102 -22 G G-m192 -30 C C+m153 -31 C C+m179 +1 G G-m115 +7 C C+m128 +12 G G-m141 +13 G G-m166 G+o102 +22 G G-m192 +30 C C+m153 +31 C C+m179 === diff --git a/test/base_mods/MM-explicit-f.out b/test/base_mods/MM-explicit-f.out index 6462c99b9..0f7326cd8 100644 --- a/test/base_mods/MM-explicit-f.out +++ b/test/base_mods/MM-explicit-f.out @@ -1,111 +1,111 @@ -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C -14 C C+m160 C+h20 -15 G -16 C -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h. 
-9 C C+m200 C+h10 -10 C C+m50 C+h170 -14 C C+m160 C+h20 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 === -0 A -1 T -2 C C+m# C+h# -3 A -4 T -5 C C+m# C+h# -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C C+m10 C+h5 -14 C C+m160 C+h20 -15 G -16 C C+m10 C+h5 -17 T -18 A -19 T -20 A -21 G -22 C C+m# C+h# -23 C C+m# C+h# -24 T +0 A +1 T +2 C C+m# C+h# +3 A +4 T +5 C C+m# C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+m# C+h# +23 C C+m# C+h# +24 T --- Present: m? h? -2 C C+m# C+h# -5 C C+m# C+h# -9 C C+m200 C+h10 -10 C C+m50 C+h170 -13 C C+m10 C+h5 -14 C C+m160 C+h20 -16 C C+m10 C+h5 -22 C C+m# C+h# -23 C C+m# C+h# +2 C C+m# C+h# +5 C C+m# C+h# +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 +22 C C+m# C+h# +23 C C+m# C+h# === -0 A -1 T -2 C C+h# -3 A -4 T -5 C C+h# -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+h170 -11 T -12 A -13 C C+h5 -14 C C+m160 C+h20 -15 G -16 C C+h5 -17 T -18 A -19 T -20 A -21 G -22 C C+h# -23 C C+h# -24 T +0 A +1 T +2 C C+h# +3 A +4 T +5 C C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+h# +23 C C+h# +24 T --- Present: m. h? 
-2 C C+h# -5 C C+h# -9 C C+m200 C+h10 -10 C C+h170 -13 C C+h5 -14 C C+m160 C+h20 -16 C C+h5 -22 C C+h# -23 C C+h# +2 C C+h# +5 C C+h# +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 +22 C C+h# +23 C C+h# === diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out index 4078543ca..8acfbf2fe 100644 --- a/test/base_mods/MM-explicit-x.out +++ b/test/base_mods/MM-explicit-x.out @@ -1,103 +1,103 @@ -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m.200 C+h.10 -10 C C+m.50 C+h.170 -11 T -12 A -13 C -14 C C+m.160 C+h.20 -15 G -16 C -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h.10 +10 C C+m.50 C+h.170 +11 T +12 A +13 C +14 C C+m.160 C+h.20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h. -9 C C+m200 C+h10 -10 C C+m50 C+h170 -14 C C+m160 C+h20 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m?200 C+h?10 -10 C C+m?50 C+h?170 -11 T -12 A -13 C C+m?10 C+h?5 -14 C C+m?160 C+h?20 -15 G -16 C C+m?10 C+h?5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m?200 C+h?10 +10 C C+m?50 C+h?170 +11 T +12 A +13 C C+m?10 C+h?5 +14 C C+m?160 C+h?20 +15 G +16 C C+m?10 C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m? h? -9 C C+m200 C+h10 -10 C C+m50 C+h170 -13 C C+m10 C+h5 -14 C C+m160 C+h20 -16 C C+m10 C+h5 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m.200 C+h?10 -10 C C+h?170 -11 T -12 A -13 C C+h?5 -14 C C+m.160 C+h?20 -15 G -16 C C+h?5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h?10 +10 C C+h?170 +11 T +12 A +13 C C+h?5 +14 C C+m.160 C+h?20 +15 G +16 C C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h? 
-9 C C+m200 C+h10 -10 C C+h170 -13 C C+h5 -14 C C+m160 C+h20 -16 C C+h5 +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 === diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out index 186e790ae..0f3701fcd 100644 --- a/test/base_mods/MM-explicit.out +++ b/test/base_mods/MM-explicit.out @@ -1,103 +1,103 @@ -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C -14 C C+m160 C+h20 -15 G -16 C -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h. -9 C C+m200 C+h10 -10 C C+m50 C+h170 -14 C C+m160 C+h20 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C C+m10 C+h5 -14 C C+m160 C+h20 -15 G -16 C C+m10 C+h5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m? h? -9 C C+m200 C+h10 -10 C C+m50 C+h170 -13 C C+m10 C+h5 -14 C C+m160 C+h20 -16 C C+m10 C+h5 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+h170 -11 T -12 A -13 C C+h5 -14 C C+m160 C+h20 -15 G -16 C C+h5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h? 
-9 C C+m200 C+h10 -10 C C+h170 -13 C C+h5 -14 C C+m160 C+h20 -16 C C+h5 +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 === diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out index 73b480b80..41054a7c0 100644 --- a/test/base_mods/MM-multi.out +++ b/test/base_mods/MM-multi.out @@ -1,97 +1,97 @@ -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m128 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n215 -16 A -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m204 -32 A -33 C -34 C C+m230 C+h6 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A --- Present: m. h. n. -6 C C+m128 -15 N N+n215 -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -31 C C+m204 -34 C C+m230 C+h6 +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 === -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m77 C+h159 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n240 -16 A -17 C C+m103 C+h133 -18 G -19 C C+m128 C+h108 -20 C C+m154 C+h82 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m179 C+h57 -32 A -33 C -34 C C+m204 C+h31 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m77 C+h159 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n240 +16 A +17 C C+m103 C+h133 +18 G +19 C C+m128 C+h108 +20 C C+m154 C+h82 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 C+h57 +32 A +33 C +34 C C+m204 C+h31 +35 A --- Present: m. h. n. 
-6 C C+m77 C+h159 -15 N N+n240 -17 C C+m103 C+h133 -19 C C+m128 C+h108 -20 C C+m154 C+h82 -31 C C+m179 C+h57 -34 C C+m204 C+h31 +6 C C+m77 C+h159 +15 N N+n240 +17 C C+m103 C+h133 +19 C C+m128 C+h108 +20 C C+m154 C+h82 +31 C C+m179 C+h57 +34 C C+m204 C+h31 === diff --git a/test/base_mods/MM-not-all-modded.out b/test/base_mods/MM-not-all-modded.out index f7e3906a5..64fc847e2 100644 --- a/test/base_mods/MM-not-all-modded.out +++ b/test/base_mods/MM-not-all-modded.out @@ -1,178 +1,178 @@ -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m128 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n215 -16 A -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m204 -32 A -33 C -34 C C+m230 C+h6 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A --- Present: m. h. n. 
-6 C C+m128 -15 N N+n215 -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -31 C C+m204 -34 C C+m230 C+h6 +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 === -0 A -1 G -2 C -3 T -4 C -5 T -6 C -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N -16 A -17 C -18 G -19 C -20 C -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C -32 A -33 C -34 C -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A --- Present: === -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m128 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n215 -16 A -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m204 -32 A -33 C -34 C C+m230 C+h6 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A --- Present: m. h. n. 
-6 C C+m128 -15 N N+n215 -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -31 C C+m204 -34 C C+m230 C+h6 +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 === -0 A -1 G -2 C -3 T -4 C -5 T -6 C -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N -16 A -17 C -18 G -19 C -20 C -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C -32 A -33 C -34 C -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A --- Present: diff --git a/test/test_mod.c b/test/test_mod.c index e59da4827..89fb54243 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -86,7 +86,6 @@ static char *code(int id) { } int main(int argc, char **argv) { - char out[1024] = {0}; int extended = 0; uint32_t flags = 0; @@ -126,10 +125,9 @@ int main(int argc, char **argv) { int i, j, n; hts_base_mod mods[5]; for (i = 0; i < b->core.l_qseq; i++) { - char line[8192], *lp = line, *ep = line + sizeof(line); + char sp = '\t'; n = bam_mods_at_next_pos(b, m, mods, 5); - lp += snprintf(lp, ep - lp, "%d\t%c\t", - i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); + printf("%d\t%c", i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); for (j = 0; j < n && j < 5; j++) { char qstr[10]; if (mods[j].qual == HTS_MOD_UNCHECKED) @@ -149,30 +147,25 @@ int main(int argc, char **argv) { m_canonical != mods[j].canonical_base || m_strand != mods[j].strand) goto err; - lp += snprintf(lp, ep - lp, "%c%c%s%c%s ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - "?."[m_implicit], - qstr); + printf("%c%c%c%s%c%s", + sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + qstr); } else { - lp += snprintf(lp, ep - lp, "%c%c%s%s ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - qstr); + printf("%c%c%c%s%s", 
+ sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + qstr); } + sp = ' '; } - *lp++ = '\n'; - *lp++ = 0; - - if (argc > 1) - printf("%s", line); - else - strcat(out, line); + putchar('\n'); } - if (argc > 1) puts("---"); + puts("---"); bam_parse_basemod2(b, m, flags); @@ -192,9 +185,9 @@ int main(int argc, char **argv) { int pos; while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { - char line[8192]={0}, *lp = line, *ep = line + sizeof(line); - lp += snprintf(lp, ep - lp, "%d\t%c\t", pos, - seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); + char sp = '\t'; + printf("%d\t%c", pos, + seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); for (j = 0; j < n && j < 5; j++) { char qstr[10]; if (mods[j].qual == HTS_MOD_UNCHECKED) @@ -204,24 +197,20 @@ int main(int argc, char **argv) { else snprintf(qstr, 10, "%d", mods[j].qual); - lp += snprintf(lp, ep - lp, "%c%c%s%s ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - qstr); + printf("%c%c%c%s%s", + sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + qstr); + sp = ' '; } - *lp++ = '\n'; - *lp++ = 0; - - if (argc > 1) - printf("%s", line); - else - strcat(out, line); + putchar('\n'); } + if (n < 0) goto err; - if (argc > 1) puts("\n===\n"); + puts("\n===\n"); } fflush(stdout); if (sam_close(in) != 0 || r < -1) From 27007583af8884582b57a23d5aa43a5665928de1 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 19 Jul 2023 23:53:23 +1200 Subject: [PATCH 63/70] Add missing Makefile dependencies [minor] --- Makefile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 83b49609d..4ed5f1086 100644 --- a/Makefile +++ b/Makefile @@ -478,7 +478,7 @@ textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstr cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) 
$(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h) -cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) +cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htscodecs_rANS_static4x16_h) $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) @@ -735,17 +735,17 @@ htscodecs/tests/varint: htscodecs/tests/varint_test.o $(HTSCODECS_OBJS) $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/arith_dynamic_test.o: htscodecs/tests/arith_dynamic_test.c $(htscodecs_arith_dynamic_h) +htscodecs/tests/arith_dynamic_test.o: htscodecs/tests/arith_dynamic_test.c config.h $(htscodecs_arith_dynamic_h) htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) +htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/rANS_static4x16pr_test.o: 
htscodecs/tests/rANS_static4x16pr_test.c $(htscodecs_rANS_static4x16_h) +htscodecs/tests/rANS_static4x16pr_test.o: htscodecs/tests/rANS_static4x16pr_test.c config.h $(htscodecs_rANS_static4x16_h) htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c $(htscodecs_rANS_static_h) +htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c config.h $(htscodecs_rANS_static_h) htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c $(htscodecs_tokenise_name3_h) +htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c config.h $(htscodecs_tokenise_name3_h) htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c $(htscodecs_varint_h) +htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c config.h $(htscodecs_varint_h) test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) @@ -755,7 +755,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) test/pileup_mod.o: test/pileup_mod.c config.h $(htslib_sam_h) test/plugins-dlhts.o: test/plugins-dlhts.c config.h test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_hts_log_h) -test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) +test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h) test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) @@ -770,7 +770,7 @@ test/test_faidx.o: test/test_faidx.c 
config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) -test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) +test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) $(htslib_hts_h) $(htslib_vcf_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c From 6285a68f81464a5b96639034cf6aa2c49069f920 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 20 Jul 2023 00:13:55 +1200 Subject: [PATCH 64/70] Make last_in a pointer to const [minor] Zlib can be configured such that zs.next_in is a pointer to const. 
--- hts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hts.c b/hts.c index 9683948b0..b7b528a61 100644 --- a/hts.c +++ b/hts.c @@ -325,7 +325,7 @@ decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) if (inflateInit2(&zs, 31) != Z_OK) return -1; int ret; - unsigned char *last_in = buffer; + const unsigned char *last_in = buffer; while (zs.avail_out > 0) { ret = inflate(&zs, Z_SYNC_FLUSH); if (ret == Z_STREAM_END) { From 89141250d1c5262b2cfc5d41b05d1a30f2c7092e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 20 Jul 2023 11:42:58 +0100 Subject: [PATCH 65/70] Switch to htscodecs 1.5.1 * Trivial bug fix to tests/r4x16pr demo * Updates for 1.5.1 release --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 109f06949..11b5007ff 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 109f069490fca15d85e2d261822c15bc3080db8a +Subproject commit 11b5007ffb68bea9f6c777874a215e4187ce659a From 2051536d4ccc99690db292bd9858f2a455e0fba8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 13 Jul 2023 09:52:07 +0100 Subject: [PATCH 66/70] Add fai_thread_pool interface. This is a simple skim around bgzf_thread_pool. Also added a missing @param qsize to the bgzf_thread_pool documentation. Fixes #1638 --- faidx.c | 5 +++++ htslib/bgzf.h | 2 ++ htslib/faidx.h | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/faidx.c b/faidx.c index 2eb0f3edc..5dd4bf1c0 100644 --- a/faidx.c +++ b/faidx.c @@ -975,6 +975,11 @@ void fai_set_cache_size(faidx_t *fai, int cache_size) { bgzf_set_cache_size(fai->bgzf, cache_size); } +// Adds a thread pool to the underlying BGZF layer. 
+int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) { + return bgzf_thread_pool(fai->bgzf, pool, qsize); +} + char *fai_path(const char *fa) { char *fai = NULL; if (!fa) { diff --git a/htslib/bgzf.h b/htslib/bgzf.h index cb789ad53..97788a6fe 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -322,6 +322,8 @@ typedef struct BGZF BGZF; * * @param fp BGZF file handler * @param pool The thread pool (see hts_create_threads) + * @param qsize The size of the job queue. If 0 this is twice the + * number of threads in the pool. */ HTSLIB_EXPORT int bgzf_thread_pool(BGZF *fp, struct hts_tpool *pool, int qsize); diff --git a/htslib/faidx.h b/htslib/faidx.h index c1b3090a5..c3a4c95f6 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -70,6 +70,9 @@ struct faidx_t; /// Opaque structure representing FASTA index typedef struct faidx_t faidx_t; +/// Opaque structure; sole item needed from htslib/thread_pool.h +struct hts_tpool; + /// File format to be dealing with. enum fai_format_options { FAI_NONE, @@ -357,6 +360,15 @@ int fai_adjust_region(const faidx_t *fai, int tid, HTSLIB_EXPORT void fai_set_cache_size(faidx_t *fai, int cache_size); +/// Adds a thread pool to the underlying BGZF layer. +/** @param fai FAI file handler + * @param pool The thread pool (see hts_create_threads) + * @param qsize The size of the job queue. If 0 this is twice the + * number of threads in the pool. + */ +HTSLIB_EXPORT +int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize); + /// Determines the path to the reference index file /** @param fa String with the path to the reference file * @return String with the path to the reference index file, or NULL on failure From 4dbb9913f7d535b85383c4257df8b1a7301bfbd8 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 20 Jul 2023 16:01:23 +0100 Subject: [PATCH 67/70] Summer 2023 copyright update. 
--- bgzf.c | 2 +- configure.ac | 2 +- cram/cram_decode.c | 2 +- cram/cram_index.c | 2 +- cram/cram_structs.h | 2 +- hfile_s3.c | 2 +- hts_probe_cc.sh | 2 +- htslib/bgzf.h | 2 +- htslib/faidx.h | 2 +- htslib/kseq.h | 2 +- htslib/vcf.h | 2 +- probaln.c | 2 +- tbx.c | 2 +- test/base_mods/base-mods.tst | 2 +- test/test-bcf-sr.c | 2 +- test/test-bcf-sr.pl | 2 +- test/test-regidx.c | 2 +- test/test-vcf-api.c | 2 +- test/test.pl | 2 +- test/test_bgzf.c | 2 +- test/test_mod.c | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git a/bgzf.c b/bgzf.c index 468289106..45f2b1150 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2,7 +2,7 @@ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013-2021 Genome Research Ltd + Copyright (C) 2009, 2013-2022 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/configure.ac b/configure.ac index 3230f3935..c1afb38e7 100644 --- a/configure.ac +++ b/configure.ac @@ -35,7 +35,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2020-2021 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020-2023 Genome Research Ltd. This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 47b7ed076..26c7c1fac 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_index.c b/cram/cram_index.c index 39bc7cae0..b775e9431 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2013-2020 Genome Research Ltd. +Copyright (c) 2013-2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 15b7f145b..160663392 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2016, 2018-2020 Genome Research Ltd. +Copyright (c) 2012-2016, 2018-2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/hfile_s3.c b/hfile_s3.c index 2ce7feb4b..e2718f656 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1,6 +1,6 @@ /* hfile_s3.c -- Amazon S3 backend for low-level file streams. - Copyright (C) 2015-2017, 2019-2022 Genome Research Ltd. + Copyright (C) 2015-2017, 2019-2023 Genome Research Ltd. Author: John Marshall diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh index 5e5ddec1e..71c6f5d01 100755 --- a/hts_probe_cc.sh +++ b/hts_probe_cc.sh @@ -2,7 +2,7 @@ # Check compiler options for non-configure builds and create Makefile fragment # -# Copyright (C) 2022 Genome Research Ltd. +# Copyright (C) 2022-2023 Genome Research Ltd. 
# # Author: Rob Davies # diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 97788a6fe..ea4ec3ece 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -3,7 +3,7 @@ /* Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022 Genome Research Ltd + Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2023 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/htslib/faidx.h b/htslib/faidx.h index c3a4c95f6..4351b3fbe 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -1,7 +1,7 @@ /// @file htslib/faidx.h /// FASTA random access. /* - Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022-2023 Genome Research Ltd. Author: Heng Li diff --git a/htslib/kseq.h b/htslib/kseq.h index 5d573d3d9..ea887f14c 100644 --- a/htslib/kseq.h +++ b/htslib/kseq.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos - Copyright (C) 2013, 2018, 2020 Genome Research Ltd. + Copyright (C) 2013, 2018, 2020, 2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/htslib/vcf.h b/htslib/vcf.h index 8bbf480c8..83659ae12 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020, 2022 Genome Research Ltd. + Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd. Author: Heng Li diff --git a/probaln.c b/probaln.c index c841c7522..b42f85685 100644 --- a/probaln.c +++ b/probaln.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2003-2006, 2008-2010 by Heng Li - Copyright (C) 2016-2017, 2020 Genome Research Ltd. 
+ Copyright (C) 2016-2017, 2020, 2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/tbx.c b/tbx.c index 154e2a81b..c2c5c6f9d 100644 --- a/tbx.c +++ b/tbx.c @@ -1,6 +1,6 @@ /* tbx.c -- tabix API functions. - Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022 Genome Research Ltd. + Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023 Genome Research Ltd. Copyright (C) 2010-2012 Broad Institute. Author: Heng Li diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 55d2d0034..889c3780e 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Genome Research Ltd. +# Copyright (C) 2020, 2023 Genome Research Ltd. # # Author: James Bonfield # diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index e91af7f28..80daf0423 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017, 2020 Genome Research Ltd. + Copyright (C) 2017, 2020, 2023 Genome Research Ltd. Author: Petr Danecek diff --git a/test/test-bcf-sr.pl b/test/test-bcf-sr.pl index 2e290cb3a..5c32e00f4 100755 --- a/test/test-bcf-sr.pl +++ b/test/test-bcf-sr.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # test-bcf-sr.pl -- Test bcf synced reader's allele pairing # -# Copyright (C) 2017-2018, 2020 Genome Research Ltd. +# Copyright (C) 2017-2018, 2020, 2023 Genome Research Ltd. # # Author: petr.danecek@sanger # diff --git a/test/test-regidx.c b/test/test-regidx.c index 4cad440c7..4ba623734 100644 --- a/test/test-regidx.c +++ b/test/test-regidx.c @@ -2,7 +2,7 @@ gcc -g -Wall -O0 -I. -I../htslib/ -L../htslib regidx.c -o test-regidx test-regidx.c -lhts - Copyright (C) 2014,2016,2018, 2020 Genome Research Ltd. + Copyright (C) 2014,2016,2018, 2020, 2023 Genome Research Ltd. 
Author: Petr Danecek diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index f016f1b3f..eff653686 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -1,6 +1,6 @@ /* test/test-vcf-api.c -- VCF test harness. - Copyright (C) 2013, 2014, 2017-2021 Genome Research Ltd. + Copyright (C) 2013, 2014, 2017-2021, 2023 Genome Research Ltd. Author: Petr Danecek diff --git a/test/test.pl b/test/test.pl index f8e94faa3..566e7cfce 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2022 Genome Research Ltd. +# Copyright (C) 2012-2023 Genome Research Ltd. # # Author: Petr Danecek # diff --git a/test/test_bgzf.c b/test/test_bgzf.c index a5084e6c6..6cb6db902 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -1,6 +1,6 @@ /* test/test_bgzf.c -- bgzf unit tests - Copyright (C) 2017, 2019 Genome Research Ltd + Copyright (C) 2017, 2019, 2022-2023 Genome Research Ltd Author: Robert Davies diff --git a/test/test_mod.c b/test/test_mod.c index 89fb54243..d8a53f3de 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -1,6 +1,6 @@ /* test/test_mod.c -- testing of base modification functions - Copyright (C) 2020-2021 Genome Research Ltd. + Copyright (C) 2020-2021, 2023 Genome Research Ltd. Author: James Bonfield From d06b5988058e49301178d1784b3dc077bde34000 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Fri, 3 Mar 2023 11:55:32 +0000 Subject: [PATCH 68/70] Demonstration of htslib/sam api usage. The README.md file is a basic introduction to the sample code in this directory, while DEMO.md has more detailed information per program and API calls introduced. This is a work in progress and updated documentation may be available on www.htslib.org. 
--- samples/DEMO.md | 1432 +++++++++++++++++++++++++++++++++ samples/Makefile | 106 +++ samples/README.md | 229 ++++++ samples/add_header.c | 128 +++ samples/cram.c | 168 ++++ samples/dump_aux.c | 188 +++++ samples/flags_demo.c | 110 +++ samples/flags_htsopt_field.c | 115 +++ samples/index_multireg_read.c | 150 ++++ samples/index_reg_read.c | 143 ++++ samples/index_write.c | 166 ++++ samples/mod_aux.c | 221 +++++ samples/mod_aux_ba.c | 147 ++++ samples/mod_bam.c | 229 ++++++ samples/modstate.c | 190 +++++ samples/mpileup.c | 204 +++++ samples/pileup.c | 183 +++++ samples/pileup_mod.c | 218 +++++ samples/read_aux.c | 207 +++++ samples/read_bam.c | 139 ++++ samples/read_fast.c | 116 +++ samples/read_header.c | 173 ++++ samples/read_refname.c | 125 +++ samples/rem_header.c | 138 ++++ samples/sample.ref.fa | 4 + samples/sample.sam | 29 + samples/split.c | 153 ++++ samples/split2.c | 158 ++++ samples/split_thread1.c | 161 ++++ samples/split_thread2.c | 171 ++++ samples/update_header.c | 131 +++ samples/write_fast.c | 101 +++ 32 files changed, 6133 insertions(+) create mode 100644 samples/DEMO.md create mode 100644 samples/Makefile create mode 100644 samples/README.md create mode 100644 samples/add_header.c create mode 100644 samples/cram.c create mode 100644 samples/dump_aux.c create mode 100644 samples/flags_demo.c create mode 100644 samples/flags_htsopt_field.c create mode 100644 samples/index_multireg_read.c create mode 100644 samples/index_reg_read.c create mode 100644 samples/index_write.c create mode 100644 samples/mod_aux.c create mode 100644 samples/mod_aux_ba.c create mode 100644 samples/mod_bam.c create mode 100644 samples/modstate.c create mode 100644 samples/mpileup.c create mode 100644 samples/pileup.c create mode 100644 samples/pileup_mod.c create mode 100644 samples/read_aux.c create mode 100644 samples/read_bam.c create mode 100644 samples/read_fast.c create mode 100644 samples/read_header.c create mode 100644 samples/read_refname.c create mode 100644 
samples/rem_header.c create mode 100644 samples/sample.ref.fa create mode 100644 samples/sample.sam create mode 100644 samples/split.c create mode 100644 samples/split2.c create mode 100644 samples/split_thread1.c create mode 100644 samples/split_thread2.c create mode 100644 samples/update_header.c create mode 100644 samples/write_fast.c diff --git a/samples/DEMO.md b/samples/DEMO.md new file mode 100644 index 000000000..1f14b7485 --- /dev/null +++ b/samples/DEMO.md @@ -0,0 +1,1432 @@ +# HTS API + +## HTSLib APIs and samtools + +HTSLib is a C library implementation used to access and process the genome +sequence data. HTSLib implements multiple API interfaces, HTS API, VCF API and +SAM API. HTS API provides a framework for use by other APIs and applications, +implements bgzf compression, htscodecs and provides CRAM format support. VCF +APIs work with variant data in VCF and BCF format. + +SAM API works with sequence data of different formats, SAM / BAM / CRAM / +FASTA / FASTQ, and provides methods to do operations on the data. It uses +methods from HTS API. + +'samtools' is the utility used to read and modify sequence data. It uses SAM +APIs from HTSLib to work on the sequence data. + + +## About this document + +There are a number of demonstration utilities and their source code in +'samples' directory of HTSLib and this document gives the description of them +and the usage of API of HTSLib. The samples are for demonstration +purposes only and proper error handling is required for actual usage. This +document is based on HTSLib version 1.17. + +Updates to this document may be made along with later releases when required. + + +## The sample apps + +Flags - This application showcases the basic read of alignment files and flag +access. It reads and shows the count of read1 and read2 alignments. + +Split - This application showcases the basic read and write of alignment data. 
+It saves the read1 and read2 as separate files in given directory, one as sam +and other as bam. + +Split2 - This application showcases the output file format selection. It saves +the read1 and read2 as separate files in given directory, both as compressed +sam though the extensions are different. + +Cram - This application showcases the different way in which cram reference +data is used for cram output creation. + +Read_fast - This application showcases the fasta/fastq data read. + +Read_header - This application showcases the read and access of header data. +It can show all header lines of given type, data of a given tag on a specific +header line or for all lines of given type. + +Read_ref - This application showcases the read and access of header data. +It shows all reference names which have length equal to or greater than the given input. + +Read_bam - This application showcases read of different alignment data fields. +It shows contents of each alignment. + +Read_aux - This application showcases read of specific auxiliary tag data in +alignment. It shows the data retrieved using 2 APIs, one as a string with tag +data and other as raw data alternatively. + +Dump_aux - This application showcases read of all auxiliary tag data one by one +in an alignment. It shows the data retrieved. + +Add_header - This application showcases the write of header lines to a file. +It adds header line of types, SQ, RG, PG and CO and writes to standard output. + +Remove_header - This application showcases removal of header line from a file. +It removes either all header lines of given type or one specific line of given +type with given unique identifier. Modified header is written on standard +output. + +Update_header - This application shows the update of header line fields, where +update is allowed. It takes the header line type, unique identifier for the +line, tag to be modified and the new value. Updated data is written on standard +output.
+ +Mod_bam - This application showcases the update of alignment data. It takes +alignment name, position of field to be modified and new value of it. +Modified data is written on standard output. + +Mod_aux - This application showcases the update of auxiliary data in alignment. +It takes alignment name, tag to be modified, its type and new value. Modified +data is written on standard output. + +Mod_aux_ba - This application showcases the update of auxiliary array data in +alignment. It adds count of ATCGN base as an array in auxiliary data, BA:I. +Modified data is written on standard output. + +Write_fast - This application showcases the fasta/fastq data write. It appends +a dummy data to given file. + +Index_write - This application showcases the creation of index along with +output creation. Based on file type and shift, it creates bai, csi or crai +files. + +Read_reg - This application showcases the usage of region specification in +alignment read. + +Read_multireg - This application showcases the usage of multiple region +specification in alignment read. + +Pileup - This application showcases the pileup api, where all alignments +covering a reference position are accessed together. It displays the bases +covering each position on standard output. + +Mpileup - This application showcases the mpileup api, which supports multiple +input files for pileup and gives a side by side view of them in pileup format. +It displays the bases covering each position on standard output. + +Modstate - This application showcases the access of base modifications in +alignment. It shows the modifications present in an alignment and accesses them +using available APIs. There are 2 APIs and which one to be used can be selected +through input. + +Pileup_mod - This application showcases the base modification access in pileup +mode. It shows the pileup display with base modifications.
+ +Flags_field - This application showcases the read of selected fields alone, +reducing the overhead / increasing the performance. It reads the flag field +alone and shows the count of read1 and read2. This has impact only on CRAM +files. + +Split_thread1 - This application showcases the use of threads in file handling. +It saves the read1 and read2 as separate files in given directory, one as sam +and other as bam. 2 threads are used for read and 1 each dedicated for each +output file. + +Split_thread2 - This application showcases the use of thread pool in file +handling. It saves the read1 and read2 as separate files in given directory, +one as sam and other as bam. A pool of 4 threads is created and shared for both +read and write. + + +## Building the sample apps + +The samples expect the HTSLib is installed, libraries and header file path are +part of the PATH environment variable. If not, these paths need to be explicitly +passed during the build time. + +Gcc and compatible compilers can be used to build the samples. + +These applications can be linked statically or dynamically to HTSLib. +For static linking, along with htslib other libraries and/or headers required +to build are, math, pthread, curl, lzma, z and bz2 libraries. + +A makefile is available along with source files which links statically to +htslib. To use dynamic linking, update the makefile's 'LDFLAGS' and 'rpath' +path. The 'rpath' path to be set as the path to lib directory of htslib +installation. + + +## Usage of HTS APIs +### Sequence data file access for read + +The sequence data file for read may be opened using the sam_open method. It +opens the file and returns samFile (htsFile) pointer on success or NULL on +failure. The input can be path to a file in disk, network, cloud or '-' +designating the standard input. + +SAM, BAM and CRAM file formats are supported and the input file format is +detected from the file content. + +Once done with the file, it needs to be closed with sam_close. 
+ +Many times, header details would be required and can be read using +sam_hdr_read api. It returns sam_hdr_t pointer or NULL. The returned header +needs to be destroyed using sam_hdr_destroy when no longer required. + +The sequence data may be compressed or uncompressed on disk and on memory it +is read and kept as uncompressed BAM format. It can be read from a file using +sam_read1 api. samFile pointer, header and bam storage are to be passed as +argument and it returns 0 on success, -1 on end of file and < -1 in case of +errors. + +The bam storage has to be initialised using bam_init1 api before the call and +can be reused for successive reads. Once done, it needs to be destroyed using +bam_destroy1. The member field named core - bam1_core_t - in bam storage, +bam1_t, has the sequence data in an easily accessible way. Using the fields +and macros, data can easily be read from it. + + #include + + int main(int argc, char *argv[]) + { + ... + //initialize + if (!(bamdata = bam_init1())) { + ... + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + ... + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + } + ... + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; + } +Refer: flags_demo.c + +This shows the count of read1 and read2 alignments. + + ./flags /tmp/sample.sam.gz + +To read CRAM files, reference data is required and if it is not available, based +on configuration, library may try to download it from external repositories. + + +### Sequence data file access for write + +File access for write is similar to read with a few additional optional steps. + +The output file can be opened using sam_open api as in read, with "w" instead +of "r" as mode. 
This opens the file for writing and uses mode to select the +output file type. "w" alone denotes SAM, "wb" denotes BAM and "wc" denotes CRAM. + +Another way is to use sam_open_mode method, which sets the output file type and +compression based on the file name and explicit textual format specification. +This method expects a buffer to append type and compression flags. Usually a +buffer with standard file open flag is used, the buffer past the flag is passed +to the method to ensure existing flags and updates from this method are present +in the same buffer without being overwritten. This method will add more flags +indicating file type and compression based on name. If explicit format detail +given, then extension is ignored and the explicit specification is used. This +updated buffer can be used with sam_open to select the file format. + +sam_open_format method may also be used to open the file for output as more +information on the output file can be specified using this. Can use +mode buffer from sam_open_mode api or explicit format structure for this. + +The header data can be written using the sam_hdr_write api. When the header +data is copied to another variable and has different lifetime, it is good to +increase the reference count of the header using sam_hdr_incr_ref and +sam_hdr_destroy called as many times as required. + +The alignment data can be written using the sam_write1 api. It takes a samFile +pointer, header pointer and the alignment data. The header data is required to +set the reference name in the alignment. It returns -ve value on error. + + int main(int argc, char *argv[]) + { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || + (sam_hdr_write(outfile2, in_samhdr) == -1)) { + ... 
+ while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + ... + } +Refer: split.c + +This creates 1.sam and 2.bam in /tmp/ containing read1 and read2 respectively. + + ./split /tmp/sample.sam.gz /tmp/ + +Below code excerpt shows sam_open_mode api usage. + + int main(int argc, char *argv[]) + { + ... + //set file open mode based on file name for 1st and as explicit for 2nd + if ((sam_open_mode(mode1+1, file1, NULL) == -1) || + (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + //open output files + outfile1 = sam_open(file1, mode1); //as compressed SAM through sam_open + outfile2 = sam_open_format(file2, mode2, NULL); //as compressed SAM through sam_open_format + ... + } +Refer: split2.c + +This creates 1.sam.gz and 2.sam in /tmp/ both having compressed data. + + ./split2 /tmp/sample.sam.gz /tmp/ + +An htsFormat structure filled appropriately can also be used to specify output +file format while using sam_open_format api. + + +### CRAM writing + +CRAM files uses reference data and compresses alignment data. A CRAM file may +be created with external reference data file - most appropriate, with embedded +reference in it or with no reference data at all. It can also be created using +an autogenerated reference, based on consensus with-in the alignment data. +The reference detail can be set to an htsFormat structure using hts_parse_format +api and used with sam_open_format api to create appropriate CRAM file. + ... + snprintf(reffmt1, size1, "cram,reference=%s", reffile); + snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile); + ... 
+ if (hts_parse_format(&fmt1, reffmt1) == -1 || //using external reference - uses the M5/UR tags to get + reference data during read + hts_parse_format(&fmt2, reffmt2) == -1 || //embed the reference internally + hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 || //embed autogenerated reference + hts_parse_format(&fmt4, "cram,no_ref=1") == -1) { //no reference data encoding at all + ... + outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2); + ... +Refer: cram.c + + +### FASTA/FASTQ data access + +FASTA/FASTQ files have the raw sequence data and the data can be read one by +one using sam_read1 or a selected range using a region. The data can be written +similar to alignment data using sam_write1 api. To write the file, format +can be set by updating mode buffer using sam_open_mode with file name +or explicit format text. This mode buffer can be used with sam_open or can be +used with sam_open_format with explicit format information in htsFormat +structure. + + ... + if (!(bamdata = bam_init1())) { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + if (infile->format.format != fasta_format && infile->format.format != fastq_format) { + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + printf("\nsequence: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); + } + if (infile->format.format == fastq_format) { + printf("\nquality: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", bam_get_qual(bamdata)[c]); + ... +Refer: read_fast.c + + ... + char mode[4] = "a"; + ... + if (sam_open_mode(mode + 1, outname, NULL) < 0) { + ... + if (!(outfile = sam_open(outname, mode))) { + ... + if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) + < 0) { + ... 
+ if (sam_write1(outfile, out_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + ... +Refer: write_fast.c + + +### Header data read + +The header gives the version, reference details, read group, change history +and comments. These data are stored inside the sam_hdr_t. Each of these +entries, except comments, have their unique identifier and it is required to +access different fields of them. The api sam_hdr_count_lines gives the count +of the specified type of header line. The value of a unique identifier to a +specific type of header line can be retrieved with sam_hdr_line_name api. The +api sam_hdr_find_tag_id and sam_hdr_find_tag_pos can get the field data from a +header line using unique identifier values or using position. The full header +line can be retrieved using sam_hdr_find_line_pos or sam_hdr_find_line_id with +position and unique identifier values respectively. + + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + ... + ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); + ... + linecnt = sam_hdr_count_lines(in_samhdr, header); + ... + ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); + ... + ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); + ... +Refer: read_header.c + +This will show the VN tag's value from HD header. + + ./read_header /tmp/sample.sam.gz HD VN + +Shows the 2nd SQ line's LN field value. + + ./read_header /tmp/sample.sam.gz SQ SN T2 LN + +Below code excerpt shows the reference names which have length above given value. + ... + linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count + ... + //iterate and check each reference's length + for (pos = 1, c = 0; c < linecnt; ++c) { + if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data) == -2)) { + ...
+ size = atoll(data.s); + if (size < minsize) { + //not required + continue; + } + if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) { + //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! + ... + printf("%d,%s,%s\n", pos, id, data.s); + ... +Refer: read_refname.c + + +### Alignment data read + +The alignment / sequence data contains many fields. Mainly the read/query +name, flags indicating the properties of the read, reference sequence name, +position in reference to which it matches, quality of the read, CIGAR string +indicating the match status, position of mate / reverse strand, name of +reference sequence to which mate matches, the insert length, base sequence, +quality value of each base and auxiliary fields. + +Header data would be required to retrieve the reference names as alignment +contains the position of the reference in the header. + +A few of the data are directly visible in bam1_t and the rest are hidden +inside data member of bam1_t and can easily be retrieved using macros. +bam_get_qname gives the name of the read, sam_hdr_tid2name gives the reference +name. bam_get_cigar retrieves the cigar operation array, which can be decoded +using bam_cigar_oplen to get count of bases to which that operation applicable +and bam_cigar_opchr to get the cigar operation. bam_seqi retrieves the base +data at a given position in alignment and it can be converted to character by +indexing the seq_nt16_str array. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE] + printf("NAME: %s\n", bam_get_qname(bamdata)); //get the query name using the macro + flags = bam_flag2str(bamdata->core.flag); //flags as string + ... + tidname = sam_hdr_tid2name(in_samhdr, bamdata->core.tid); + ... 
+ printf("MQUAL: %d\n", bamdata->core.qual); //map quality value + cigar = bam_get_cigar(bamdata); //retrieves the cigar data + for (i = 0; i < bamdata->core.n_cigar; ++i) { //no. of cigar data entries + printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i])); //the macros gives the count of operation + and the symbol of operation for given cigar entry + } + printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize); + data = bam_get_seq(bamdata); + //get the sequence data + if (bamdata->core.l_qseq != bam_cigar2qlen(bamdata->core.n_cigar, cigar)) { //checks the length with CIGAR and query + ... + for (i = 0; i < bamdata->core.l_qseq ; ++i) { //sequence length + printf("%c", seq_nt16_str[bam_seqi(data, i)]); //retrieves the base from (internal compressed) sequence data + ... + printf("%c", bam_get_qual(bamdata)[i]+33); //retrives the quality value + ... +Refer: read_bam.c + +Shows the data from alignments. + + ./read_bam /tmp/sample.sam.gz + + +### Aux data read + +Auxiliary data gives extra information about the alignment. There can be a +number of such data and can be accessed by specifying required tag or by +iterating one by one through them once the alignment is read as bam1_t. The +auxiliary data are stored along with the variable length data in the data +field of bam1_t. There are macros defined to retrieve information about +auxiliary data from the data field of bam1_t. + +Data for a specific tag can be retrieved as a string or can be retrieved as raw +data. bam_aux_get_str retrieves as a string, with tag name, tag type and data. +bam_aux_get can get raw data and with bam_aux_type and bam_aux2A, bam_aux2f etc. +the raw data can be extracted. + +To iterate through all data, the start of aux data is retrieved using macro +bam_aux_first and successive ones using bam_aux_next. Macro bam_aux_tag gives +the tag of the aux field and bam_aux_type gives the information about type of +the aux field. 
+ +bam_aux2i, bam_aux2f, bam_aux2Z macros retrieve the aux data's value as +integer, float and string respectively. The integer value may be of different +precision / size and the bam_aux_type character indicates how to use the +value. The string/hex data are NUL terminated. + +For array data, bam_aux_type will return 'B' and bam_auxB_len gives the length +of the array. bam_aux_type with the next byte will give the type of data in +the array. bam_auxB2i, bam_auxB2f will give integer and float data from a +given position of the array. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (i % 2) { //use options alternatively to demonstrate both + //option 1 - get data as string with tag and type + if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) { + printf("%s\n",sdata.s); + ... + //option 2 - get raw data + if (!(data = bam_aux_get(bamdata, tag))) { + ... + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + ... +Refer: read_aux.c + +Shows the MD aux tag from alignments. + + ./read_aux ../../samtools/test/mpileup/mpileup.1.bam MD + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + data = bam_aux_first(bamdata); //get the first aux data + while (data) { + printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 'i' : bam_aux_type(data)); + //macros gets the tag and type of aux data + //dump the data + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + ... + data = bam_aux_next(bamdata, data); //get the next aux data + ... +Refer: dump_aux.c + +Shows all the tags from all alignments. + + ./dump_aux ../../samtools/test/mpileup/mpileup.1.bam + + +### Add/Remove/Update header + +There are specific types of data that can be part of header data. They have +a tag from HD, SQ, RG, PG and CO. Fully formatted header lines, separated by new +line, can be added with sam_hdr_add_lines api.
A single header line can be added +using sam_hdr_add_line api where the header type, tag and value pair are passed +as arguments, terminated with a NULL argument. The PG header lines are special +that they have a kind of linkage to previous PG lines. This linkage can be auto +generated by using sam_hdr_add_pg api which sets the 'PP' field used in linkage. +sam_hdr_write api does the write of the header data to file. + + ... + //add SQ line with SN as TR1 and TR2 + if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) { //length as 0 for NULL terminated data + ... + //add RG line with ID as RG1 + if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) { + ... + //add pg line + if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) { //NULL is to indicate end of args + ... + if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) { //NULL is to indicate end of args + ... + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + ... +Refer: add_header.c + +Not all type of header data can be removed but where it is possible, either a +specific header line can be removed or all of a header type can be removed. To +remove a specific line, header type, unique identifier field tag and its value +to be used. To remove all lines of a type, header type and unique identifier +field tag are to be used. + + ... + //remove specific line + if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) { + ... + //remove multiple lines of a header type + if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) { + ... + if (sam_hdr_write(outfile, in_samhdr) < 0) { + ... +Refer: rem_header.c + +Shows the file content after removing SQ line with SN 2. + ./rem_header ../../samtools/test/mpileup/mpileup.1.bam SQ 2 + +The unique identifier for the line needs to be found to update a field, though +not all types in the header may be modifiable. 
The api sam_hdr_update_line +takes the unique identifier for the header line type, its value, the field +which needs to be modified and the new value with which to modify it, followed +by a NULL. +e.g. To change LN field from 2000 to 2250 in SQ line with unique identifier SN +as 'chr1', sam_hdr_update_line( header, "SQ", "SN", "chr1", "LN", "2250", +NULL). To change PP field from ABC to DEF in PG line with ID APP.10, +sam_hdr_update_line( header, "PG", "ID", "APP.10", "PP", "DEF", NULL). + + ... + //update with new data + if (sam_hdr_update_line(in_samhdr, header, id, idval, tag, val, NULL) < 0) { + printf("Failed to update data\n"); + goto end; + } + ... +Refer: update_header.c + +Shows new sam file with 2nd SQ line having length as 38. + + ./update_header /tmp/sample.sam.gz SQ T1 LN 38 + + +### Update alignment data + +Many of the bam data fields may be updated by setting new value to appropriate +field in bam1_core_t structure and for a few, creating a new bam1_t record would +be easier than update of existing record. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + ... + case 1:// QNAME + ret = bam_set_qname(bamdata, val); + break; + case 2:// FLAG + bamdata->core.flag = atol(val) & 0xFFFF; + break; + case 3:// RNAME + case 7:// RNEXT + if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) { + ... + if (field == 3) { + //reference + bamdata->core.tid = ret; + } + else { + //mate reference + bamdata->core.mtid = ret; + } + break; + case 4:// POS + bamdata->core.pos = atoll(val); + break; + case 5:// MAPQ + bamdata->core.qual = atoi(val) & 0x0FF; + break; + case 6:// CIGAR + { + ... + //get cigar array and set all data in new bam record + if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) { + ... 
+ if (bam_set1(newbam, bamdata->core.l_qname, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid, + bamdata->core.pos, bamdata->core.qual, ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos, + bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata), + (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) { + ... + //correct sequence data as input is expected in ascii format and not as compressed inside bam! + memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2); + //copy the aux data + memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata)); + ... + break; + case 8:// PNEXT + bamdata->core.mpos = atoll(val); + break; + case 9:// TLEN + bamdata->core.isize = atoll(val); + break; + case 10:// SEQ + ... + for( c = 0; c < i; ++c) { + bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]); + } + break; + case 11:// QUAL + ... + for (c = 0; c < i; ++c) { + val[c] -= 33; //phred score from ascii value + } + memcpy(bam_get_qual(bamdata), val, i); + ... +Refer: mod_bam.c + +Shows data with RNAME modified to T2. + + ./mod_bam /tmp/sample.sam ITR1 3 T2 + +The auxiliary data in bam1_t structure can be modified using +bam_aux_update_float, bam_aux_update_int etc. apis. If the aux field is not +present at all, it can be appended using bam_aux_append. + + ... + //matched to qname, update aux + if (!(data = bam_aux_get(bamdata, tag))) { + //tag not present append + ... + if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) { + ... + else { + char auxtype = bam_aux_type(data); + //update the tag with newer value + switch (type) { + case 'f': + case 'd': + ... + if (bam_aux_update_float(bamdata, tag, atof(val))) { + ... + case 'C': + case 'S': + case 'I': + ... + if (bam_aux_update_int(bamdata, tag, atoll(val))) { + ... + case 'Z': + ... + if (bam_aux_update_str(bamdata, tag, length, val)) { + ... + case 'A': + ... 
+ //update the char data directly on buffer + *(data+1) = val[0]; + ... +Refer: mod_aux.c + +Shows the given record's MD tag set to Test. + + ./mod_aux samtools/test/mpileup/mpileup.1.bam ERR013140.6157908 MD Z Test + +The array aux fields can be updated using bam_aux_update_array api. + + ... + if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) { + ... +Refer: mod_aux_ba.c + +Shows the records updated with an array of integers, containing count of ACGT +and N in that order. + + ./mod_aux_ba samtools/test/mpileup/mpileup.1.bam + + +### Create an index + +Indexes help to read data faster without iterating sequentially through the +file. Indexes contain position information about the alignments so that they +can be read easily. There are different types of indices, BAI, CSI, CRAI, TBI, +FAI etc. and they are usually used with iterators. + +Indexing of plain/textual files is not supported; compressed SAM & FASTA/Q, BAM, +and CRAM files can be indexed. CRAM files are indexed as .crai and the other two +can be indexed as .bai or .csi files. Each of these types has a different +internal representation of the index information. BAI uses fixed +configuration values whereas CSI has them dynamically updated based on the +alignment data. + +Indexes can be created either with save of alignment data or explicitly by +read of existing alignment file. + +To create index along with alignment write, the sam_idx_init api needs to be +invoked before the start of alignment data write. This api takes the output +samFile pointer, header pointer, minimum shift and index file path. For BAI +index, the min shift has to be 0. + +At the end of write, sam_idx_save api needs to be invoked to save the index. + + //write header + if (sam_hdr_write(outfile, in_samhdr)) { + ... + // initialize indexing, before start of write + if (sam_idx_init(outfile, in_samhdr, size, fileidx)) { + ... + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + ...
+ if (sam_idx_save(outfile)) { +Refer:index_write.c + +Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/. + + ./idx_on_write ../../samtools/test/mpileup/mpileup.1.bam 0 /tmp/ + +To create index explicitly on an existing alignment data file, the +sam_index_build api or the like can be used. sam_index_build takes the +alignment file path, min shift for the index and creates the index file in +same path. The output name will be based on the alignment file format and min +shift passed. + +The sam_index_build2 api takes the index file path as well and gives more +control than the previous one. The sam_index_build3 api provides an option to +configure the number of threads in index creation. + + +### Read with iterators + +Index files help to read required data without sequentially accessing the file +and are required to use iterators. The interested reference, start and end +position etc. are required to read data with iterators. With index and these +information, an iterator is created and relevant alignments can be accessed by +iterating it. + +The api sam_index_load and the like does the index loading. It takes input +samFile pointer and file path. It loads the index file based on the input file +name, from the same path and with implicit index file extension - cram file +with .crai and others with .bai. The sam_index_load2 api accepts explicit path +to index file, which allows loading it from a different location and explicit +extensions. The sam_index_load3 api supports download/save of the index +locally from a remote location. These apis return NULL on failure and index +pointer on success. + +The index file path can be appended to alignment file path and used as well. +In this case the paths are expected to be separated by '##idx##'. + +The sam_itr_queryi or sam_itr_querys apis may be used to create an iterator +and sam_itr_next api does the alignment data retrieval. Along with retrieval +of current data, it advances the iterator to next relevant data.
The +sam_itr_queryi takes the interested positions as numeric values and +sam_itr_querys takes the interested position as a string. + +With sam_itr_queryi, the reference id can be the 0 based index of reference +data, -2 for unmapped alignments, -3 to start read from beginning of file, -4 +to continue from current position, -5 to return nothing. Based on the +reference id given, alignment covering the given start and end positions will +be read with sam_itr_next api. + +With sam_itr_querys, the reference sequence is identified with the name and +interested positions can be described with start and end separated by '-' as +string. When sequence is identified as '.', it begins from the start of file +and when it is '*', unmapped alignments are read. Reference with <name>[:], +<name>:S, <name>:S-E, <name>:-E retrieves all data, all data covering position +S onwards, all data covering position S to E, all data covering up to position +E of reference with ID <name> respectively on read using sam_itr_next. + +The index and iterator created are to be destroyed once the need is over. +sam_itr_destroy and hts_idx_destroy apis do this. + + ... + //load index file + if (!(idx = sam_index_load2(infile, inname, idxfile))) { + ... + //create iterator + if (!(iter = sam_itr_querys(idx, in_samhdr, region))) { + ... + //read using iterator + while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) { + ... + if (iter) { + sam_itr_destroy(iter); + } + if (idx) { + hts_idx_destroy(idx); + ... +Refer:index_reg_read.c + +With sample.sam, region as \* will show alignments with name UNMAP2 and UNMAP3 + + ./read_reg /tmp/sample.sam.gz \* + +With region as \., it shows all alignments + + ./read_reg /tmp/sample.sam.gz \. + +With region as T1:1-4, start 1 and end 4 it shows nothing and with T1:1-5 it +shows alignment with name ITR1. + + ./read_reg /tmp/sample.sam.gz T1:1-5 + +With region as T2:30-100, it shows alignment with name ITR2M which refers the +reference data T2.
+ + ./read_reg /tmp/sample.sam.gz T2:30-100 + + +Multiple interested regions can be specified for read using sam_itr_regarray. +It takes the index, header, count of regions and region descriptions as array +of char array / string. The array passed needs to be released by the user +itself. + + ... + //load index file, assume it to be present in same location + if (!(idx = sam_index_load(infile, inname))) { + ... + //create iterator + if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) { + ... + if (regions) { + //can be freed as it is no longer required + free(regions); + regions = NULL; + } + //get required area + while ((c = sam_itr_multi_next(infile, iter, bamdata) >= 0)) { + ... +Refer:index_multireg_read.c + +With compressed sample.sam and 2 regions from reference T1 (30 to 32) and 1 +region from T2 (34 onwards), alignments with name A1, B1, A2 and ITR2M would +be shown. + + ./read_multireg /tmp/sample.sam.gz 2 T1:30-32,T2:34 + +To use numeric indices instead of textual regions, sam_itr_regions can be used. +It takes the index, header, count of regions and an array of region +description (hts_reglist_t*), which has the start end positions as numerals. + +The index and iterators are to be destroyed using the sam_itr_destroy and +hts_idx_destroy. The hts_reglist_t* array passed is destroyed by the library +on iterator destroy. The regions array (array of char array/string) needs to be +destroyed by the user itself. + + +### Pileup and MPileup + +Pileup shows the transposed view of the SAM alignment data, i.e. it shows +the reference positions and bases which cover that position through different +reads side by side. MPileup facilitates the piling up of multiple sam files +against each other and same reference at the same time. + +Mpileup has replaced the pileup. The input expects the data to be sorted by +position.
+ +Pileup needs to be initialized with bam_plp_init method which takes pointer +to a method, which will be called by pileup to read data from required files, +and pointer to data which might be required for this read method to do the +read operation. It returns a pointer to the pileup iterator. + +User can specify methods which need to be invoked during the load and unload +of an alignment, like constructor and destructor of objects. +bam_plp_constructor and bam_plp_destructor methods do the setup of +these methods in the pileup iterator. During invocation of these methods, the +pointer to data passed in the initialization is passed as well. If the user wants +to do any custom status handling or actions during load or unload, it can be +done in these methods. Alignment specific data can be created and stored in +an argument passed to the constructor and the same will be accessible during +pileup status return. The same will be accessible during destructor as well +where any deallocation can be made. + +User is expected to invoke bam_plp_auto api to get the pileup status. It +returns the pileup status or NULL on end. During this all alignments are read +one by one, using the method given in initialization for data read, until one +for a new reference is found or all alignments covering a position are read. On +such condition, the pileup status is returned and the same continues on next +bam_plp_auto call. The pileup status returned is an array for all positions +for which the processing is completed. Along with the result, the reference +index, position in reference data and number of alignments which cover this +position are passed. User can iterate the result array and get bases from each +alignment which covers the given reference position. The alignment specific +custom data which were created in constructor function will also be available +in the result.
+ +The bam_plp_auto api invokes the data read method to load an alignment and the +constructor method is invoked during the load. Once the end of alignment is +passed, it is removed from the processing and destructor method is invoked, +so that the user can do deallocations and custom actions as in load during this +time. The custom data passed during the initialization is passed to the +constructor and destructor methods during invocation. + +Once the forward and reverse strands are identified, the better of the quality +is identified and used. Both reads are required for this and hence reads are +cached until its mate is read. The maximum number of reads that can be cached +is controlled by bam_plp_set_maxcnt. Reads covering a position are cached and +as soon as mate is found, quality is adjusted and is removed from cache. Reads +above the cache limit are discarded. + +Once done, the pileup iterator is to be discarded using the bam_plp_destroy api. + + ... + if (!(plpiter = bam_plp_init(readdata, &conf))) { + ... + //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &n))) { + printf("%d\t%d\t", tid+1, refpos+1); + for (j = 0; j < n; ++j) { + //doesnt detect succeeding insertion and deletion together here, only insertion is identified + //deletion is detected in plp->is_del as and when pos reaches the position + //if detection ahead is required, use bam_plp_insertion here which gives deletion length along with insertion + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + //start and end are displayed in UPPER and rest on LOWER + printf("%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ?
toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]))); + if (plp[j].indel > 0) { + //insertions, anyway not start or end + printf("+%d", plp[j].indel); + for (k = 0; k < plp[j].indel; ++k) { + printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos + k + 1)])); + } + } + else if (plp[j].indel < 0) { + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + ... + if (plpiter) { + bam_plp_destroy(plpiter); + ... +Refer:pileup.c + +The read method may use a simple read or it could be an advanced read using +indices, iterators and region specifications based on the need. The constructor +method may create any custom data and store it in the pointer passed to it. The +same needs to be released by the user in the destructor method. + +MPileup works same as the pileup and supports multiple inputs against the same +reference, giving side by side view of reference and alignments from different +inputs. + +MPileup needs to be initialized with bam_mplp_init method which takes +pointer to a method, which will be called by pileup to read data from required +files, and an array of pointer to data which might be required for this read +method to do the read operation. It returns a pointer to the mpileup iterator. + +User can specify methods which need to be invoked during the load and unload +of an alignment, like constructor and destructor of objects. +bam_mplp_constructor and bam_mplp_destructor methods do the setup +of these methods in the pileup iterator. During invocation of these methods, +the pointer to data passed in the initialization is passed as well. If the user +wants to do any custom status handling or actions during load or unload, it can +be done in these methods. Alignment specific data can be created and +stored in the custom data pointer and the same will be accessible during +return of pileup status.
The same will be accessible during destructor as well +where any deallocation can be made. + +User is expected to invoke bam_mplp_auto api to get the pileup status. It +returns the pileup status. During this all alignments are read one by one, +using the method given in initialization for data read, until one for a new +reference is found or all alignments covering a position are read. On such +condition, the pileup status is returned and the same continues on next +bam_mplp_auto call. + +The pileup status is returned through a parameter in the method itself; it is an +array for all inputs, each containing array for positions on which the +processing is completed. Along with the result, the reference index, position +in reference data and number of alignments which cover this position are +passed. User can iterate the result array and get bases from each alignment +which covers the given reference position. The alignment specific custom data +which were created in constructor function will also be available in the +result. + +Once the forward and reverse strands are identified, the better of the quality +is identified and used. Both reads are required for this and hence reads are +cached until its mate is read. The maximum number of reads that can be cached +is controlled by bam_mplp_set_maxcnt. Reads covering a position are cached and +as soon as mate is found, quality is adjusted and is removed from cache. Reads +above the cache limit are discarded. + +Once done, the pileup iterator is to be discarded using the bam_mplp_destroy api. + + ... + if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) { + ...
+ //set constructor destructor callbacks + bam_mplp_constructor(mplpiter, plpconstructor); + bam_mplp_destructor(mplpiter, plpdestructor); + + while (bam_mplp64_auto(mplpiter, &tid, &refpos, depth, plp) > 0) { + printf("%d\t%"PRIhts_pos"\t", tid+1, refpos+1); + + for (input = 0; input < argc - 1; ++input) { + for (dpt = 0; dpt < depth[input]; ++dpt) { + if (plp[input][dpt].is_del || plp[input][dpt].is_refskip) { + printf("*"); + continue; + } + //start and end are displayed in UPPER and rest on LOWER + printf("%c", plp[input][dpt].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]) : (plp[input]->is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]) : tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]))); + if (plp[input][dpt].indel > 0) { + //insertions, anyway not start or end + printf("+%d", plp[input][dpt].indel); + for (k = 0; k < plp[input][dpt].indel; ++k) { + printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos + k + 1)])); + } + } + else if (plp[input][dpt].indel < 0) { + printf("%d", plp[input][dpt].indel); + for (k = 0; k < -plp[input][dpt].indel; ++k) { + printf("?"); + ... + if (mplpiter) { + bam_mplp_destroy(mplpiter); + } + ... + if (plp) { + free(plp); + ... +Refer:mpileup.c + +This sample takes multiple sam files and shows the pileup of data side by side. + + ./mpileup /tmp/mp.bam /tmp/mp.sam + + +### Base modifications + +The alignment data may contain base modification information as well. This +gives the base, modifications found, orientation in which it was found and the +quality for the modification. The base modification can be identified using +bam_parse_basemod api. It stores the modification details on hts_base_mod_state +and this has to be initialized using hts_base_mod_state_alloc api.
+ +Once the modifications are identified, they can be accessed through different +ways. bam_mods_recorded api gives the modifications identified for an alignment. +Modifications can be queried for each base position iteratively using +bam_mods_at_next_pos api. Check the returned value with buffer size to see +whether the buffer is big enough to retrieve all modifications. +Instead of querying for each position, the next modified position can be +retrieved directly using bam_next_basemod api. An alignment can be +queried to have a specific modification using bam_mods_query_type api. At the +end of processing, the state needs to be released using hts_base_mod_state_free +api. + + ... + if (!(ms = hts_base_mod_state_alloc())) { + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + ... + if (bam_parse_basemod(bamdata, ms)) { + ... + bm = bam_mods_recorded(ms, &cnt); + for (k = 0; k < cnt; ++k) { + printf("%c", bm[k]); + } + printf("\n"); + hts_base_mod mod[5] = {0}; //for ATCGN + if (opt) { + //option 1 + for (; i < bamdata->core.l_qseq; ++i) { + if ((r = bam_mods_at_next_pos(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]))) <= -1) { + printf("Failed to get modifications\n"); + goto end; + } + else if (r > (sizeof(mod) / sizeof(mod[0]))) { + printf("More modifications than this app can handle, update the app\n"); + goto end; + } + else if (!r) { + //no modification at this pos + printf("%c", seq_nt16_str[bam_seqi(data, i)]); + } + //modifications + for (j = 0; j < r; ++j) { + printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base); + ... + else { + //option 2 + while ((r = bam_next_basemod(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]), &pos)) >= 0) { + for (; i < bamdata->core.l_qseq && i < pos; ++i) { + printf("%c", seq_nt16_str[bam_seqi(data, i)]); + } + //modifications + for (j = 0; j < r; ++j) { + printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base); + } + ...
+ //check last alignment's base modification + int strand = 0, impl = 0; + char canonical = 0, modification[] = "mhfcgebaon"; //possible modifications + printf("\n\nLast alignment has \n"); + for (k = 0; k < sizeof(modification) - 1; ++k) { //avoiding NUL termination + if (bam_mods_query_type(ms, modification[k], &strand, &impl, &canonical)) { + printf ("No modification of %c type\n", modification[k]); + } + else { + printf("%s strand has %c modified with %c, can %sassume unlisted as unmodified\n", strand ? "-/bottom/reverse" : + "+/top/forward", canonical, modification[k], impl?"" : "not " ); + } + } + ... + if (ms) { + hts_base_mod_state_free(ms); + ... +Refer:modstate.c + +The modification can be accessed in pileup mode as well. bam_mods_at_qpos gives +the modification at given pileup position. Insertion and deletion to the given +position with possible modification can be retrieved using bam_plp_insertion_mod +api. + + ... + int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + //when using cd, initialize and use as it will be reused after destructor + cd->p = hts_base_mod_state_alloc(); + //parse the bam data and gather modification data from MM tags + return (-1 == bam_parse_basemod(b, (hts_base_mod_state*)cd->p)) ? 1 : 0; + } + + int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + if (cd->p) { + hts_base_mod_state_free((hts_base_mod_state *)cd->p); + cd->p = NULL; + } + return 0; + } + + int main(int argc, char *argv[]) + { + ... + if (!(plpiter = bam_plp_init(readdata, &conf))) { + ... 
+ //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &depth))) { + memset(&mods, 0, sizeof(mods)); + printf("%d\t%d\t", tid+1, refpos+1); + + for (j = 0; j < depth; ++j) { + dellen = 0; + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + /*invoke bam mods_mods_at_qpos before bam_plp_insertion_mod that the base modification + is retrieved before change in pileup pos thr' plp_insertion_mod call*/ + if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) { + ... + //use plp_insertion/_mod to get insertion and del at the same position + if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) { + ... + //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered + //base and modification + printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)])), + modlen > 0 ? mods[0].strand ? '-' : '+' : '\0', modlen > 0 ? mods[0].modified_base : '\0'); + //insertion and deletions + if (plp[j].indel > 0) { + //insertion + /*insertion data from plp_insertion_mod, note this shows the quality value as well + which is different from base and modification above;the lower case display is not attempted either*/ + printf("+%d%s", plp[j].indel, insdata.s); + //handle deletion if any + if (dellen) { + printf("-%d", dellen); + for (k = 0; k < dellen; ++k) { + printf("?"); + ... + else if (plp[j].indel < 0) { + //deletion + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + } + ... 
+Refer:pileup_mod.c + + +### Read selected fields + +At times the whole alignment data may not be of interest and it would be +better to read required fields alone from the alignment data. CRAM file format +supports such specific data read and HTSLib provides an option to use this. +This can improve the performance on read operation. + +The hts_set_opt method does the selection of specified fields. There are flags +indicating specific fields, like SAM_FLAG, SAM_SEQ, SAM_QNAME, in alignment +data and a combination of flags for the required fields can be passed with +CRAM_OPT_REQUIRED_FIELDS to this api. + + ... + //select required field alone, this is useful for CRAM alone + if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) { + ... + //read header + in_samhdr = sam_hdr_read(infile); + ... + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + ... +Refer: flags_htsopt_field.c + + +### Thread-pool to read / write + +The HTSLib api supports thread pooling for better performance. There are a few +ways in which this can be used. The pool can be made specific for a file or a +generic pool can be created and shared across multiple files. Another way to +use thread pool is to schedule tasks explicitly to queues which gets executed +using threads in pool. + +To have a thread pool specific for a file, hts_set_opt api can be used with the +file pointer, HTS_OPT_NTHREADS and the number of threads to use in the pool. +Closure of file releases the thread pool as well. To have a thread pool which +can be shared across different files, it needs to be initialized using +hts_tpool_init api, passing number of threads as argument. This thread pool can +be associated with a file using hts_set_opt api. The file pointer, +HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments +to api. The thread pool has to be released with hts_tpool_destroy. 
+ +Below excerpt shows file specific thread pool, + ... + //create file specific threads + if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading + hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for sam write + hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) { //1 thread specific for bam write + printf("Failed to set thread options\n"); + goto end; + } +Refer: split_thread1.c + +Below excerpt shows thread pool shared across files, + ... + //create a pool of 4 threads + if (!(tpool.pool = hts_tpool_init(4))) { + ... + //share the pool with all the 3 files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) { + ... + if (tpool.pool) { + hts_tpool_destroy(tpool.pool); + } + ... +Refer: split_thread2.c + + +## More Information + +### CRAM reference files + +The cram reference data is required for the read of sequence data in CRAM +format. The sequence data file may have it as embedded or as a reference to +the actual file. When it is a reference, it is downloaded locally, in the +cache directory for later usage. It will be stored in a directory structure +based on the MD5 checksum in the cache directory. + +Each chromosome in a reference file gets saved as a separate file with md5sum +as its path and name. The initial 4 numerals make the directory name and rest +as the file name (/<1st 2 of md5sum>/<2nd 2 of md5sum>/). + +The download would be attempted from standard location, EBI ENA +(https://www.ebi.ac.uk/ena). + + +### Bam1_t + +This structure holds the sequence data in BAM format. There are fixed and +variable size fields, basic and extended information on sequence +data. Variable size data and extended information are kept together in a +buffer, named data in bam1_t. 
Fields in the member named core, bam1_core_t, +and a few macros together support the storage and handling of the whole +sequence data. + +- core has a link to reference as a 0 based index in field tid. The mate / + reverse strand's link to reference is given by mtid. + +- Fields pos and mpos give the position in reference to which the sequence and + its mate / reverse strand match. + +- Field flag gives the properties of the given alignment. It shows the + alignment's orientation, mate status, read order etc. + +- Field qual gives the mapping quality of the alignment. + +- l_qname gives the length of the name of the alignment / read, l_extranul gives + the extra space used internally in the data field. + +- l_qseq gives the length of the alignment / read in the data field. + +- n_cigar gives the number of CIGAR operations for the given alignment. + +- isize gives the insert size of the read / alignment. + +The bases in sequence data are stored by compressing 2 bases together in a +byte. When the reverse flag is set, the base data is reversed and +complemented from the actual read (i.e. if the forward read is ACTG, the +reverse read will be CAGT; it will be stored in SAM format with reversed and +complemented format as ACTG with reverse flag set). + +Macros bam_get_qname, bam_get_seq, bam_get_qual, bam_get_aux, bam_get_l_aux, +bam_seqi etc access the data field and retrieve the required data. The aux +macros support the retrieval of auxiliary data from the data field. + + +### Sam_hdr_t + +This structure holds the header information. This holds the number of targets +/ SQ lines in the file, each one's length, name and reference count to this +structure. It also has this information in an internal data structure for +easier access of each field of this data. + +When this data is shared or assigned to another variable of a different scope +or purpose, the reference count needs to be incremented to ensure that it is +valid till the end of the variable's scope.
This is done using sam_hdr_incr_ref and it needs to +be destroyed as many times with sam_hdr_destroy api. + + +### Index + +Indices need the data to be sorted by position. They can be of different +types with extension .bai, .csi or .tbi for compressed SAM/BAM files and .crai +for CRAM files. The index name can be passed along with the alignment file +itself by appending a specific character sequence. The apis can detect this +sequence and extract the index path. ##idx## is the sequence which separates +the file path and index path. + + +### Data files + +The data files can be a local file, a network file, a file accessible through +the web or in cloud storage like google and amazon. The data files can be +represented with URIs like file://, file://localhost/.., ftp://.., +gs+http[s].., s3+http[s]:// + diff --git a/samples/Makefile b/samples/Makefile new file mode 100644 index 000000000..40991d78f --- /dev/null +++ b/samples/Makefile @@ -0,0 +1,106 @@ +HTS_DIR = ../ +include $(HTS_DIR)/htslib_static.mk + +CC = gcc +CFLAGS = -Wall -g -O0 + +#to statically link to libhts +LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_static_LIBS) + +#to dynamically link to libhts +#LDFLAGS = -L $(HTS_DIR) -lhts -Wl,-rpath, + +PRGS = flags split split2 cram read_fast read_header read_ref read_bam \ + read_aux dump_aux add_header rem_header update_header mod_bam mod_aux \ + mod_aux_ba write_fast idx_on_write read_reg read_multireg pileup \ + mpileup modstate pileup_mod flags_field split_t1 split_t2 + +all: $(PRGS) + +flags: + $(CC) $(CFLAGS) -I $(HTS_DIR) flags_demo.c -o $@ $(LDFLAGS) + +split: + $(CC) $(CFLAGS) -I $(HTS_DIR) split.c -o $@ $(LDFLAGS) + +split2: + $(CC) $(CFLAGS) -I $(HTS_DIR) split2.c -o $@ $(LDFLAGS) + +cram: + $(CC) $(CFLAGS) -I $(HTS_DIR) cram.c -o $@ $(LDFLAGS) + +read_fast: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_fast.c -o $@ $(LDFLAGS) + +read_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_header.c -o $@ $(LDFLAGS) + +read_ref: + $(CC) $(CFLAGS) -I
$(HTS_DIR) read_refname.c -o $@ $(LDFLAGS) + +read_bam: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_bam.c -o $@ $(LDFLAGS) + +read_aux: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_aux.c -o $@ $(LDFLAGS) + +dump_aux: + $(CC) $(CFLAGS) -I $(HTS_DIR) dump_aux.c -o $@ $(LDFLAGS) + +add_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) add_header.c -o $@ $(LDFLAGS) + +rem_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) rem_header.c -o $@ $(LDFLAGS) + +update_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) update_header.c -o $@ $(LDFLAGS) + +mod_bam: + $(CC) $(CFLAGS) -I $(HTS_DIR) mod_bam.c -o $@ $(LDFLAGS) + +mod_aux: + $(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux.c -o $@ $(LDFLAGS) + +mod_aux_ba: + $(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux_ba.c -o $@ $(LDFLAGS) + +write_fast: + $(CC) $(CFLAGS) -I $(HTS_DIR) write_fast.c -o $@ $(LDFLAGS) + +idx_on_write: + $(CC) $(CFLAGS) -I $(HTS_DIR) index_write.c -o $@ $(LDFLAGS) + +read_reg: + $(CC) $(CFLAGS) -I $(HTS_DIR) index_reg_read.c -o $@ $(LDFLAGS) + +read_multireg: + $(CC) $(CFLAGS) -I $(HTS_DIR) index_multireg_read.c -o $@ $(LDFLAGS) + +pileup: + $(CC) $(CFLAGS) -I $(HTS_DIR) pileup.c -o $@ $(LDFLAGS) + +mpileup: + $(CC) $(CFLAGS) -I $(HTS_DIR) mpileup.c -o $@ $(LDFLAGS) + +modstate: + $(CC) $(CFLAGS) -I $(HTS_DIR) modstate.c -o $@ $(LDFLAGS) + +pileup_mod: + $(CC) $(CFLAGS) -I $(HTS_DIR) pileup_mod.c -o $@ $(LDFLAGS) + +flags_field: + $(CC) $(CFLAGS) -I $(HTS_DIR) flags_htsopt_field.c -o $@ $(LDFLAGS) + +split_t1: + $(CC) $(CFLAGS) -I $(HTS_DIR) split_thread1.c -o $@ $(LDFLAGS) + +split_t2: + $(CC) $(CFLAGS) -I $(HTS_DIR) split_thread2.c -o $@ $(LDFLAGS) + +clean: + find . -name "*.o" | xargs rm -rf + find . 
-name "*.dSYM" | xargs rm -rf + rm $(PRGS) + + diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 000000000..0438e1fc4 --- /dev/null +++ b/samples/README.md @@ -0,0 +1,229 @@ +[![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib/samples) + +HTSlib is an implementation of a unified C library for accessing common file +formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing +data, and is the core library used by [samtools][2] and [bcftools][3]. + +A set of sample programs are available which showcases the usage of APIs in HTSlib. +They are based on version 1.17 of HTSLib and are mainly for demonstration of API usage. +Further optimization and error handling might be required for actual usage. + + +[1]: http://samtools.github.io/hts-specs/ +[2]: http://github.com/samtools/samtools +[3]: http://samtools.github.io/bcftools/ + +### Building and using sample programs + +GCC and compatible compilers can be used to build these samples. + +A makefile is available along with source files which links statically to +htslib. To use dynamic linking, update the makefile's 'LDFLAGS' and 'rpath' +path. The 'rpath' path to be set as the path to lib directory of htslib +installation. + +```sh + +# linking statically on a linux machine +gcc -g -o -I \ + /libhts.a -lcrypto -lm -lpthread -lcurl -llzma -lz -lbz2 + +# dynamically linking with libhts +gcc -g -o -I \ + -L -lhts -Wl,-rpath, + +``` + +In many cases, the alignment data are expected as sorted, compressed and +indexed. + +### The samples... + +[Flags][Flags] + + This application showcases the basic read of alignment files and flag + access. It reads and shows the count of read1 and read2 alignments. + +[Split][Split] + + This application showcases the basic read and write of alignment data. It + saves the read1 and read2 as separate files in given directory, one as sam + and other as bam. 
+ +[Split2][Split2] + + This application showcases the output file format selection. It saves the + read1 and read2 as separate files in given directory, both as compressed + sam though the extensions are different. + +[Cram][Cram] + + This application showcases the different ways in which cram reference data + is used for cram output creation. + +[Read_fast][Read_fast] + + This application showcases the fasta/fastq data read. + +[Read_header][Read_header] + + This application showcases the read and access of header data. It can show + all header lines of given type, data of a given tag on a specific header + line or for all lines of given type. + +[Read_ref][Read_ref] + + This application showcases the read and access of header data. It shows + all reference names which have length greater than or equal to given input. + +[Read_bam][Read_bam] + + This application showcases read of different alignment data fields. It + shows contents of each alignment. + +[Read_aux][Read_aux] + + This application showcases read of specific auxiliary tag data in + alignment. It shows the data retrieved using 2 APIs, one as a string with + tag data and other as raw data alternatively. + +[Dump_aux][Dump_aux] + + This application showcases read of all auxiliary tag data one by one in an + alignment. It shows the data retrieved. + +[Add_header][Add_header] + + This application showcases the write of header lines to a file. It adds + header line of types, SQ, RG, PG and CO and writes to standard output. + +[Remove_header][Remove_header] + + This application showcases removal of header line from a file. It removes + either all header lines of given type or one specific line of given type + with given unique identifier. Modified header is written on standard + output. + +[Update_header][Update_header] + + This application shows the update of header line fields, where update is + allowed. It takes the header line type, unique identifier for the line, + tag to be modified and the new value.
Updated data is written on standard + output. + +[Mod_bam][Mod_bam] + + This application showcases the update of alignment data. It takes + alignment name, position of field to be modified and new value of + it. Modified data is written on standard output. + +[Mod_aux][Mod_aux] + + This application showcases the update of auxiliary data in alignment. It + takes alignment name, tag to be modified, its type and new value. Modified + data is written on standard output. + +[Mod_aux_ba][Mod_aux_ba] + + This application showcases the update of auxiliary array data in + alignment. It adds count of ATCGN base as an array in auxiliary data, + BA:I. Modified data is written on standard output. + +[Write_fast][Write_fast] + + This application showcases the fasta/fastq data write. It appends a dummy + data to given file. + +[Index_write][Index_write] + + This application showcases the creation of index along with output + creation. Based on file type and shift, it creates bai, csi or crai files. + +[Read_reg][Read_reg]: + + This application showcases the usage of region specification in alignment + read. + +[Read_multireg][Read_multireg]: + + This application showcases the usage of multiple region specification in + alignment read. + +[Pileup][Pileup]: + + This application showcases the pileup api, where all alignments covering a + reference position are accessed together. It displays the bases covering + each position on standard output. + +[Mpileup][Mpileup]: + + This application showcases the mpileup api, which supports multiple input + files for pileup and gives a side by side view of them in pileup + format. It displays the bases covering each position on standard output. + +[Modstate][Modstate]: + + This application showcases the access of base modifications in + alignment. It shows the modifications present in an alignment and accesses + them using available APIs. There are 2 APIs and which one to be used can + be selected through input.
+ +[Pileup_mod][Pileup_mod]: + + This application showcases the base modification access in pileup mode. It + shows the pileup display with base modifications. + +[Flags_field][Flags_field] + + This application showcases the read of selected fields alone, reducing the + overhead / increasing the performance. It reads the flag field alone and + shows the count of read1 and read2. This has impact only on CRAM files. + +[Split_thread1][Split_thread1] + + This application showcases the use of threads in file handling. It saves + the read1 and read2 as separate files in given directory, one as sam and + other as bam. 2 threads are used for read and 1 each dedicated for each + output file. + +[Split_thread2][Split_thread2] + + This application showcases the use of thread pool in file handling. It + saves the read1 and read2 as separate files in given directory, one as sam + and other as bam. A pool of 4 threads is created and shared for both read + and write. + +### More Information + +More detailed documentation is available in the [DEMO.md][DEMO] with worked +examples per demonstration tool. 
+ + +[Flags]: flags_demo.c +[Split]: split.c +[Split2]: split2.c +[Cram]: cram.c +[Read_fast]: read_fast.c +[Read_header]: read_header.c +[Read_ref]: read_refname.c +[Read_bam]: read_bam.c +[Read_aux]: read_aux.c +[Dump_aux]: dump_aux.c +[Add_header]: add_header.c +[Remove_header]: rem_header.c +[Update_header]: update_header.c +[Mod_bam]: mod_bam.c +[Mod_aux]: mod_aux.c +[Mod_aux_ba]: mod_aux_ba.c +[Write_fast]: write_fast.c +[Index_write]: index_write.c +[Read_reg]: index_reg_read.c +[Read_multireg]: index_multireg_read.c +[Pileup]: pileup.c +[Mpileup]: mpileup.c +[Modstate]: modstate.c +[Pileup_mod]: pileup_mod.c +[Flags_field]: flags_htsopt_field.c +[Split_thread1]: split_thread1.c +[Split_thread2]: split_thread2.c +[DEMO]: DEMO.md diff --git a/samples/add_header.c b/samples/add_header.c new file mode 100644 index 000000000..d1a2fc13c --- /dev/null +++ b/samples/add_header.c @@ -0,0 +1,128 @@ +/* add_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: add_header infile\n\ +Adds new header lines of SQ, RG, PG and CO typs\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, sq[] = "@SQ\tSN:TR1\tLN:100\n@SQ\tSN:TR2\tLN:50"; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + + //update_header infile header idval tag value + if (argc != 2) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + if (!(outfile = sam_open("-", "w"))) { //use stdout as the output file for ease of display of update + printf("Could not open stdout\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + //dump command line arguments for PG line + for (c = 0; c < argc; ++c) { + kputs(argv[c], &data); + kputc(' ', &data); + } + + //add SQ line with SN as TR1 and TR2 + if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) { //length as 0 for NULL terminated data + printf("Failed to add SQ lines\n"); + goto end; + } + + //add RG line with ID as RG1 + if 
(sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) { + printf("Failed to add RG line\n"); + goto end; + } + + //add pg line + if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) { //NULL is to indicate end of args + printf("Failed to add PG line\n"); + goto end; + } + + if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) { //NULL is to indicate end of args + printf("Failed to add PG line\n"); + goto end; + } + + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + printf("Failed to write output\n"); + goto end; + } + ret = EXIT_SUCCESS; + //bam data write to follow.... +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + ks_free(&data); + return ret; +} diff --git a/samples/cram.c b/samples/cram.c new file mode 100644 index 000000000..5f55e65d2 --- /dev/null +++ b/samples/cram.c @@ -0,0 +1,168 @@ +/* cram.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - prints the usage of this demo +/** @param fp pointer to the file / terminal to which the usage text is written +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: cram infile reffile outdir\n\ +Dumps the input file alignments in cram format in given directory\n\ +1.cram has external reference\n\ +2.cram has reference embedded\n\ +3.cram has autogenerated reference\n\ +4.cram has no reference data in it\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL, *reffile = NULL; + char *file1 = NULL, *file2 = NULL, *file3 = NULL, *file4 = NULL, *reffmt1 = NULL, *reffmt2 = NULL; + int c = 0, ret = EXIT_FAILURE, size1 = 0, size2 = 0, size3 = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL, *outfile3 = NULL, *outfile4 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + htsFormat fmt1 = {0}, fmt2 = {0}, fmt3 = {0}, fmt4 = {0}; + + //cram infile reffile outdir + if (argc != 4) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + reffile = argv[2]; + outdir = argv[3]; + + //allocate space for option string and output file names + size1 = sizeof(char) * (strlen(reffile) + sizeof("cram,reference=") + 1); + size2 = sizeof(char) * (strlen(reffile) + sizeof("cram,embed_ref=1,reference=") + 1); + size3 = sizeof(char) * (strlen(outdir) + sizeof("/1.cram") + 1); + + reffmt1 = malloc(size1);
reffmt2 = malloc(size2); + file1 = malloc(size3); file2 = malloc(size3); + file3 = malloc(size3); file4 = malloc(size3); + + if (!file1 || !file2 || !file3 || !file4 || !reffmt1 || !reffmt2) { + printf("Failed to create buffers\n"); + goto end; + } + + snprintf(reffmt1, size1, "cram,reference=%s", reffile); + snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile); + snprintf(file1, size3, "%s/1.cram", outdir); snprintf(file2, size3, "%s/2.cram", outdir); + snprintf(file3, size3, "%s/3.cram", outdir); snprintf(file4, size3, "%s/4.cram", outdir); + + if (hts_parse_format(&fmt1, reffmt1) == -1 || //using external reference - uses the M5/UR tags to get reference data during read + hts_parse_format(&fmt2, reffmt2) == -1 || //embed the reference internally + hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 || //embed autogenerated reference + hts_parse_format(&fmt4, "cram,no_ref=1") == -1) { //no reference data encoding at all + printf("Failed to set output option\n"); + goto end; + } + + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM, wc as CRAM (equivalent to fmt3) + outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2); + outfile3 = sam_open_format(file3, "wc", &fmt3); outfile4 = sam_open_format(file4, "wc", &fmt4); + if (!outfile1 || !outfile2 || !outfile3 || !outfile4) { + printf("Could not open output file\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1) || + (sam_hdr_write(outfile3, in_samhdr) == -1) || 
(sam_hdr_write(outfile4, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0 || + sam_write1(outfile2, in_samhdr, bamdata) < 0 || + sam_write1(outfile3, in_samhdr, bamdata) < 0 || + sam_write1(outfile4, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: +#define IF_OL(X,Y) if((X)) {(Y);} //if one liner + //cleanup + IF_OL(in_samhdr, sam_hdr_destroy(in_samhdr)); + IF_OL(infile, sam_close(infile)); + IF_OL(outfile1, sam_close(outfile1)); + IF_OL(outfile2, sam_close(outfile2)); + IF_OL(outfile3, sam_close(outfile3)); + IF_OL(outfile4, sam_close(outfile4)); + IF_OL(file1, free(file1)); + IF_OL(file2, free(file2)); + IF_OL(file3, free(file3)); + IF_OL(file4, free(file4)); + IF_OL(reffmt1, free(reffmt1)); + IF_OL(reffmt2, free(reffmt2)); + IF_OL(fmt1.specific, hts_opt_free(fmt1.specific)); + IF_OL(fmt2.specific, hts_opt_free(fmt2.specific)); + IF_OL(fmt3.specific, hts_opt_free(fmt3.specific)); + IF_OL(fmt4.specific, hts_opt_free(fmt4.specific)); + IF_OL(bamdata, bam_destroy1(bamdata)); + + return ret; +} diff --git a/samples/dump_aux.c b/samples/dump_aux.c new file mode 100644 index 000000000..49251fe04 --- /dev/null +++ b/samples/dump_aux.c @@ -0,0 +1,188 @@ +/* dump_aux.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: dump_aux infile\n\ +Dump the aux tags from alignments\n"); +} + +/// printauxdata - prints aux data +/** @param fp - file to which it to be printed - stdout or null + * @param type - aux type + * @param idx - index in array, -1 when not an array type + * @param data - data + * recurses when the data is array type +returns 1 on failure 0 on success +*/ +int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data) +{ + uint32_t auxBcnt = 0; + int i = 0; + char auxBType = 'Z'; + + //the tag is already queried and ensured to exist and the type is retrieved from 
the tag data, also iterated within index for arrays, so no error is expected here. + //when these apis are used explicitly, these error conditions needs to be handled based on return value and errno + switch(type) { + case 'A': + fprintf(fp, "%c", bam_aux2A(data)); //byte data + break; + case 'c': + fprintf(fp, "%d", (int8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 1 byte data; bam_auxB2i - from array or bam_aux2i - non array data + break; + case 'C': + fprintf(fp, "%u", (uint8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 1 byte data + break; + case 's': + fprintf(fp, "%d", (int16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 2 byte data + break; + case 'S': + fprintf(fp, "%u", (uint16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 2 byte data + break; + case 'i': + fprintf(fp, "%d", (int32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 4 byte data + break; + case 'I': + fprintf(fp, "%u", (uint32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 4 byte data + break; + case 'f': + case 'd': + fprintf(fp, "%g", (float)(idx > -1 ? 
bam_auxB2f(data, idx) : bam_aux2f(data))); //floating point data, 4 bytes + break; + case 'H': + case 'Z': + fprintf(fp, "%s", bam_aux2Z(data)); //array of char or hex data + break; + case 'B': //array of char/int/float + auxBcnt = bam_auxB_len(data); //length of array + auxBType = bam_aux_type(data + 1); //type of element in array + fprintf(fp, "%c", auxBType); + for (i = 0; i < auxBcnt; ++i) { //iterate the array + fprintf(fp, ","); + //calling recurssively with index to reuse a few lines + if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) { + return EXIT_FAILURE; + } + } + break; + default: + printf("Invalid aux tag?\n"); + return EXIT_FAILURE; + break; + } + return EXIT_SUCCESS; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; + int ret = EXIT_FAILURE; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL; + int ret_r = 0; + bam1_t *bamdata = NULL; + uint8_t *data = NULL; + + //dump_aux infile + if (argc != 2) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + errno = 0; + data = NULL; + data = bam_aux_first(bamdata); //get the first aux data + while (data) { + printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 
'i' : bam_aux_type(data)); //macros gets the tag and type of aux data + //dump the data + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + printf("Failed to dump aux data\n"); + goto end; + } + else { + printf(" "); + } + data = bam_aux_next(bamdata, data); //get the next aux data + } + if (ENOENT != errno) { + printf("\nFailed to get aux data\n"); + goto end; + } + printf("\n"); + } + if (ret_r < -1) { + //read error + printf("Failed to read data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/flags_demo.c b/samples/flags_demo.c new file mode 100644 index 000000000..e03fc6cd8 --- /dev/null +++ b/samples/flags_demo.c @@ -0,0 +1,110 @@ +/* flags_demo.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: flags \n\ +Shows the count of read1 and read2 alignments\n\ +This shows basic reading and alignment flag access\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + int64_t cntread1 = 0, cntread2 = 0; //count + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf( "Failed to read header from file\n"); + goto end; + } + + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + } + if (bamdata->core.flag & BAM_FREAD2) { + cntread2++; + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + printf("File %s 
has %"PRIhts_pos" read1 and %"PRIhts_pos" read2 alignments\n", inname, cntread1, cntread2); + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/flags_htsopt_field.c b/samples/flags_htsopt_field.c new file mode 100644 index 000000000..4b64445e3 --- /dev/null +++ b/samples/flags_htsopt_field.c @@ -0,0 +1,115 @@ +/* flags_htsopt_field.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: flags_field \n\ +Shows the count of read1 and read2 alignments\n\ +This shows reading selected fields from CRAM file\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + int64_t cntread1 = 0, cntread2 = 0; //count + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //select required field alone, this is useful for CRAM alone + if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) { + printf("Failed to set htsoption\n"); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file\n"); + goto end; + } + + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + 
cntread1++; + } + if (bamdata->core.flag & BAM_FREAD2) { + cntread2++; + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + printf("File %s has %"PRIhts_pos" read1 and %"PRIhts_pos" read2 alignments\n", inname, cntread1, cntread2); + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/index_multireg_read.c b/samples/index_multireg_read.c new file mode 100644 index 000000000..dbe8f15f9 --- /dev/null +++ b/samples/index_multireg_read.c @@ -0,0 +1,150 @@ +/* index_multireg_read.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the print_usage +/** @param fp pointer to the file / terminal to which print_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_multireg infile count regspec_csv\n\ + Reads alignment of a target matching to given region specifications\n\ + read_multireg infile.sam 2 R1:10-100,R2:200"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; + char *ptr = NULL; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + hts_idx_t *idx = NULL; + hts_itr_t *iter = NULL; + unsigned int regcnt = 0; + char **regions = NULL; + + //read_multireg infile count regspec_csv + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + regcnt = atoi(argv[2]); + regions = calloc(regcnt, sizeof(char*)); + //set each regspec as separate entry in region array + ptr = argv[3]; + for (c = 0; ptr && (c < regcnt); ++c) { + regions[c] = ptr; + ptr = strchr(ptr, ','); + if (ptr) { *ptr = '\0'; ++ptr; } + } + + if (regcnt == 0) { + printf("Region count can not be 0\n"); + goto end; + } + //initialize bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open files, use stdout as output SAM file for ease of display + infile = sam_open(inname, "r"); + outfile = 
sam_open("-", "w"); + if (!outfile || !infile) { + printf("Could not open in/out files\n"); + goto end; + } + //load index file, assume it to be present in same location + if (!(idx = sam_index_load(infile, inname))) { + printf("Failed to load the index\n"); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //create iterator + if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) { + printf("Failed to get iterator\n"); + goto end; + } + if (regions) { + //can be freed as it is no longer required + free(regions); + regions = NULL; + } + + //get required area; c must hold the iterator's return value (not the result of the >= test) for the check after the loop + while ((c = sam_itr_multi_next(infile, iter, bamdata)) >= 0) { + //write to output + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (c != -1) { + printf("Error during read\n"); + goto end; + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (iter) { + sam_itr_destroy(iter); + } + if (idx) + hts_idx_destroy(idx); + return ret; +} diff --git a/samples/index_reg_read.c b/samples/index_reg_read.c new file mode 100644 index 000000000..346d5428f --- /dev/null +++ b/samples/index_reg_read.c @@ -0,0 +1,143 @@ +/* index_reg_read.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the print_usage +/** @param fp pointer to the file / terminal to which print_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: readreg infile idxfile region\n\ +Reads alignments matching to a specific region\n\ +\\. 
from start of file\n\ +\\* only unmapped reads\n\ +REFNAME all reads referring REFNAME\n\ +REFNAME:S all reads referring REFNAME and overlapping from S onwards\n\ +REFNAME:S-E all reads referring REFNAME overlapping from S to E\n\ +REFNAME:-E all reads referring REFNAME overlapping upto E\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *region = NULL; + char *idxfile = NULL; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + hts_idx_t *idx = NULL; + hts_itr_t *iter = NULL; + + //readreg infile indexfile region + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + idxfile = argv[2]; + region = argv[3]; + + //initialize bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + + //open files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open input file\n"); + goto end; + } + //using stdout as output file for ease of dumping data + if (!(outfile = sam_open("-", "w"))) { + printf("Could not open out file\n"); + goto end; + } + //load index file + if (!(idx = sam_index_load2(infile, inname, idxfile))) { + printf("Failed to load the index\n"); + goto end; + } + //can use sam_index_load if the index file is present in same location and follows standard naming conventions (i.e. .) 
+ + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //create iterator + if (!(iter = sam_itr_querys(idx, in_samhdr, region))) { + printf("Failed to get iterator\n"); + goto end; + } + //read using iterator + while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) { + //write to output + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (c != -1) { + printf("Error during read\n"); + goto end; + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (iter) { + sam_itr_destroy(iter); + } + if (idx) { + hts_idx_destroy(idx); + } + return ret; +} diff --git a/samples/index_write.c b/samples/index_write.c new file mode 100644 index 000000000..8fd2bc968 --- /dev/null +++ b/samples/index_write.c @@ -0,0 +1,166 @@ +/* index_write.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: idx_on_write infile shiftsize outdir\n\ +Creates compressed sam file and index file for it in given directory\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *outdir = NULL; + char *inname = NULL, *fileidx = NULL, *outname = NULL, outmode[4] = "w"; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + //idx_on_write infile sizeshift outputdirectory + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + size = atoi(argv[2]); + outdir = argv[3]; + + //allocate space for output name - outdir/filename.ext.idxextNUL + c = strlen(basename(inname)) + strlen(outdir) + 10; + fileidx = malloc(sizeof(char) * c); + outname = malloc(sizeof(char) * c); + if (!fileidx || !outname) { + printf("Couldnt allocate memory\n"); + goto end; + } + //initialize bam storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + + //open files + if ((infile = sam_open(inname, "r"))) { + //get file type and create output names + if (infile->format.format == cram) { + //set as crai + snprintf(fileidx, c, "%s/%s.crai", outdir, 
basename(inname)); + snprintf(outname, c, "%s/%s", outdir, basename(inname)); + } + else { + //set as either bai or csi based on interval + if (infile->format.format == sam && infile->format.compression == no_compression) { + //create as gzip compressed + snprintf(outname, c, "%s/%s.gz", outdir, basename(inname)); + snprintf(fileidx, c, "%s/%s.gz.%s", outdir, basename(inname), !size ? "bai" : "csi"); + } + else { + //with same name as input + snprintf(outname, c, "%s/%s", outdir, basename(inname)); + snprintf(fileidx, c, "%s/%s.%s", outdir, basename(inname), !size ? "bai" : "csi"); + } + } + } + c = 0; + if (infile) sam_open_mode(outmode + 1, outname, NULL); //outname is filled in only when the input opened; set extra write options based on name + outfile = infile ? sam_open(outname, outmode) : NULL; //never open an output from an unset name + if (!outfile || !infile) { + printf("Could not open files\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if (sam_hdr_write(outfile, in_samhdr)) { + printf("Failed to write header\n"); + goto end; + } + + // initialize indexing, before start of write + if (sam_idx_init(outfile, in_samhdr, size, fileidx)) { + printf("idx initialization failed\n"); + goto end; + } + //read and write alignments + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + goto end; + } + } + if (c != -1) { + printf("Error in reading data\n"); + goto end; + } + //else EOF, save index + if (sam_idx_save(outfile)) { + printf("Could not save index\n"); + goto end; + } + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (fileidx) { + free(fileidx); + } + if (outname) { + free(outname); + } + if (outfile) { + sam_close(outfile); + } + return ret; +} diff --git a/samples/mod_aux.c b/samples/mod_aux.c new file mode 100644 index 
000000000..d5ed18cde
--- /dev/null
+++ b/samples/mod_aux.c
@@ -0,0 +1,230 @@
+/* mod_aux.c -- showcases the htslib api usage
+
+   Copyright (C) 2023 Genome Research Ltd.
+
+   Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */
+
+#include
+#include
+#include
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mod_aux infile QNAME tag type val\n\
+Add/update the given aux tag to all alignments\n\
+type A-char C-int f-float Z-string\n");
+}
+
+/// main_demo - copies infile to stdout, adding / updating the aux tag on alignments named QNAME
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *tag = NULL, *qname = NULL, *val = NULL;
+    char type = '\0';
+    int ret = EXIT_FAILURE, ret_r = 0, length = 0;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL, *outfile = NULL;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL;
+
+    //mod_aux infile QNAME tag type val
+    if (argc != 6) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    qname = argv[2];
+    tag = argv[3];
+    type = argv[4][0];
+    val = argv[5];
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //open output file
+    if (!(outfile = sam_open("-", "w"))) {
+        printf("Could not open std output\n");
+        goto end;
+    }
+
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (sam_hdr_write(outfile, in_samhdr) == -1) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        if (strcasecmp(bam_get_qname(bamdata), qname)) {
+            if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+                printf("Failed to write output\n");
+                goto end;
+            }
+            continue;   //not matching
+        }
+
+        errno = 0;
+        //matched to qname, update aux
+        if (!(data = bam_aux_get(bamdata, tag))) {
+            //tag not present append; val and type are not altered here as the
+            //command line values are needed again for the next matching alignment
+            const uint8_t *newdata = (const uint8_t*)val;
+            uint8_t intdata[4] = {0};
+            char newtype = type;
+            uint32_t u = 0;
+            int i = 0; float f = 0;
+            switch (type) {
+                case 'f':
+                case 'd':
+                    length = sizeof(float);
+                    f = atof(val);
+                    newdata = (const uint8_t*) &f;
+                    newtype = 'f';
+                    break;
+                case 'C':
+                case 'S':
+                case 'I':
+                    //stored little endian, with as many bytes as the type states
+                    length = type == 'C' ? 1 : type == 'S' ? 2 : 4;
+                    u = (uint32_t) atoi(val);
+                    for (i = 0; i < length; ++i) {
+                        intdata[i] = (u >> (8 * i)) & 0xFF;
+                    }
+                    newdata = intdata;
+                    break;
+                case 'Z':
+                    length = strlen(val) + 1;   //1 for NUL termination
+                    break;
+                case 'A':
+                    length = 1;
+                    break;
+                default:
+                    printf("Invalid type mentioned\n");
+                    goto end;
+                    break;
+            }
+            if (bam_aux_append(bamdata, tag, newtype, length, newdata)) {
+                printf("Failed to append aux data, errno: %d\n", errno);
+                goto end;
+            }
+        }
+        else {
+            char auxtype = bam_aux_type(data);
+            //update the tag with newer value
+            switch (type) {
+                case 'f':
+                case 'd':
+                    if (auxtype != 'f' && auxtype != 'd') {
+                        printf("Invalid aux type passed\n");
+                        goto end;
+                    }
+                    if (bam_aux_update_float(bamdata, tag, atof(val))) {
+                        printf("Failed to update float data, errno: %d\n", errno);
+                        goto end;
+                    }
+                    break;
+                case 'C':
+                case 'S':
+                case 'I':
+                    if (auxtype != 'c' && auxtype != 'C' && auxtype != 's' && auxtype != 'S' && auxtype != 'i' && auxtype != 'I') {
+                        printf("Invalid aux type passed\n");
+                        goto end;
+                    }
+                    if (bam_aux_update_int(bamdata, tag, atoll(val))) {
+                        printf("Failed to update int data, errno: %d\n", errno);
+                        goto end;
+                    }
+                    break;
+                case 'Z':
+                    if (auxtype != 'Z') {
+                        printf("Invalid aux type passed\n");
+                        goto end;
+                    }
+                    length = strlen(val) + 1;   //1 for NUL termination
+                    if (bam_aux_update_str(bamdata, tag, length, val)) {
+                        //with length as -1, length will be detected based on null terminated val data
+                        printf("Failed to update string data, errno: %d\n", errno);
+                        goto end;
+                    }
+                    break;
+                case 'A':
+                    if (auxtype != 'A') {
+                        printf("Invalid aux type passed\n");
+                        goto end;
+                    }
+                    //update the char data directly on buffer
+                    *(data+1) = val[0];
+                    break;
+                default:
+                    printf("Invalid data type\n");
+                    goto end;
+                    break;
+            }
+        }
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+            printf("Failed to write output\n");
+            goto end;
+        }
+    }
+    if (ret_r < -1) {
+        //read error
+        printf("Failed to read data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/mod_aux_ba.c b/samples/mod_aux_ba.c
new file mode 100644
index 000000000..8ef90ee1e
--- /dev/null
+++ b/samples/mod_aux_ba.c
@@ -0,0 +1,147 @@
+/* mod_aux_ba.c -- showcases the htslib api usage
+
+   Copyright (C) 2023 Genome Research Ltd.
+
+   Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */
+
+#include
+#include
+#include
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mod_aux_ba infile\n\
+Updates the count of bases as an aux array on all alignments\n\
+BA:B:I,count of ACGTN\n");
+}
+
+/// main_demo - copies infile to stdout, setting the BA tag to the per base counts of each alignment
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int i = 0, ret = EXIT_FAILURE, ret_r = 0;
+    uint32_t cnt[5] = {0};  //A C G T N
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL, *outfile = NULL;
+    bam1_t *bamdata = NULL;
+
+    //mod_aux_ba infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+
+    //open output file
+    if (!(outfile = sam_open("-", "w"))) {
+        printf("Could not open std output\n");
+        goto end;
+    }
+
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (sam_hdr_write(outfile, in_samhdr) == -1) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        errno = 0;
+        memset(cnt, 0, sizeof(cnt));
+        for (i = 0; i < bamdata->core.l_qseq; ++i) {
+            switch (seq_nt16_str[bam_seqi(bam_get_seq(bamdata),i)]) {
+                case 'A':
+                    ++cnt[0];
+                    break;
+                case 'C':
+                    ++cnt[1];
+                    break;
+                case 'G':
+                    ++cnt[2];
+                    break;
+                case 'T':
+                    ++cnt[3];
+                    break;
+                default:    //N
+                    ++cnt[4];
+                    break;
+            }
+        }
+
+        if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) {
+            printf("Failed to update base array, errno %d\n", errno);
+            goto end;
+        }
+
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+            printf("Failed to write output\n");
+            goto end;
+        }
+    }
+    if (ret_r < -1) {
+        //read error
+        printf("Failed to read data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/mod_bam.c b/samples/mod_bam.c
new file mode 100644
index 000000000..9f1eb324e
--- /dev/null
+++ b/samples/mod_bam.c
@@ -0,0 +1,229 @@
+/* mod_bam.c -- showcases the htslib api usage
+
+   Copyright (C) 2023 Genome Research Ltd.
+
+   Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */
+
+#include
+#include
+#include
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mod_bam infile QNAME fieldpos newval\n\
+Modifies the alignment data field\n\
+fieldpos - 1 QNAME 2 FLAG 3 RNAME 4 POS 5 MAPQ 6 CIGAR 7 RNEXT 8 PNEXT 9 TLEN 10 SEQ 11 QUAL\n");
+}
+
+/// main_demo - copies infile to stdout, replacing one field on alignments named QNAME
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *qname = NULL;
+    char *val = NULL;
+    int c = 0, ret = EXIT_FAILURE, field = 0, res = 0;  //res - per alignment status, ret - exit status
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL, *outfile = NULL;
+    int ret_r = 0, i = 0;
+    bam1_t *bamdata = NULL;
+
+    //mod_bam infile QNAME fieldpos newval
+    if (argc != 5) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    qname = argv[2];
+    //1 QNAME 2 FLAG 3 RNAME 4 POS 5 MAPQ 6 CIGAR 7 RNEXT 8 PNEXT 9 TLEN 10 SEQ 11 QUAL
+    field = atoi(argv[3]);
+    val = argv[4];
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r")) || !(outfile = sam_open("-", "w"))) {
+        printf("Could not open input/output\n");
+        goto end;
+    }
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (sam_hdr_write(outfile, in_samhdr) == -1) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
+    {
+        //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]…
+        res = 0;
+        if (!strcasecmp(qname, bam_get_qname(bamdata))) {
+            //the required one
+            switch(field) {
+                case 1:// QNAME
+                    res = bam_set_qname(bamdata, val);
+                    break;
+                case 2:// FLAG
+                    bamdata->core.flag = atol(val) & 0xFFFF;
+                    break;
+                case 3:// RNAME
+                case 7:// RNEXT
+                    if ((res = sam_hdr_name2tid(in_samhdr, val)) < 0) {
+                        printf("Invalid reference name\n");
+                        res = -1;
+                        break;
+                    }
+                    if (field == 3) {
+                        //reference
+                        bamdata->core.tid = res;
+                    }
+                    else {
+                        //mate reference
+                        bamdata->core.mtid = res;
+                    }
+                    break;
+                case 4:// POS
+                    bamdata->core.pos = atoll(val);
+                    break;
+                case 5:// MAPQ
+                    bamdata->core.qual = atoi(val) & 0x0FF;
+                    break;
+                case 6:// CIGAR
+                {
+                    uint32_t *cigar = NULL;
+                    size_t size = 0;
+                    ssize_t ncigar = 0;
+                    bam1_t *newbam = bam_init1();
+                    if (!newbam) {
+                        printf("Failed to create new bam data\n");
+                        res = -1;
+                        break;
+                    }
+                    //get cigar array and set all data in new bam record
+                    if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) {
+                        printf("Failed to parse cigar\n");
+                        free(cigar);
+                        bam_destroy1(newbam);
+                        res = -1;
+                        break;
+                    }
+                    if (bam_set1(newbam, bamdata->core.l_qname - bamdata->core.l_extranul - 1, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid, bamdata->core.pos, bamdata->core.qual,
+                        ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos, bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata), (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) {
+                        printf("Failed to set bamdata\n");
+                        free(cigar);
+                        bam_destroy1(newbam);
+                        res = -1;
+                        break;
+                    }
+                    //correct sequence data as input is expected in ascii format and not as compressed inside bam!
+                    memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2);
+                    //copy the aux data
+                    memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata));
+                    free(cigar);    //the new record holds its own copy
+                    bam_destroy1(bamdata);
+                    bamdata = newbam;
+                }
+                    break;
+                case 8:// PNEXT
+                    bamdata->core.mpos = atoll(val);
+                    break;
+                case 9:// TLEN
+                    bamdata->core.isize = atoll(val);
+                    break;
+                case 10:// SEQ
+                    i = strlen(val);
+                    if (bamdata->core.l_qseq != i) {
+                        printf("SEQ length different\n");
+                        res = -1;
+                        //as it is different, have to update quality data and cigar data as well and more info is required for it, which is not handled in this sample
+                        //accessing raw memory and moving is one option; creating and using new bam1_t object is another option.
+                        break;
+                    }
+                    for( c = 0; c < i; ++c) {
+                        bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]);
+                    }
+                    break;
+                case 11:// QUAL
+                    i = strlen(val);
+                    if (i != bamdata->core.l_qseq) {
+                        printf("Qual length different than sequence\n");
+                        res = -1;
+                        break;
+                    }
+                    for (c = 0; c < i; ++c) {
+                        bam_get_qual(bamdata)[c] = val[c] - 33; //phred score from ascii value, val is kept intact for the next match
+                    }
+                    break;
+                default:
+                    printf("Invalid input\n");
+                    goto end;
+            }
+            if (res < 0) {
+                printf("Failed to set new data\n");
+                goto end;
+            }
+        }
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+            printf("Failed to write bam data\n");
+            goto end;
+        }
+    }
+
+    if (ret_r == -1) {
+        // no error!
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Failed to read data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/modstate.c b/samples/modstate.c
new file mode 100644
index 000000000..976391684
--- /dev/null
+++ b/samples/modstate.c
@@ -0,0 +1,190 @@
+/* modstate.c -- showcases the htslib api usage
+
+   Copyright (C) 2023 Genome Research Ltd.
+
+   Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */
+
+#include
+#include
+#include
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: modstate infile option\n\
+Shows the base modifications on the alignment\n\
+Option can be 1 (bam_next_basemod) or 2 (bam_mods_at_next_pos) to select the api to use\n");
+}
+
+/// main_demo - dumps the base modifications of each alignment using the selected api
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int ret = EXIT_FAILURE;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL;
+
+    int ret_r = 0, i = 0 , r = 0, j = 0, pos = 0, opt = 0, k = 0, cnt = 0, *bm = NULL;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL;
+    hts_base_mod_state *ms = NULL;
+
+
+    //modstate infile 1/2, anything other than 1 or 2 as option is rejected
+    if (argc != 3 || (argv[2][0] != '1' && argv[2][0] != '2') || argv[2][1] != '\0') {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    opt = atoi(argv[2]) - 1;    //0 for option 1, 1 for option 2
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    if (!(ms = hts_base_mod_state_alloc())) {
+        printf("Failed to allocate state memory\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
+    {
+        i = 0;
+        data = bam_get_seq(bamdata);
+        if (bam_parse_basemod(bamdata, ms)) {
+            printf("Failed to parse the base mods\n");
+            goto end;
+        }
+        //dump the modifications
+        printf("Modifications:");
+        bm = bam_mods_recorded(ms, &cnt);
+        for (k = 0; k < cnt; ++k) {
+            printf("%c", bm[k]);
+        }
+        printf("\n");
+        hts_base_mod mod[5] = {0};  //for ATCGN
+        if (opt) {
+            //option 2, bam_mods_at_next_pos - checks the sequence position by position
+            for (; i < bamdata->core.l_qseq; ++i) {
+                if ((r = bam_mods_at_next_pos(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]))) <= -1) {
+                    printf("Failed to get modifications\n");
+                    goto end;
+                }
+                else if (r > (sizeof(mod) / sizeof(mod[0]))) {
+                    printf("More modifications than this app can handle, update the app\n");
+                    goto end;
+                }
+                else if (!r) {
+                    //no modification at this pos
+                    printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                }
+                //modifications
+                for (j = 0; j < r; ++j) {
+                    printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base);
+                }
+            }
+        }
+        else {
+            //option 1, bam_next_basemod - moves from one modified position to the next
+            while ((r = bam_next_basemod(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]), &pos)) >= 0) {
+                for (; i < bamdata->core.l_qseq && i < pos; ++i) {
+                    printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                }
+                //modifications, never more than what mod can hold
+                for (j = 0; j < r && j < (int)(sizeof(mod)/sizeof(mod[0])); ++j) {
+                    printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base);
+                }
+                if (i == pos)
+                    i++;    //skip the modification already displayed
+                if (!r) {
+                    for (; i < bamdata->core.l_qseq; ++i) {
+                        printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                    }
+                    break;
+                }
+            }
+            if (r <= -1) {
+                printf("Failed to get modifications\n");
+                goto end;
+            }
+        }
+        printf("\n");
+    }
+
+    if (ret_r == -1) {
+        //check last alignment's base modification
+        int strand = 0, impl = 0;
+        char canonical = 0, modification[] = "mhfcgebaon";  //possible modifications
+        printf("\n\nLast alignment has \n");
+        for (k = 0; k < sizeof(modification) - 1; ++k) {    //avoiding NUL termination
+            if (bam_mods_query_type(ms, modification[k], &strand, &impl, &canonical)) {
+                printf ("No modification of %c type\n", modification[k]);
+            }
+            else {
+                printf("%s strand has %c modified with %c, can %sassume unlisted as unmodified\n", strand?"-/bottom/reverse":"+/top/forward", canonical, modification[k], impl?"" : "not " );
+            }
+        }
+        // no error!
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Failed to read data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+
+    if (ms) {
+        hts_base_mod_state_free(ms);
+    }
+    return ret;
+}
diff --git a/samples/mpileup.c b/samples/mpileup.c
new file mode 100644
index 000000000..fe933748e
--- /dev/null
+++ b/samples/mpileup.c
@@ -0,0 +1,204 @@
+/* mpileup.c -- showcases the htslib api usage
+
+   Copyright (C) 2023 Genome Research Ltd.
+
+   Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */
+
+#include
+#include
+#include
+#include
+
+/// print_usage - show mpileup usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mpileup infile ...\n\
+Shows the mpileup api usage.\n");
+    return;
+}
+
+typedef struct plpconf {
+    char *inname;
+    samFile *infile;
+    sam_hdr_t *in_samhdr;
+} plpconf;
+
+/// @brief plpconstructor - per alignment constructor callback, nothing to set up in this sample
+/// @param data client data passed at pileup init
+/// @param b bam being loaded
+/// @param cd client data
+/// @return 0 always
+int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    return 0;
+}
+
+int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    return 0;
+}
+
+/// @brief bam_plp_auto_f reads alignment data for pileup operation
+/// @param data client callback data holding alignment file handle and its header
+/// @param b bamdata read
+/// @return same as sam_read1, -2 when there is no file to read from
+int readdata(void *data, bam1_t *b)
+{
+    plpconf *conf = (plpconf*)data;
+    if (!conf || !conf->infile) {
+        return -2;  //cant read data
+    }
+
+    //read alignment and send, with the header read from this file in main
+    return sam_read1(conf->infile, conf->in_samhdr, b);
+}
+
+/// main_demo - shows the pileup of all input files together, position by position
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    int ret = EXIT_FAILURE;
+    bam1_t *bamdata = NULL;
+    plpconf** conf = NULL;
+    bam_mplp_t mplpiter = NULL;
+    int tid = -1, input = 0, k = 0, dpt = 0, allocfail = 0, res = 0, *depth = NULL;
+    hts_pos_t refpos = -1;
+    const bam_pileup1_t **plp = NULL;
+
+    //infile ...
+    if (argc < 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    if ((conf = calloc(argc - 1, sizeof(plpconf*)))) {
+        for (input = 0; input < argc - 1; ++input) {
+            allocfail |= !(conf[input] = calloc(1, sizeof(plpconf)));
+        }
+    }
+    depth = calloc(argc - 1, sizeof(int));
+    plp = calloc(argc - 1, sizeof(bam_pileup1_t*));
+    if (!conf || allocfail || !depth || !plp) {
+        printf("Failed to allocate memory\n");
+        goto end;
+    }
+    for (input = 0; input < argc - 1; ++input) {
+        conf[input]->inname = argv[input+1];
+    }
+
+    //initialize
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input files
+    for(input = 0; input < argc - 1; ++input) {
+        if (!(conf[input]->infile = sam_open(conf[input]->inname, "r"))) {
+            printf("Could not open %s\n", conf[input]->inname);
+            goto end;
+        }
+        //read header
+        if (!(conf[input]->in_samhdr = sam_hdr_read(conf[input]->infile))) {
+            printf("Failed to read header from file!\n");
+            goto end;
+        }
+    }
+
+    if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) {
+        printf("Failed to initialize mpileup data\n");
+        goto end;
+    }
+
+    //set constructor destructor callbacks
+    bam_mplp_constructor(mplpiter, plpconstructor);
+    bam_mplp_destructor(mplpiter, plpdestructor);
+
+    while ((res = bam_mplp64_auto(mplpiter, &tid, &refpos, depth, plp)) > 0) {
+        printf("%d\t%"PRIhts_pos"\t", tid+1, refpos+1);
+
+        for (input = 0; input < argc - 1; ++input) {
+            for (dpt = 0; dpt  < depth[input]; ++dpt) {
+                if (plp[input][dpt].is_del || plp[input][dpt].is_refskip) {
+                    printf("*");
+                    continue;
+                }
+                //start and end are displayed in UPPER and rest on LOWER
+                printf("%c", plp[input][dpt].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), plp[input][dpt].qpos)]) :
+                    (plp[input][dpt].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), plp[input][dpt].qpos)]) : tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), plp[input][dpt].qpos)])));
+                if (plp[input][dpt].indel > 0) {
+                    //insertions, anyway not start or end
+                    printf("+%d", plp[input][dpt].indel);
+                    for (k = 0; k < plp[input][dpt].indel; ++k) {
+                        printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), plp[input][dpt].qpos + k + 1)]));
+                    }
+                }
+                else if (plp[input][dpt].indel < 0) {
+                    printf("%d", plp[input][dpt].indel);
+                    for (k = 0; k < -plp[input][dpt].indel; ++k) {
+                        printf("?");
+                    }
+                }
+            }
+            printf(" ");
+        }
+        printf("\n");
+        fflush(stdout);
+    }
+    //0 is end of data, negative is a failure
+    ret = res < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+end:
+    //clean up
+    if (mplpiter) {
+        bam_mplp_destroy(mplpiter);     //ahead of conf, which it holds as callback data
+    }
+    if (conf) {
+        for (input = 0; input < argc - 1; ++input) {
+            if (conf[input] && conf[input]->in_samhdr) {
+                sam_hdr_destroy(conf[input]->in_samhdr);
+            }
+            if (conf[input] && conf[input]->infile) {
+                sam_close(conf[input]->infile);
+            }
+            if (conf[input]) {
+                free(conf[input]);
+            }
+        }
+        free(conf);
+    }
+
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    if (depth) {
+        free(depth);
+    }
+    if (plp) {
+        free(plp);
+    }
+    return ret;
+}
diff --git a/samples/pileup.c b/samples/pileup.c
new file mode 100644
index 000000000..11e2fb02f
--- /dev/null
+++ b/samples/pileup.c
@@ -0,0 +1,183 @@
+/* pileup.c -- showcases the htslib api usage
+
+   Copyright (C) 2023 Genome Research Ltd.
+
+   Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */
+
+#include
+#include
+#include
+#include
+
+/// print_usage - show pileup usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: pileup infile\n\
+Shows the pileup api usage.\n");
+    return;
+}
+
+typedef struct plpconf {
+    char *inname;
+    samFile *infile;
+    sam_hdr_t *in_samhdr;
+} plpconf;
+
+/// @brief plpconstructor - per alignment constructor callback, nothing to set up in this sample
+/// @param data client data passed at pileup init
+/// @param b bam being loaded
+/// @param cd client data
+/// @return 0 always
+int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    /*plpconf *conf= (plpconf*)data;
+    can access the data passed to pileup init from data
+    can do any alignment specific allocation / data storage here in param cd
+    it can hold either a float, 64 bit int or a pointer
+    when using cd, initialize and use as it will be reused after destructor*/
+    return 0;
+}
+
+int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    /*plpconf *conf= (plpconf*)data;
+    can access the data passed to pileup init from data
+    deallocate any alignment specific allocation made in constructor and stored in param cd*/
+    return 0;
+}
+
+/// @brief bam_plp_auto_f reads alignment data for pileup operation
+/// @param data client callback data holding alignment file handle and its header
+/// @param b bamdata read
+/// @return same as sam_read1, -2 when there is no file to read from
+int readdata(void *data, bam1_t *b)
+{
+    plpconf *conf = (plpconf*)data;
+    if (!conf || !conf->infile) {
+        return -2;  //cant read data
+    }
+
+    //read alignment and send, with the header read from this file in main
+    return sam_read1(conf->infile, conf->in_samhdr, b);
+}
+
+/// main_demo - shows the pileup of the input file, position by position
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    int ret = EXIT_FAILURE;
+    bam1_t *bamdata = NULL;
+    plpconf conf = {0};
+    bam_plp_t plpiter = NULL;
+    int tid = -1, n = -1, j = 0, k = 0;
+    int refpos = -1;
+    const bam_pileup1_t *plp = NULL;
+
+    //infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    conf.inname = argv[1];
+
+    //initialize
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input files
+    if (!(conf.infile = sam_open(conf.inname, "r"))) {
+        printf("Could not open %s\n", conf.inname);
+        goto end;
+    }
+    //read header
+    if (!(conf.in_samhdr = sam_hdr_read(conf.infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (!(plpiter = bam_plp_init(readdata, &conf))) {
+        printf("Failed to initialize pileup data\n");
+        goto end;
+    }
+
+    //set constructor destructor callbacks
+    bam_plp_constructor(plpiter, plpconstructor);
+    bam_plp_destructor(plpiter, plpdestructor);
+
+    while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &n))) {
+        printf("%d\t%d\t", tid+1, refpos+1);
+
+        for (j = 0; j < n; ++j) {
+            //doesnt detect succeeding insertion and deletion together here, only insertion is identified
+            //deletion is detected in plp->is_del as and when pos reaches the position
+            //if detection ahead is required, use bam_plp_insertion here which gives deletion length along with insertion
+            if (plp[j].is_del || plp[j].is_refskip) {
+                printf("*");
+                continue;
+            }
+            //start and end are displayed in UPPER and rest on LOWER
+            printf("%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) :
+                (plp[j].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)])));
+            if (plp[j].indel > 0) {
+                //insertions, anyway not start or end
+                printf("+%d", plp[j].indel);
+                for (k = 0; k < plp[j].indel; ++k) {
+                    printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos + k + 1)]));
+                }
+            }
+            else if (plp[j].indel < 0) {
+                printf("%d", plp[j].indel);
+                for (k = 0; k < -plp[j].indel; ++k) {
+                    printf("?");
+                }
+            }
+            printf(" ");
+        }
+        printf("\n");
+        fflush(stdout);
+    }
+    //NULL with n as -1 is a failure, otherwise it is the end of data
+    ret = n < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+end:
+    //clean up
+    if (conf.in_samhdr) {
+        sam_hdr_destroy(conf.in_samhdr);
+    }
+    if (conf.infile) {
+        sam_close(conf.infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    if (plpiter) {
+        bam_plp_destroy(plpiter);
+    }
+    return ret;
+}
diff --git a/samples/pileup_mod.c b/samples/pileup_mod.c
new file mode 100644
index 000000000..24d6cf539
--- /dev/null
+++ b/samples/pileup_mod.c
@@ -0,0 +1,218 @@
+/* pileup_mod.c -- showcases the htslib api usage
+
+ Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include +#include + +/// print_usage - show pileup_mod usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: pileup_mod infile\n\ +Shows the pileup api usage with base modification.\n"); + return; +} + +typedef struct plpconf { + char *inname; + samFile *infile; + sam_hdr_t *in_samhdr; +} plpconf; + +/// @brief plpconstructor - pileup constructor callback, allocates and fills the base modification state kept in cd->p +/// @param data client data passed to bam_plp_init (unused here) 
+/// @param b bam being loaded +/// @param cd client data +/// @return 0 on success, negative on failure (the pileup api treats only negative values as an error) +int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + //plpconf *conf= (plpconf*)data; can use this to access anything required from the data in pileup init + + //when using cd, initialize and use as it will be reused after destructor + cd->p = hts_base_mod_state_alloc(); + if (!cd->p) { + printf("Failed to allocate base modification state\n"); + return -1; + } + + //parse the bam data and gather modification data from MM tags + return (-1 == bam_parse_basemod(b, (hts_base_mod_state*)cd->p)) ? -1 : 0; +} + +int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + if (cd->p) { + hts_base_mod_state_free((hts_base_mod_state *)cd->p); + cd->p = NULL; + } + return 0; +} + +/// @brief bam_plp_auto_f reads alignment data for pileup operation +/// @param data client callback data holding alignment file handle and its header +/// @param b bamdata read +/// @return same as sam_read1, -2 when the client data is unusable +int readdata(void *data, bam1_t *b) +{ + plpconf *conf = (plpconf*)data; + if (!conf || !conf->infile || !conf->in_samhdr) { + return -2; //cant read data + } + + //read alignment and send, using the header read in main rather than the file handle's internal field + return sam_read1(conf->infile, conf->in_samhdr, b); +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + int ret = EXIT_FAILURE; + bam1_t *bamdata = NULL; + plpconf conf = {0}; + bam_plp_t plpiter = NULL; + int tid = -1, depth = -1, j = 0, k = 0, inslen = 0, dellen = 0, modlen = 0; + #define NMODS 5 + hts_base_mod mods[NMODS] = {0}; //ACGT N + int refpos = -1; + const bam_pileup1_t *plp = NULL; + kstring_t insdata = KS_INITIALIZE; + + //infile + if (argc != 2) { + 
print_usage(stderr); + goto end; + } + conf.inname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files + if (!(conf.infile = 
sam_open(conf.inname, "r"))) { + printf("Could not open %s\n", conf.inname); + goto end; + } + //read header + if (!(conf.in_samhdr = sam_hdr_read(conf.infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (!(plpiter = bam_plp_init(readdata, &conf))) { + printf("Failed to initialize pileup data\n"); + goto end; + } + + //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &depth))) { + memset(&mods, 0, sizeof(mods)); + printf("%d\t%d\t", tid+1, refpos+1); + + for (j = 0; j < depth; ++j) { + dellen = 0; + + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + /*invoke bam_mods_at_qpos before bam_plp_insertion_mod that the base modification + is retrieved before change in pileup pos thr' plp_insertion_mod call*/ + if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) { + printf("Failed to get modifications\n"); + goto end; + } + + //use plp_insertion/_mod to get insertion and del at the same position + if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) { + printf("Failed to get insertion status\n"); + goto end; + } + + //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered + //base and modification + printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)])), + modlen > 0 ? mods[0].strand ? '-' : '+' : '\0', + modlen > 0 ? 
mods[0].modified_base : '\0'); + //insertion and deletions + if (plp[j].indel > 0) { + //insertion + /*insertion data from plp_insertion_mod, note this shows the quality value as well + which is different from base and modification above;the lower case display is not attempted either*/ + printf("+%d%s", plp[j].indel, insdata.s); + //handle deletion if any + if (dellen) { + printf("-%d", dellen); + for (k = 0; k < dellen; ++k) { + printf("?"); + } + } + } + else if (plp[j].indel < 0) { + //deletion + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + } + printf(" "); + } + printf("\n"); + fflush(stdout); + } + + ret = EXIT_SUCCESS; +end: + //clean up + if (conf.in_samhdr) { + sam_hdr_destroy(conf.in_samhdr); + } + if (conf.infile) { + sam_close(conf.infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (plpiter) { + bam_plp_destroy(plpiter); + } + ks_free(&insdata); + return ret; +} diff --git a/samples/read_aux.c b/samples/read_aux.c new file mode 100644 index 000000000..cbf972b98 --- /dev/null +++ b/samples/read_aux.c @@ -0,0 +1,207 @@ +/* read_aux.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_aux infile tag\n\ +Read the given aux tag from alignments either as SAM string or as raw data\n"); +} + +/// printauxdata - prints aux data +/** @param fp - file to which it to be printed - stdout or null + * @param type - aux type + * @param idx - index in array, -1 when not an array type + * @param data - data + * recurses when the data is array type +returns 1 on failure 0 on success +*/ +int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data) +{ + uint32_t auxBcnt = 0; + uint32_t i = 0; //same type as the array length it is compared with + char auxBType = 'Z'; + + //the tag is already queried and ensured to exist and the type is retrieved from the tag data, also iterated within index for arrays, so no error is expected here. + //when these apis are used explicitly, these error conditions needs to be handled based on return value and errno + switch(type) { + case 'A': + fprintf(fp, "%c", bam_aux2A(data)); //byte data + break; + case 'c': + fprintf(fp, "%d", (int8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 1 byte data; bam_auxB2i - from array or bam_aux2i - non array data + break; + case 'C': + fprintf(fp, "%u", (uint8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 1 byte data + break; + case 's': + fprintf(fp, "%d", (int16_t)(idx > -1 ? 
bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 2 byte data + break; + case 'S': + fprintf(fp, "%u", (uint16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 2 byte data + break; + case 'i': + fprintf(fp, "%d", (int32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 4 byte data + break; + case 'I': + fprintf(fp, "%u", (uint32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 4 byte data + break; + case 'f': + case 'd': + fprintf(fp, "%g", (idx > -1 ? bam_auxB2f(data, idx) : bam_aux2f(data))); //floating point data, printed as double so that 'd' values are not narrowed to float + break; + case 'H': + case 'Z': + fprintf(fp, "%s", bam_aux2Z(data)); //array of char or hex data + break; + case 'B': //array of char/int/float + auxBcnt = bam_auxB_len(data); //length of array + auxBType = bam_aux_type(data + 1); //type of element in array + fprintf(fp, "%c", auxBType); + for (i = 0; i < auxBcnt; ++i) { //iterate the array + fprintf(fp, ","); + //calling recursively with index to reuse a few lines + if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) { + return EXIT_FAILURE; + } + } + break; + default: + printf("Invalid aux tag?\n"); + return EXIT_FAILURE; + break; + } + return EXIT_SUCCESS; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tag = NULL; + int c = 0, ret = EXIT_FAILURE, ret_r = 0, i = 0; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL; + bam1_t *bamdata = NULL; + uint8_t *data = NULL; + kstring_t sdata = KS_INITIALIZE; + + //read_aux infile tag + if (argc != 3) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + tag = argv[2]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could 
not open %s\n", inname); + goto end; + } + 
+ if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + errno = 0; i++; + ks_clear(&sdata); + if (i % 2) { //use options alternatively to demonstrate both + //option 1 - get data as string with tag and type + if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) { + printf("%s\n",sdata.s); + } + else if (c == 0 && errno == ENOENT) { + //tag not present + printf("Tag not present\n"); + } + else { + //error + printf("Failed to get tag\n"); + goto end; + } + } + else { + //option 2 - get raw data + if (!(data = bam_aux_get(bamdata, tag))) { + //tag data not returned, errno gives the reason + if (errno == ENOENT) { + printf("Tag not present\n"); + } + else { + printf("Invalid aux data\n"); + } + } + else { + //got the tag, read and print + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + printf("Failed to read aux data\n"); + goto end; + } + printf("\n"); + } + } + } + if (ret_r < -1) { + //read error + printf("Failed to read data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + ks_free(&sdata); + return ret; +} diff --git a/samples/read_bam.c b/samples/read_bam.c new file mode 100644 index 000000000..7fca8c55d --- /dev/null +++ b/samples/read_bam.c @@ -0,0 +1,139 @@ +/* read_bam.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_bam infile\n\ +Shows the alignment data from file\n"); +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tidname = NULL, *flags = NULL; + int ret = EXIT_FAILURE; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL; + + int ret_r = 0, i = 0; + bam1_t *bamdata = NULL; + uint8_t *data = NULL; + uint32_t *cigar = NULL; + + + //read_bam infile + if (argc != 2) { + print_usage(stderr); + 
goto end; + } + inname = argv[1]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]… + printf("NAME: %s\n", bam_get_qname(bamdata)); //get the query name using the macro + flags = bam_flag2str(bamdata->core.flag); //flags as string, allocated by the library and NULL on failure + printf("FLG: %d - %s\n", bamdata->core.flag, flags ? flags : ""); //flag is available in core structure; never pass NULL to %s + free((void*)flags); + tidname = sam_hdr_tid2name(in_samhdr, bamdata->core.tid); + printf("RNAME/TID: %d - %s\n", bamdata->core.tid, tidname? tidname: "" ); //retrieves the target name using the value in bam and by referring the header + printf("POS: %"PRIhts_pos"\n", bamdata->core.pos + 1); //internally position is 0 based and on text output / SAM it is 1 based + printf("MQUAL: %d\n", bamdata->core.qual); //map quality value + + cigar = bam_get_cigar(bamdata); //retrieves the cigar data + printf("CGR: "); + for (i = 0; i < bamdata->core.n_cigar; ++i) { //no. 
of cigar data entries + printf("%u%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i])); //the macros gives the count of operation (unsigned) and the symbol of operation for given cigar entry + } + printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize); + + data = bam_get_seq(bamdata); //get the sequence data + if (bamdata->core.l_qseq != bam_cigar2qlen(bamdata->core.n_cigar, cigar)) { //checks the length with CIGAR and query + printf("\nLength doesnt matches to cigar data\n"); + goto end; + } + + printf("SEQ: "); + for (i = 0; i < bamdata->core.l_qseq ; ++i) { //sequence length + printf("%c", seq_nt16_str[bam_seqi(data, i)]); //retrieves the base from (internal compressed) sequence data + } + printf("\nQUAL: "); + for (i = 0; i < bamdata->core.l_qseq ; ++i) { + printf("%c", bam_get_qual(bamdata)[i]+33); //retrieves the quality value, +33 gives the printable form + } + printf("\n\n"); + } + + if (ret_r == -1) { + // no error! + ret = EXIT_SUCCESS; + } + else { + printf("Failed to read data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/read_fast.c b/samples/read_fast.c new file mode 100644 index 000000000..f74b25515 --- /dev/null +++ b/samples/read_fast.c @@ -0,0 +1,116 @@ +/* read_fast.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show read_fast usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_fast \n\ +Reads the fasta/fastq file and shows the content.\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = 
argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + if (infile->format.format != fasta_format && infile->format.format != fastq_format) { + printf("Invalid file specified\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf( "Failed to read header from file\n"); + goto end; + } + + //read data; c holds the read status in the loop condition and is reused as the index inside the body + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + printf("\nsequence: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); + } + if (infile->format.format == fastq_format) { + printf("\nquality: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", bam_get_qual(bamdata)[c] + 33); //stored as raw phred value, +33 gives the printable fastq character + } + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/read_header.c b/samples/read_header.c new file mode 100644 index 000000000..eb14daea5 --- /dev/null +++ b/samples/read_header.c @@ -0,0 +1,173 @@ +/* read_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_header infile header [id val] [tag]\n\ +This shows given tag from given header or the whole line\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *header = NULL, *tag = NULL, *idval = NULL; + char *id = NULL; + int c = 0, ret = EXIT_FAILURE, linecnt = 0; + samFile *infile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + + //read_header infile 
header tag + if (argc < 3 || argc > 6) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + if (argc == 4) { //header and tag + tag = argv[3]; + //find unique identifier field name for requested header type + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + id = ""; + } + else { + printf("Invalid header type\n"); + goto end; + } + } + else if (argc == 5) { //header id val + id = argv[3]; + idval = argv[4]; + } + else if (argc == 6) { //header id val tag + id = argv[3]; + idval = argv[4]; + tag = argv[5]; + } + + //open input files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (id && idval) { + if (tag) { + ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + } + else { + ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); + } + + if (ret == 0) { + printf("%s\n", data.s); + } + else if (ret == -1) { + printf("No matching tag found\n"); + goto end; + } + else { + printf("Failed to find header line\n"); + goto end; + } + } + else { + //get count of given header type + linecnt = sam_hdr_count_lines(in_samhdr, header); + if (linecnt <= 0) { //0 - no such line, negative - the count itself failed; neither is a success + printf("No matching line found\n"); + goto end; + } + for (c = 0; c < linecnt; ++c ) { + if (tag) { + //non CO, get the tag requested + ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); + } + else { + //CO header, there are no tags but the whole line + ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); + } + + if (ret == 0) { + 
printf("%s\n", data.s); + continue; + } + else if (ret == -1) { + printf("Tag 
not present\n"); + continue; + } + else { + printf("Failed to get tag\n"); + goto end; + } + } + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + ks_free(&data); + return ret; +} diff --git a/samples/read_refname.c b/samples/read_refname.c new file mode 100644 index 000000000..adbc71183 --- /dev/null +++ b/samples/read_refname.c @@ -0,0 +1,125 @@ +/* read_refname.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_refname infile minsize\n\ +This shows name of references which has length above the given size\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *id = NULL; + int c = 0, ret = EXIT_FAILURE, res = 0, linecnt = 0, pos = 0; //res holds api results so that ret stays a failure until the very end + samFile *infile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + int64_t minsize = 0, size = 0; + + if (argc != 3 && argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + if (argc == 3) { + minsize = atoll(argv[2]); + } + + //open input files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count + if (linecnt <= 0) { + if (!linecnt) { + printf("No reference line present\n"); + } + else { + printf("Failed to get reference line count\n"); + } + goto end; + } + //iterate and check each reference's length + for (pos = 1, c = 0; c < linecnt; ++c) { + if ((res 
= sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data)) == -2) { //assign the api result first, then compare it + printf("Failed to get length\n"); + goto 
end; + } + else if (res == -1) { + //length not present, ignore + continue; + } + //else have length + size = atoll(data.s); + if (size < minsize) { + //not required + continue; + } + if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) { //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! + printf("Failed to get id for reference data\n"); + goto end; + } + printf("%d,%s,%s\n", pos, id, data.s); + pos++; + } + + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + ks_free(&data); + return ret; +} diff --git a/samples/rem_header.c b/samples/rem_header.c new file mode 100644 index 000000000..a0b6510fb --- /dev/null +++ b/samples/rem_header.c @@ -0,0 +1,138 @@ +/* rem_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: rem_header infile header [id]\n\ +Removes header line of given type and id\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *header = NULL, *idval = NULL; + char *id = NULL; + int ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + + //rem_header infile header [id] + if (argc <3 || argc > 4) { + //3 & 4 are ok, 3-> all of given header type, 4->given id of given header type to be removed + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + if (argc == 4) { + idval = argv[3]; + } + + //unique identifier for each of the header types + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + //CO field can be removed using the position of it using sam_hdr_remove_line_pos + id = ""; + } + else { + printf("Invalid header type\n"); + goto end; + } + + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open 
%s\n", inname); 
+ goto end; + } + if (!(outfile = sam_open("-", "w"))) { //use stdout as the output file for ease of display of update + printf("Could not open stdout\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + if (idval) { + //remove specific line + if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) { + printf("Failed to remove header line\n"); + goto end; + } + } + else { + //remove multiple lines of a header type + if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) { + printf("Failed to remove header line\n"); + goto end; + } + } + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + printf("Failed to write output\n"); + goto end; + } + ret = EXIT_SUCCESS; + //bam data write to follow.... +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile && sam_close(outfile) < 0) { + ret = EXIT_FAILURE; //closing flushes the written header, a failure here means the output is incomplete + } + return ret; +} diff --git a/samples/sample.ref.fa b/samples/sample.ref.fa new file mode 100644 index 000000000..5789e8c42 --- /dev/null +++ b/samples/sample.ref.fa @@ -0,0 +1,4 @@ +>T1 T1:1-40 +AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT +>T2 T2:1:40 +TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT diff --git a/samples/sample.sam b/samples/sample.sam new file mode 100644 index 000000000..e56efd69f --- /dev/null +++ b/samples/sample.sam @@ -0,0 +1,29 @@ +@HD VN:1.17 SO:unknown +@SQ SN:T1 LN:40 +@SQ SN:T2 LN:40 +@CO @SQ SN* LN* AH AN AS DS M5 SP TP UR +@CO @RG ID* BC CN DS DT FO KS LB PG PI PL PM PU SM +@CO @PG ID* PN CL PP DS VN +@CO this is a dummy alignment file to demonstrate different abilities of hts apis +@CO QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]… +@CO 1234567890123456789012345678901234567890 +@CO AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT T1 +@CO 
TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT T2 +@CO ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is 
unmapped, UNMP2 & 3 are unmappped +@CO A1-A2, A4-A3 are proper pairs with A4-A3 in different read order. A5 is secondary alignment +ITR1 99 T1 5 40 4M = 33 10 ACTG ()() +ITR2 147 T2 23 49 2M = 35 -10 TT ** +ITR2M 99 T2 35 51 2M = 23 10 AA && +ITR1M 147 T1 33 37 4M = 5 -10 ACTG $$$$ +UNMP1 73 T1 21 40 3M * 0 5 GGG &&1 +UNMP2 141 * 0 0 * * 0 7 AA && +UNMP3 77 * 0 0 * * 0 5 GGG &&2 +A1 99 T1 25 35 6M = 31 8 ACTGTT ****** +A2 147 T1 31 33 6M = 25 -8 ACTGTT ()()() +A3 147 T2 23 47 2M1X = 12 -5 TTG ((( +A4 99 T2 12 50 3M = 23 5 GAA ()( +A5 355 T1 25 35 4M = 33 5 ACTG PPPP +B1 99 T1 25 35 6M = 31 8 GCTATT ****** +B3 147 T2 23 47 2M1X = 12 -5 TAG ((( +B4 99 T2 12 50 3M = 23 5 GAT ()( +B5 355 T1 25 35 4M = 33 5 AGTG PPPP diff --git a/samples/split.c b/samples/split.c new file mode 100644 index 000000000..2eb9e6b79 --- /dev/null +++ b/samples/split.c @@ -0,0 +1,153 @@ +/* split.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows the basic writing of output\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam", outdir); //for SAM output + snprintf(file2, size, "%s/2.bam", outdir); //for BAM output + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); +
goto end; + } + //open output files - w write as SAM, wb write as BAM + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if (bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + return ret; +} diff --git a/samples/split2.c b/samples/split2.c new file mode 100644 index 000000000..2354abfe3 --- /dev/null +++ b/samples/split2.c @@ -0,0 +1,158 @@ +/* split2.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows file type selection through name and format api\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL, mode1[5] = "w", mode2[5] = "w"; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile =
NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam.gz") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam.gz", outdir); //name of Read1 file + snprintf(file2, size, "%s/2.sam", outdir); //name of Read2 file + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //set file open mode based on file name for 1st and as explicit for 2nd + if ((sam_open_mode(mode1+1, file1, NULL) == -1) || (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) { + printf("Failed to set open mode\n"); + goto end; + } + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files + outfile1 = sam_open(file1, mode1); //as compressed SAM through sam_open + outfile2 = sam_open_format(file2, mode2, NULL); //as compressed SAM through sam_open_format + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if 
(bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + return ret; +} diff --git a/samples/split_thread1.c b/samples/split_thread1.c new file mode 100644 index 000000000..40d2dfdc2 --- /dev/null +++ b/samples/split_thread1.c @@ -0,0 +1,161 @@ +/* split_thread1.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split_t1 infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows the usage of basic thread in htslib\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam", outdir); //for SAM output + snprintf(file2, size, "%s/2.bam", outdir); //for BAM output + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n",
inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //create file specific threads + if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading + hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for sam write + hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) { //1 thread specific for bam write + printf("Failed to set thread options\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if (bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + return ret; +} diff --git a/samples/split_thread2.c b/samples/split_thread2.c new file mode 100644 index 000000000..dab897b5f --- /dev/null +++ b/samples/split_thread2.c @@ -0,0 +1,171 @@ +/* 
split_thread2.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */ + +#include +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split_t2 infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows the usage of thread pool\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL; + int c =
0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + htsThreadPool tpool = {NULL, 0}; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam", outdir); //for SAM output + snprintf(file2, size, "%s/2.bam", outdir); //for BAM output + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //create a pool of 4 threads + if (!(tpool.pool = hts_tpool_init(4))) { + printf("Failed to initialize the thread pool\n"); + goto end; + } + //share the pool with all the 3 files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) { + printf("Failed to set thread options\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + 
while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if (bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + if (tpool.pool) { + hts_tpool_destroy(tpool.pool); + } + return ret; +} diff --git a/samples/update_header.c b/samples/update_header.c new file mode 100644 index 000000000..f6b1680cd --- /dev/null +++ b/samples/update_header.c @@ -0,0 +1,131 @@ +/* update_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: update_header infile header idval tag value\n\ +Updates the tag's value on line given in id on header of given type\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tag = NULL, *idval = NULL, *val = NULL, *header = NULL; + char *id = NULL; + int ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + + //update_header infile header idval tag value + if (argc != 6) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + idval = argv[3]; + tag = argv[4]; + val = argv[5]; + + //unique identifier for each of the header types + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + printf("This sample doesnt not support modifying HD fields\n"); + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + tag = NULL; + id = ""; + printf("This sample doesnt not support modifying CO fields\n"); + } + else { + printf("Invalid header type\n"); + goto end; + } + + if (!(infile = sam_open(inname, "r"))) { +
printf("Could not open %s\n", inname); + goto end; + } + if (!(outfile = sam_open("-", "w"))) { //use stdout as the output file for ease of display of update + printf("Could not open stdout\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + //update with new data + if (sam_hdr_update_line(in_samhdr, header, id, idval, tag, val, NULL) < 0) { + printf("Failed to update data\n"); + goto end; + } + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + printf("Failed to write output\n"); + goto end; + } + ret = EXIT_SUCCESS; + //bam data write to follow.... +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + return ret; +} diff --git a/samples/write_fast.c b/samples/write_fast.c new file mode 100644 index 000000000..ef7817683 --- /dev/null +++ b/samples/write_fast.c @@ -0,0 +1,101 @@ +/* write_fast.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: write_fast \n\ +Appends a fasta/fastq file.\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *outname = NULL; //output file name + int ret = EXIT_FAILURE; + samFile *outfile = NULL; //sam file + sam_hdr_t *out_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + char mode[4] = "a"; + + if (argc != 2) { + print_usage(stdout); + goto end; + } + outname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + if (sam_open_mode(mode + 1, outname, NULL) < 0) { + printf("Invalid file name\n"); + goto end; + } + //open output file + if (!(outfile = sam_open(outname, mode))) { + printf("Could not open %s\n", outname); + goto end; + } + //dummy data + if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) < 0) { + printf("Failed to set data\n"); + goto end; + } + if (sam_write1(outfile, out_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //clean up + if (out_samhdr) { + sam_hdr_destroy(out_samhdr); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { +
bam_destroy1(bamdata); + } + return ret; +} From 62909e2a9f1b9e3ad9a7ba56b073242ef8f8ab8e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 3 Jul 2023 15:34:48 +0100 Subject: [PATCH 69/70] NEWS updates for pending release Co-authored-by: Rob Davies --- NEWS | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/NEWS b/NEWS index d2c168ee7..10fdc1b5a 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,170 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Updates +------- + +* Using CRAM 3.1 no longer gives a warning about the specification + being draft. Note CRAM 3.0 is still the default output format. + (PR#1583) + +* Replaced use of sprintf with snprintf, to silence potential warnings + from Apple's compilers and those who implement similar checks. + (PR#1594, fixes #1586. Reported by Oleksii Nikolaienko) + +* Fastq output will now generate empty records for reads with no + sequence data (i.e. sequence is "*" in SAM format). (PR#1576, + fixes samtools/samtools#1576. Reported by Nils Homer) + +* CRAM decoding speed-ups. (PR#1580) + +* A new MN aux tag can now be used to verify that MM/ML base modification + data has not been broken by hard clipping. (PR#1590, PR#1612. See also + PR samtools/hts-specs#714 and issue samtools/hts-specs#646. + Reported by Jared Simpson) + +* The base modification API has been improved to make it easier for callers + to tell unchecked bases from unmodified ones. (PR#1636, fixes #1550. + Requested by Chris Wright) + +* A new bam_mods_queryi() API has been added to return additional + data about the i-th base modification returned by bam_mods_recorded(). + (PR#1636, fixes #1550 and #1635. Requested by Jared Simpson) + +* Speed up index look-ups for whole-chromosome queries. (PR#1596) + +* Mpileup now merges adjacent (mis)match CIGAR operations, so CIGARs + using the X/= operators give the same results as if the M operator + was used. (PR#1607, fixes #1597. 
Reported by Marcel Martin) + +* It's now possible to call bcf_sr_set_regions() after adding readers + using bcf_sr_add_reader() (previously this returned an error). Doing so + will discard any unread data, and reset the readers so they iterate over + the new regions. (PR#1624, fixes samtools/bcftools#1918. Reported by + Gregg Thomas) + +* The synced BCF reader can now accept regions with reference names including + colons and hyphens, by enclosing them in curly braces. For example, + {chr_part:1-1001}:10-20 will return bases 10 to 20 from reference + "chr_part:1-1001". (PR#1630, fixes #1620. Reported by Bren) + +* Add a "samples" directory with code demonstrating usage of HTSlib plus + a tutorial document. (PR#1589) + +Build changes +------------- + +* Htscodecs has been updated to 1.5.1 (PR#1654) + +* Htscodecs SIMD code now works with Apple multiarch binaries. + (PR#1587, HTSlib fix for samtools/htscodecs#76. Reported by John Marshall) + +* Improve portability of "expr" usage in version.sh. + (PR#1593, fixes #1592. Reported by John Marshall) + +* Improve portability to *BSD targets by ensuring _XOPEN_SOURCE is defined + correctly and that source files properly include "config.h". Perl + scripts also now all use #!/usr/bin/env instead of assuming that + it's in /usr/bin/perl. (PR#1628, fixes #1606. + Reported by Robert Clausecker) + +* Fixed NAME entry in htslib-s3-plugin man page so the whatis and apropos + commands find it. (PR#1634, thanks to Étienne Mollier) + +* Assorted dependency tracking fixes. (PR#1653, thanks to John Marshall) + +Documentation updates +--------------------- + +* Changed Alpine build instructions as they've switched back to using openssl. + (PR#1609) + +* Recommend using -rdynamic when statically linking a libhts.a with + plugins enabled. (PR#1611, thanks to John Marshall. Fixes #1600, + reported by Jack Wimberley) + +* Fixed example in docs for sam_hdr_add_line(). 
(PR#1618, thanks to kojix2) + +* Improved test harness for base modifications API. (PR#1648) + +Bug fixes +--------- + +* Fix a major bug when searching against a CRAM index where one container + has start and end coordinates entirely contained within the previous + container. This would occasionally miss data, and sometimes return much + more than required. The bug affected versions 1.11 to 1.17, although the + change in 1.11 was bug-fixing multi-threaded index queries. This bug did + not affect index building. There is no need to reindex your CRAM files. + (PR#1574, PR#1640. Fixes #1569, #1639, samtools/samtools#1808, + samtools/samtools#1819. Reported by xuxif, Jens Reeder and Jared Simpson) + +* Prevent CRAM blocks from becoming too big in files with short + sequences but very long aux tags. (PR #1613) + +* Fix bug where the CRAM decoder for CONST_INT and CONST_BYTE + codecs may incorrectly look for extra data in the CORE block. + Note that this bug only affected the experimental CRAM v4.0 decoder. + (PR#1614) + +* Fix crypt4gh redirection so it works in conjunction with non-file + IO, such as using htsget. (PR#1577) + +* Improve error checking for the VCF POS column, when facing invalid + data. (PR#1575, replaces #1570 originally reported and fixed + by Colin Nolan.) + +* Improved error checking on VCF indexing to validate the data is BGZF + compressed. (PR#1581) + +* Fix bug where bin number calculation could overflow when making iterators + over regions that go to the end of a chromosome. (PR#1595) + +* Backport attractivechaos/klib#78 (by Pall Melsted) to HTSlib. + Prevents infinite loops in kseq_read() when reading broken gzip files. + (PR#1582, fixes #1579. Reported by Goran Vinterhalter) + +* Backport attractivechaos/klib@384277a (by innoink) to HTSlib. + Fixes the kh_int_hash_func2() macro definition. + (PR#1599, fixes #1598. Reported by fanxinping) + +* Remove a compilation warning on systems with newer libcurl releases. 
+ (PR#1572) + +* Windows: Fixed BGZF EOF check for recent MinGW releases. (PR#1601, + fixes samtools/bcftools#1901) + +* Fixed bug where tabix would not return the correct regions for files + where the column ordering is end, ..., begin instead of begin, ..., end. + (PR#1626, fixes #1622. Reported by Hiruna Samarakoon) + +* sam_format_aux1() now always NUL-terminates Z/H tags. (PR#1631) + +* Ensure base modification iterator is reset when no MM tag is present. + (PR#1631, PR#1647) + +* Fix segfault when attempting to write an uncompressed BAM file opened using + hts_open(name, "wbu"). This was attempting to write BAM data without + wrapping it in BGZF blocks, which is invalid according to the BAM + specification. "wbu" is now internally converted to "wb0" to output + uncompressed data wrapped in BGZF blocks. (PR#1632, fixes #1617. + Reported by Joyjit Daw) + +* Fixed over-strict bounds check in probaln_glocal() which caused it to make + sub-optimal alignments when the requested band width was greater than the + query length. (PR#1616, fixes #1605. Reported by Jared Simpson) + +* Fixed possible double frees when handling errors in bcf_hdr_add_hrec(), + if particular memory allocations fail. (PR#1637) + +* Ensure that bcf_hdr_remove() clears up all pointers to the items removed + from dictionaries. Failing to do this could have resulted in a call + requesting a deleted item via bcf_hdr_get_hrec() returning a stale pointer. + (PR#1637) + +* Stop the gzip decompressor from finishing prematurely when an empty + gzip block is followed by more data. 
(PR#1643, PR#1646) + Noteworthy changes in release 1.17 (21st February 2023) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 2425ce962eed21f216d87470cef26c59805fc5c4 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Fri, 3 Mar 2023 11:55:32 +0000 Subject: [PATCH 70/70] formatting update --- samples/DEMO.md | 5 +++++ samples/README.md | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/samples/DEMO.md b/samples/DEMO.md index 1f14b7485..911792899 100644 --- a/samples/DEMO.md +++ b/samples/DEMO.md @@ -311,6 +311,7 @@ reference in it or with no reference data at all. It can also be created using an autogenerated reference, based on consensus with-in the alignment data. The reference detail can be set to an htsFormat structure using hts_parse_format api and used with sam_open_format api to create appropriate CRAM file. + ... snprintf(reffmt1, size1, "cram,reference=%s", reffile); snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile); @@ -410,6 +411,7 @@ Shows the 2nd SQ line's LN field value. ./read_header /tmp/sample.sam.gz SQ SN T2 LN Below code excerpt shows the reference names which has length above given value. + ... linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count ... @@ -784,6 +786,7 @@ At the end of write, sam_idx_save api need to be invoked to save the index. if (sam_write1(outfile, in_samhdr, bamdata) < 0) { ... if (sam_idx_save(outfile)) { + ... Refer:index_write.c Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/. @@ -1314,6 +1317,7 @@ HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments to api. The thread pool has to be released with hts_tpool_destroy. Below excerpt shows file specific thread pool, + ... //create file specific threads if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading @@ -1325,6 +1329,7 @@ Below excerpt shows file specific thread pool, Refer: split_thread1.c Below excerpt shows thread pool shared across files, + ... 
//create a pool of 4 threads if (!(tpool.pool = hts_tpool_init(4))) { diff --git a/samples/README.md b/samples/README.md index 0438e1fc4..ab5481dea 100644 --- a/samples/README.md +++ b/samples/README.md @@ -1,5 +1,3 @@ -[![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib/samples) - HTSlib is an implementation of a unified C library for accessing common file formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing data, and is the core library used by [samtools][2] and [bcftools][3].