From 3c1ea37046e701afa1dffd07ea72e10b9b9eb8e2 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 24 Sep 2020 15:58:49 +1000 Subject: [PATCH 001/114] Update hts.c Include unistd to guarantee access to R_OK definition --- hts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hts.c b/hts.c index 4552b5a80..d0ca8a5f1 100644 --- a/hts.c +++ b/hts.c @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include From 450c91271ecf688150c4f35d75c4a8d0381c848d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 25 Sep 2020 17:16:43 +0100 Subject: [PATCH 002/114] Include and directly where needed Similarly to the previous commit, sam.c needs explicitly for environments in which it doesn't get it via or similar. Both htslib/sam.h and test/test-vcf-api.c use errno but get it only semi-accidentally via htslib/kstring.h. Include explicitly, and similarly for strerror(). Various test/*.c programs that use getopt() need include only rather than , which is needed only for getopt_long(). --- htslib/sam.h | 1 + sam.c | 1 + test/plugins-dlhts.c | 2 +- test/test-vcf-api.c | 2 ++ test/test_index.c | 2 +- test/test_kstring.c | 2 +- test/test_realn.c | 2 +- test/test_str2int.c | 2 +- test/test_view.c | 1 - 9 files changed, 9 insertions(+), 6 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index f3b684291..9e595ae12 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTSLIB_SAM_H #define HTSLIB_SAM_H +#include #include #include "hts.h" #include "hts_endian.h" diff --git a/sam.c b/sam.c index 02cd6092e..7a58e361e 100644 --- a/sam.c +++ b/sam.c @@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE.
*/ #include #include #include +#include // Suppress deprecation message for cigar_tab, which we initialise #include "htslib/hts_defs.h" diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c index 0e4638d1f..aa98ef3f5 100644 --- a/test/plugins-dlhts.c +++ b/test/plugins-dlhts.c @@ -37,9 +37,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include #include #include +#include #ifndef EPROTONOSUPPORT #define EPROTONOSUPPORT ENOSYS diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 65912e3eb..87bce4aab 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -24,7 +24,9 @@ DEALINGS IN THE SOFTWARE. */ #include +#include #include +#include #include "../htslib/hts.h" #include "../htslib/vcf.h" diff --git a/test/test_index.c b/test/test_index.c index 0740427ab..402879666 100644 --- a/test/test_index.c +++ b/test/test_index.c @@ -24,7 +24,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include +#include #include "../htslib/sam.h" #include "../htslib/vcf.h" diff --git a/test/test_kstring.c b/test/test_kstring.c index 5923ba2a7..8dcce6b5b 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include +#include #include "../htslib/kstring.h" diff --git a/test/test_realn.c b/test/test_realn.c index b96ef9d59..3f511704b 100644 --- a/test/test_realn.c +++ b/test/test_realn.c @@ -28,8 +28,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include +#include #include "../htslib/sam.h" #include "../htslib/hts.h" diff --git a/test/test_str2int.c b/test/test_str2int.c index 6345fdc4c..70e799169 100644 --- a/test/test_str2int.c +++ b/test/test_str2int.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include +#include #include "../textutils_internal.h" diff --git a/test/test_view.c b/test/test_view.c index 604ec42c6..30e604610 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -29,7 +29,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include -#include #include #include "../cram/cram.h" From 30d82d6bdd6ab2bfc00d81d325a678dd3b3b5e48 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 30 Sep 2020 09:37:45 +0100 Subject: [PATCH 003/114] Variable name used in dlsym corrected. The hfile irods plugin depends on a constructed symbol name which was no longer being used. --- plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin.c b/plugin.c index dbbe03eab..d5c1981ca 100644 --- a/plugin.c +++ b/plugin.c @@ -153,7 +153,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * const char *basename = slash? slash+1 : filename; kputsn(basename, strcspn(basename, ".-+"), &symbolg); - *(void **) &sym = dlsym(lib, symbol); + *(void **) &sym = dlsym(lib, symbolg.s); free(symbolg.s); if (sym == NULL) goto error; } From 14a83590dd5944ba6b5b5f7cbd85237623bd40a1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 6 Oct 2020 10:13:30 +0100 Subject: [PATCH 004/114] Replace the indexing range check error message. (#1151) This code detects when the end position is too large for the current index. It then computes an adjusted n_lvls parameter and tells the user to switch to CSI with this parameter. That has numerous failings though. 1. The only user-adjustable parameter is min-shift, not levels, so the advice is impossible to follow. 2. CSI auto-scales based on SQ headers, so we don't need to be explicit anyway (assuming we currently have a BAI index that doesn't fit). 3. If we already were using CSI index on SAM/BAM or VCF/BCF and if we did have the configuration options to do what it asks, it wouldn't fix the problem. Given CSI auto-scales, if we see this message it's because the alignments don't match the headers; either the header has been replaced by something inappropriate or there are bugs in the aligner that has emitted rogue POS fields. 
There is no fix to the index that actually fixes the data; it's simply brushing the problem under the carpet. --- hts.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/hts.c b/hts.c index d0ca8a5f1..afa71987f 100644 --- a/hts.c +++ b/hts.c @@ -1865,26 +1865,16 @@ int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); if (tid < 0 || (beg <= maxpos && end <= maxpos)) return 0; - int64_t max = end > beg ? end : beg, s = 1 << 14; - int n_lvls = 0; - while (max > s) { - n_lvls++; - s <<= 3; - } if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a csi index " - "with min_shift = %d, n_lvls = %d. Try using " - "min_shift = 14, n_lvls >= %d", - beg, end, - idx->min_shift, idx->n_lvls, - n_lvls); + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos + " cannot be stored in a csi index. " + "Please check headers match the data", + beg, end); } else { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a %s index. " - "Try using a csi index with min_shift = 14, " - "n_lvls >= %d", - beg, end, idx_format_name(idx->fmt), - n_lvls); + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos + " cannot be stored in a %s index. Try using a csi index", + beg, end, idx_format_name(idx->fmt)); } errno = ERANGE; return -1; From ff495a966fd59e29e3d3d176c6d854b346dc6e0e Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Tue, 29 Sep 2020 19:49:23 +0200 Subject: [PATCH 005/114] Bug fix: handling of CRLF line terminators in sam_parse_worker(). --- sam.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sam.c b/sam.c index 7a58e361e..321787089 100644 --- a/sam.c +++ b/sam.c @@ -2508,9 +2508,17 @@ static void *sam_parse_worker(void *arg) { // However this is an API change so for now we copy. 
char *nl = strchr(cp, '\n'); - if (!nl) nl = cp_end; - if (*nl) *nl++ = '\0'; - kstring_t ks = {nl-cp, gl->alloc, cp}; + char *line_end; + if (nl) { + line_end = nl; + if (line_end > cp && *(line_end - 1) == '\r') + line_end--; + nl++; + } else { + nl = line_end = cp_end; + } + *line_end = '\0'; + kstring_t ks = { line_end - cp, gl->alloc, cp }; if (sam_parse1(&ks, fd->h, &b[i]) < 0) { sam_state_err(fd, errno ? errno : EIO); cleanup_sp_lines(gl); From ed198a55b6a7e6dbcfe53b22f7cc8b3c21086301 Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Tue, 29 Sep 2020 19:50:44 +0200 Subject: [PATCH 006/114] Added test cases for kgetline() and kgetline2(). --- test/test_kstring.c | 118 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/test/test_kstring.c b/test/test_kstring.c index 8dcce6b5b..ee913a2e3 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -261,6 +261,118 @@ static int test_kputw(int64_t start, int64_t end) { return 0; } +// callback used by test_kgetline +static char *mock_fgets(char *str, int num, void *p) { + int *mock_state = (int*)p; + (*mock_state)++; + switch (*mock_state) { + case 1: + case 4: + case 7: + // a few characters, no endline + strcpy(str, "ABCD"); + break; + case 2: + case 3: + // \n endline + strcpy(str, "\n"); + break; + case 5: + case 6: + // \r\n endline + strcpy(str, "\r\n"); + break; + default: + // eof + return 0; + } + + return str; +} + +static int test_kgetline() { + kstring_t s = KS_INITIALIZE; + int mock_state = 0; + + // normal line, \n terminated, called with non-empty s + kputs("_", &s); + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("_ABCD", s.s) || 5 != s.l) return -1; + s.l = 0; + // empty line, \n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // normal line, \r\n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 
0; + // empty line, \r\n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // line terminated by EOF + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // EOF + if (EOF != kgetline(&s, mock_fgets, &mock_state) || 0 != s.l) return -1; + + ks_free(&s); + return EXIT_SUCCESS; +} + +// callback used by test_kgetline2 +static ssize_t mock_fgets2(char *str, size_t num, void *p) { + int *mock_state = (int*)p; + (*mock_state)++; + switch (*mock_state) { + case 1: + case 4: + case 7: + // a few characters, no endline + strcpy(str, "ABCD"); + break; + case 2: + case 3: + // \n endline + strcpy(str, "\n"); + break; + case 5: + case 6: + // \r\n endline + strcpy(str, "\r\n"); + break; + default: + // eof + return 0; + } + + return strlen(str); +} + +static int test_kgetline2() { + kstring_t s = KS_INITIALIZE; + int mock_state = 0; + + // normal line, \n terminated, called with non-empty s + kputs("_", &s); + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("_ABCD", s.s) || 5 != s.l) return -1; + s.l = 0; + // empty line, \n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // normal line, \r\n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // empty line, \r\n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // line terminated by EOF + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // EOF + if (EOF != kgetline2(&s, mock_fgets2, &mock_state) || 0 != s.l) return -1; + + ks_free(&s); + return EXIT_SUCCESS; +} + int main(int argc, char **argv) { int opt, res = EXIT_SUCCESS; int64_t start = 0; @@ -301,5 +413,11 @@ int main(int argc, char **argv) { 
if (!test || strcmp(test, "kputw") == 0) if (test_kputw(start, end) != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kgetline") == 0) + if (test_kgetline() != 0) res = EXIT_FAILURE; + + if (!test || strcmp(test, "kgetline2") == 0) + if (test_kgetline2() != 0) res = EXIT_FAILURE; + return res; } From 81eca5e695e957ef6844d619b44246abe91d9eef Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Tue, 29 Sep 2020 19:51:25 +0200 Subject: [PATCH 007/114] Improved documentation of kgetline() and kgetline2() in the header file. --- htslib/kstring.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/htslib/kstring.h b/htslib/kstring.h index ed9bdeeaa..150757ca6 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -109,18 +109,21 @@ extern "C" { HTSLIB_EXPORT char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); - /* kgetline() uses the supplied fgets()-like function to read a "\n"- - * or "\r\n"-terminated line from fp. The line read is appended to the - * kstring without its terminator and 0 is returned; EOF is returned at - * EOF or on error (determined by querying fp, as per fgets()). */ - typedef char *kgets_func(char *, int, void *); + /* kgetline() uses the supplied fgets()-like function to read a "\n"- + * or "\r\n"-terminated line from fp. The line read is appended to the + * kstring without its terminator and 0 is returned; EOF is returned at + * EOF or on error (determined by querying fp, as per fgets()). */ + typedef char *kgets_func(char *, int, void *); HTSLIB_EXPORT - int kgetline(kstring_t *s, kgets_func *fgets, void *fp); + int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp); - // This matches the signature of hgetln(), apart from the last pointer - typedef ssize_t kgets_func2(char *, size_t, void *); + /* kgetline2() uses the supplied hgetln()-like function to read a "\n"- + * or "\r\n"-terminated line from fp. 
The line read is appended to the + * ksring without its terminator and 0 is returned; EOF is returned at + * EOF or on error (determined by querying fp, as per fgets()). */ + typedef ssize_t kgets_func2(char *, size_t, void *); HTSLIB_EXPORT - int kgetline2(kstring_t *s, kgets_func2 *fgets, void *fp); + int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp); #ifdef __cplusplus } From 2264113e5df1946210828e45d29c605915bd3733 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 7 Oct 2020 13:00:00 +0100 Subject: [PATCH 008/114] Added new test and data for CRLF line endings. --- .gitattributes | 4 + test/index_dos.sam | 190 +++++++++++++++++++++++++++++++++++++++++++++ test/test.pl | 8 ++ 3 files changed, 202 insertions(+) create mode 100644 test/index_dos.sam diff --git a/.gitattributes b/.gitattributes index efd67e585..e46cc5cb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -16,3 +16,7 @@ README.md export-ignore # line separators on Windows machines. It causes the index files to become out # of sync with the fasta files. *.fa* -text + +# Remove the text attribute from index_dos.sam, so that the line separators +# for the test file don't get converted into Unix format. 
+test/index_dos.sam -text diff --git a/test/index_dos.sam b/test/index_dos.sam new file mode 100644 index 000000000..b006aa77d --- /dev/null +++ b/test/index_dos.sam @@ -0,0 +1,190 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd +@SQ SN:CHROMOSOME_II LN:5000 M5:8e7993f7a93158587ee897d7287948ec +@SQ SN:CHROMOSOME_III LN:5000 M5:3adcb065e1cf74fafdbba1e8c352b323 +@SQ SN:CHROMOSOME_IV LN:5000 M5:251af66a69ee589c9f3757340ec2de6f +@SQ SN:CHROMOSOME_V LN:5000 M5:cf200a65fb754836dcc56b24b3170ee8 +@SQ SN:CHROMOSOME_X LN:5000 M5:6f9368fd2192c89c613718399d2d31fc +@SQ SN:CHROMOSOME_MtDNA LN:5000 M5:cd05857ece6411f40257a565ccfe15bb +@PG ID:bowtie2 PN:bowtie2 VN:2.0.0-beta5 +SRR065390.17240207 16 CHROMOSOME_I 999901 42 100M * 0 0 ATGTTTACAGGACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAG CACAC?CBBAA@?@?BADDBBDBBAB>DDDBBDDABBBCCADDDDDCBCBCCCDBDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.15493040 0 CHROMOSOME_I 999912 42 100M * 0 0 ACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDBCCBDBCCBDDA@>DC?5@?@@??:><<>8>39<37 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6144221 0 CHROMOSOME_I 999914 42 100M * 0 0 TTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCCCCBDCDDBBDDBDBDD@BBB@DBABDB AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8057275 0 CHROMOSOME_I 999916 42 100M * 0 0 CAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTT CCCCCCCBCCC@CCCCCCCCCCC>BBB>BB?4CCCCCC;>====ACCCA@CCCBBCCBC;>@==>BBBBA?<;@<@######################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU 
+SRR065390.24679913 16 CHROMOSOME_I 999917 42 100M * 0 0 AAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTT ==56>??>AB?>D>?A?DBDABBB=BDBDACDBBCCDBBBBDDCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.25513175 0 CHROMOSOME_I 999934 42 100M * 0 0 ATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCBC@CADCDDAABA=B?=A=B.>AA?AADA########################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17492782 0 CHROMOSOME_I 999935 42 100M * 0 0 TGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCDCCDCCBDCDDBDDBDD@BBBBBBACBBAB=AB>BBBAB>?BA@CAAA? AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17146364 16 CHROMOSOME_I 999942 42 100M * 0 0 CAAAAATTGTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAG #######@/A@@<:BBBBB>ABBDADC@=DDBDDDCDCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:8T91 YT:Z:UU +SRR065390.14459471 16 CHROMOSOME_I 999944 42 100M * 0 0 AAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGC @@@@=B@CCCBAABACCC@DCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6968616 16 CHROMOSOME_I 999947 42 100M * 0 0 ATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTA BDB>B@DDDD@DDDDBCACB@DCBCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9052825 16 CHROMOSOME_I 999952 42 100M * 0 0 
GGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACC ?B;DABDABDDBDDADCCCD@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22926164 0 CHROMOSOME_I 999967 42 100M * 0 0 TTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBCCBCCCCCCCCDCCDCDDDDCCDACDCADBDDBBCBCBCCABBA@BABABCBABC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27108093 16 CHROMOSOME_I 999969 42 100M * 0 0 AAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACN ##########AAAAA388333-533')''+AA8AAAAAAAAAA8AAAAAA67788AAAA888887AAA5AAAAAAAAAAAA8AAAAAAAA+*++)))))! AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:99C0 YT:Z:UU +SRR065390.19145675 0 CHROMOSOME_I 999970 42 100M * 0 0 AGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCBCCCCCCCCCCADCBDBBCBBBBBDCBABBBABAABB??DDAACCAACC>AC?C?= AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22660118 16 CHROMOSOME_I 999972 42 100M * 0 0 CTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCC B9ABABDB>DBBBD8CBDCDBCDBCDBCBCCBCCCCCCCCCCCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1589310 0 CHROMOSOME_I 999973 42 100M * 0 0 NTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCG !++((22221AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA7A8AAAAAAAA8AAAAAAAAAAAAA7A7AA768655 AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.32984687 0 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT 
CCCCCCCCCCCBCCCCCCCCCCCCCCCCC@CC@CCCBCCCCCCBDACDCC>@B@CDBADB@BCBD@B=BBB@BD>C@BBCBACAABAB;D9<4:<66 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28347129 16 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGACATACGCTTAGGTTTCGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT ##############################################@B?BB@A@ABBBDABD@DDBBB@@B;C@BACBC@CC@CCCCCBCCCCCCCCCCC AS:i:-10 XN:i:0 XM:i:5 XO:i:0 XG:i:0 NM:i:5 MD:Z:23G1T2G2G7A60 YT:Z:UU +SRR065390.17964692 16 CHROMOSOME_I 999984 42 100M * 0 0 TTTGGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAA #####@<@=<53.830;>.?A5@@?ABAAADBDBC<@CB@D@BCB@CBCDCDBBDC=C@C@CAAC@C@ACCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:3A96 YT:Z:UU +SRR065390.16701032 0 CHROMOSOME_I 999987 42 100M * 0 0 AGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCACCCCCCCCCCDCBCCCCCCDCCBAA@BBBBBC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24060716 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA @8>68BD?B??B@DB>ABB?BA@A=ADBCC@?AA@CCBBCBCCDBCDCCBCBCCC@CCCCBCCCCCCCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24907628 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTGAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA ################################BDDBB?BB>?>BADABBBDBDBABDBDC;?>9=C?B>CC@CCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:9T90 YT:Z:UU +SRR065390.21366278 16 CHROMOSOME_I 999991 42 100M * 0 0 GTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAG ######?9>A09=@?=>BBDBBBB8B>DBCDCCDCBCBCBDCCC@CCCCCCCBCCCCCCC@@CCCCCCCCCCCCCCCCCCCCCCCC 
AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27662957 0 CHROMOSOME_I 999995 42 100M * 0 0 GCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTA CCCCCCCCCCCCCCCCCCC@ACCCCCCCCCCCCCCADCCCBC?CDDDDAC=BA?@B@DBDB>?>>D?#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29477959 0 CHROMOSOME_I 999997 42 100M * 0 0 TTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=AB?DAB@3=@8@=@?@ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13030274 16 CHROMOSOME_I 1000208 42 100M * 0 0 TCAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTC 955576>0@BBBBBBDBBD?DABDDDDCD@DCDDCCDCDDCACBACCCCCCBCCCCCCCCCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18054898 16 CHROMOSOME_I 1000209 42 100M * 0 0 CAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCC CAC@CAA?BC?D??BCABB8=>@@?#### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.26866653 16 CHROMOSOME_I 1000217 42 100M * 0 0 CTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACC ###########??????4D;AA?AAD?A>>?CABCBABBBBAA@AD>ADAAC@CCCCBCCBCCC?CCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.23714265 0 CHROMOSOME_I 1000218 0 78M2I20M * 0 0 TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGAGATCGGAAGAGCGGTTCAGCAGGAA CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCADDCCBBBBBDBBBB+=7=0?==>A#################### AS:i:-48 XN:i:0 XM:i:16 XO:i:1 XG:i:2 NM:i:18 MD:Z:75T0T1T0T0G0T2T0T0T3C0T0T0T0T0T1C0 YT:Z:UU +SRR065390.20744360 16 CHROMOSOME_I 1000218 42 100M * 0 0 
TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCG #####@ABBBBDBD@BA@DCDBABBBBBDA>@CBBDBBAD=BBDCBACBCCCCCCCBCBCCCCACCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3611567 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT #####@<2@=BBBBAC=DBBB@BBACBBBB=C;BBCCBACC@CCACCCCBCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10053218 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT @@=@6AA=AAC?CAC>BB>?A>>CBB@@CBAD>CC;>C@BC>A################################################# AS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 NM:i:4 MD:Z:66A7A14C2A7 YT:Z:UU +SRR065390.21951837 0 CHROMOSOME_I 1000229 42 100M * 0 0 AATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCBCDCACCCCCCBCCB>AACCC@1/?@?CCC@@BABCB=?@@+:A?B###### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21381202 0 CHROMOSOME_I 1000232 40 100M * 0 0 TATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGGTTTTTTTAGTTTTTTCTTTTTTCCCAATTTTTTTGGATA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCDC?=8@';4@AA############################################# AS:i:-16 XN:i:0 XM:i:8 XO:i:0 XG:i:0 NM:i:8 MD:Z:60A5G8C6A2G7A0G1A3 YT:Z:UU +SRR065390.22184926 16 CHROMOSOME_I 1000235 42 100M * 0 0 TGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCT ??CAACCBAADD?DBB?@>BBB;BABBBBB@>CCCDBCDBACCCCAACACACCACCC@@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17603173 0 CHROMOSOME_I 1000236 42 100M * 0 0 
GGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTG CCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCDCCCCCCCCB>CAB@ACCC################################################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17587471 16 CHROMOSOME_I 1000250 42 100M * 0 0 GAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTT 10?8;;?;AA??:AA@BBBBB?BDDDDDBCDA>@DDDCCCDACCCDDCCDCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.635026 0 CHROMOSOME_I 1000255 42 100M * 0 0 TTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGA CCCCCCCCCCCCCCBCCCCCCCBD@CCCCB0:>8:=BBBBC6:=7@>?B?B43/+2>@@/@########## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.33333470 0 CHROMOSOME_I 1000257 42 100M * 0 0 TTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTTTGTTT CCCCBCCCCCCC?CCC?CCCCDBCADCCCCCA@@:;CCCC?7.)8;>???-3>>;A?3?6;/2;>?A:24775=4B<@@<4)+75:70(4@>::)9,B>BB?BBD:>BADDD=ABBBDDDBD@DBCCCDCCDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.18670433 0 CHROMOSOME_I 1000260 40 100M * 0 0 TGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTTTGAATATCTGGGGATTTTTCGTTTTTTTTTTTTT CCCCCCCCCCCCC>CCCCBBC4A@ACCC8@;5/8;A?A/6,>==AAC6<@################################################## AS:i:-14 XN:i:0 XM:i:7 XO:i:0 XG:i:0 NM:i:7 MD:Z:65A0G11A4C7C1G0A5 YT:Z:UU +SRR065390.5800524 0 CHROMOSOME_I 1000261 42 100M * 0 0 GGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTTCCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTT CCCCCCCCCCCCCCCCCDDC*/,0/??/<<508BAA@@BCBCAC?BAADBCD@@@CBCCBA9CCCACCCCCCCCCDCCCCCC?CCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCBBBCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1793614 16 CHROMOSOME_I 1000274 42 100M * 0 0 
ACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTT A:CAADB=DBDD@CBACC>@CACCCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.20107270 0 CHROMOSOME_I 1000276 42 100M * 0 0 GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG CCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCCCCCCCCADDCCCCCDCC?ACACDCCCCC@CCCDCD@BCDCBB3>B@BCCC@@9=3BB?@B@>85; AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8268806 16 CHROMOSOME_I 1000276 42 100M * 0 0 GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG ##########D?:BBA>;BBABBAABBBBBDDB>DDDDBDCDDCDCDDCCCDCCCDCCCCDCCCCCCCCCCCCCCCCC@BBCCCCCCCCBBBCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.19264263 16 CHROMOSOME_I 1000280 42 100M * 0 0 CACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCC ##BB?>CBABBB?:BBBBABABABB@DBCBBDAABDCCCCCCBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18391831 0 CHROMOSOME_I 1000283 42 100M * 0 0 TTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCC CCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?ACCCCCCCBCC@CC8BBCCCCCB@>A>CCCDDC@@@DBBBC?:CCDBAC;CDDDDCBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCC@BBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24029537 16 CHROMOSOME_I 1000284 42 100M * 0 0 TTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCA DB>B8BB<9;?>ABDDAADB@DD@C@BBAABBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCCCDCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28630205 0 CHROMOSOME_I 1000286 42 100M * 0 0 
TTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCACG CCCCCCCCCCCCC@BCCCCCCCCCCCCCCCCCDBCCCCCDDBBBCBCDCDB@=?BBBBDBBABBBBBB@@CBBDB>>>A>BCBCCB:;:>=<9:@A#### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:98A1 YT:Z:UU +SRR065390.15799530 0 CHROMOSOME_I 1000295 42 100M * 0 0 TTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCT CCCCCCCCCCCCCCCCCCCCCCAACCCCCCCCCACCBCBCCCC?B@CCCCB@93=@B5>BB>>3/77:7:B>CDBDDB@>;B>BBBBDACAAB@D@<9<9<7 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22494349 0 CHROMOSOME_I 1000297 42 100M * 0 0 TGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACDCCCCCD@@CCDCDCBBDCDDDBADDDDCD>B;@>DAABBB@>5A>BDBB?6??@D?9@####### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12445253 0 CHROMOSOME_I 1000298 42 100M * 0 0 GTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCATAAC CCCCCCCCCCCCCCCCCCCCACCCCCCCCC@DCCCCCCCC?BACCBC@CBDCCACB?BBBCDC@@;4BCBABDC@B56?B@96=4A>BAB;;5;:@19A;@;;;6?BBBBB3BBB??@@@>@BBB;@AA@9@AA9BABBBAA@@AABAABAB@BB:;??>:?DBAB?BBDDBBABB;ACBDB?BBB@CCCBDD@CD@CCDBCDDDCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27194079 0 CHROMOSOME_II 2920 42 100M * 0 0 CTAATTTTCAGAGAGACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC=BBBCB?BBBA?BBBDB?>BB=CBCCAACAC;DAB=ACAC?##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21775125 16 CHROMOSOME_II 2934 42 100M * 0 0 AACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACC #####ABA?=<<=5=@BBA?=@>:A:7.44?B?8B@@>BBB=@B?ADBBBCBBACBD9CBD?A9?=A?.AABADDABBB@BABDDBACBBCCDCBCCDCCCCDCCCCDCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 
XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3790175 16 CHROMOSOME_II 2944 42 100M * 0 0 GTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTG 8BDD@:=7)/>B>ABBB?BB?>?DB@B:BBB?BBADDC@BDCDDCDBCDCCCBADCCCCCCCBCCCCCCCCCCCCCCCCCCCDCCCCDCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4091455 0 CHROMOSOME_II 2946 42 100M * 0 0 TTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?CCCCCCCCCCDCCCCBCCCDACBCDCACC@C@CA@CBAAD=BBAADD06@##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8676436 0 CHROMOSOME_II 2947 42 100M * 0 0 TAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCCCCGCGGTCCGTGTGC <:>:>/000/:<<:BAB?>8A?A;:A873;3?>?>A>>A8B############################################# AS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 NM:i:4 MD:Z:84A7A0A5T0 YT:Z:UU +SRR065390.28734084 0 CHROMOSOME_II 2948 42 100M * 0 0 AAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTGTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCBCBCDCBCCCBBDDDCADABADBBABB:BB=D?B<@B@>CA?CA>BACADAA########### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10526869 0 CHROMOSOME_II 2956 40 100M * 0 0 TACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACGCCACCGCGGGGAGGGGGGTTGTTTTAT CCCCCCCCCCCCCCCCCCCCADCBBDDDDDDDBBB8BA@B>6<:>9=789=0>D>AA<@<8B>1>A9>;@5=@8C:48;*AAA=<>9>9>>:>>AB?D>BBDBCBDBBCCBABBB>@CDCCBCDCAACCCCCACCCCCCCCBCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10879394 16 CHROMOSOME_V 938 42 100M * 0 0 TATGTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCA B;B:B>@B?>@>7BBDABADADBBCBDCCBACBCCBBB@CCCCCBCCACACCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 
MD:Z:100 YT:Z:UU +SRR065390.1520161 0 CHROMOSOME_V 941 42 100M * 0 0 GTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATG CCCCCCCBBBCCCCCCCCCCCCCCCCCCCCACCCCCCDCC@CCCCCCCCCCCCCCCCCCACCCBCCAD=D@BC?C?C?C##################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17468019 16 CHROMOSOME_V 943 42 100M * 0 0 TTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAA >ABBBABBDDDB=DBCD?DDBDBDADDADDBDCCCCCCC=CCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12403970 0 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCC@CBBCACBC@?144:>><@@DAB?:=9@<>/>9?;=927= AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.16193993 16 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC ;;/67AAC@ADCCDBCDCCCCCCD@CCCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.30032741 16 CHROMOSOME_V 950 42 100M * 0 0 GAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCA AACBBAB?BB>BABBCDBBDABDBADDDDBDDBBADDDDBACCDCBDDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.32455256 0 CHROMOSOME_V 956 42 100M * 0 0 GTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAAATCAAAAAAA CCCCACCCCCBB=?ABB?BBA?BAABBBBBB@BBABBBBBBBBBBBBBAA@BBBBBBB>B######################################## AS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:64T25C9 YT:Z:UU +SRR065390.15571530 16 CHROMOSOME_V 966 42 100M * 0 0 
CTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTT B;:B;B?D?@?BBBB5-=<:@@AA@BBA>BBADBBDDDDCDCDCDBBDCCCDCCCCCCCCCDCCCCCDCCCCCCCCCCCACCBBBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9595122 0 CHROMOSOME_V 967 42 100M * 0 0 TGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCC?@:AAA>C@CBB@@>?B=A?BBBBBCB>@/@>=>=>BB# AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3600239 16 CHROMOSOME_V 969 42 100M * 0 0 ATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCA DB>DBBBBA@AAB?DCA@CB@ABB@BB=AAAA>@==>>6/>:>5:688/85A?AAA>>657==BBB<;;;9>>8>>BBBB> AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:28T25T45 YT:Z:UU +SRR065390.31266674 0 CHROMOSOME_V 971 42 100M * 0 0 AATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTTACTTTGCACG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCADDCCBC@CBC5<5<7?:83;+471/0<4=8;??BBD(.94;9?@?################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89G10 YT:Z:UU +SRR065390.23187971 16 CHROMOSOME_V 972 42 100M * 0 0 ATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGC 647:0BBB?B==@?@@BDBDBBBDDDBDDBDBDDDCBCCCCBBCCCCCDCCBCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28661392 16 CHROMOSOME_V 975 42 100M * 0 0 TGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTAT ACCACAA5BDABAA>BDBDBDCBCBA@DBDB>DBBBBBAABDBDBDDBCCCCDCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCDCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1859967 0 CHROMOSOME_V 979 42 100M * 0 0 AAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGCAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTT 
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCA>=>C<@@;:@A@A=53@?AB::?@CCACC=B/<;53;7BB:>B=::=A@?@?ACCC>C@CCCCCB:/&-7735@B7B>B?;@@CC@35A@@CCBC@######################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12435485 16 CHROMOSOME_V 981 42 100M * 0 0 ATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAA B<=BB>B@>>BBBD@>?DABBBBBDDDDDDDDCADCDCCDCCCDCDBCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.7485987 0 CHROMOSOME_V 983 42 100M * 0 0 TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGGTAAAA C@?C@CCCCCCCCCCCC@CCCCC@?C8CCC@BC?@CC############################################################### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:94T5 YT:Z:UU +SRR065390.17264189 0 CHROMOSOME_V 983 42 100M * 0 0 TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACCCTATGGTTAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=C=/////=?5=;:@8???AA############################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87G12 YT:Z:UU +SRR065390.6356855 0 CHROMOSOME_V 986 42 100M * 0 0 AATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAAAA CCBCCBCCCCCCCCCC@CC?@CCCCC@CCC>A=@.88/45+()/.=>2==BBCB659?9?'))10;9??############################### AS:i:-7 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:34T63T0G0 YT:Z:UU +SRR065390.20107175 0 CHROMOSOME_V 989 42 100M * 0 0 TTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@C@?./..):3872A=@=A<=:;=B>B>>87777@>&@9A@@@8:@>88 AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87T12 YT:Z:UU +SRR065390.6431660 16 CHROMOSOME_V 994 42 100M * 0 0 ACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATT 
AACBABABDC@@ADABBDDCDCDBCDDDCCDCDBCACCCBCCDCCCCCCCCBCC@@@CCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4439503 16 CHROMOSOME_V 997 42 100M * 0 0 TAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTT ###########B>:AAAAA@C@=;937<ACCC8@@@AABCC>@+/662BBBC?B>BBB?BBBB#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.11492188 0 CHROMOSOME_V 998 42 100M * 0 0 AAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTG CCCCCCCCCCCCCCCCDCCCCC>A@AAAAAACA??B@@BBD>BACACC08;;AAACB==/*/1//:=@99BBABA@;<@;<:9>>B??>B??:?6B??B9 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9605367 0 CHROMOSOME_V 999 42 100M * 0 0 AACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGG CCCCCCCCCCCCCCCDCCCCC@C<>>A9<4=9>=B###################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29302896 0 CHROMOSOME_V 1000 42 100M * 0 0 ACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGGT CCCCCCCCCCCCCCDCCCCC6?:??AABCCCC8?C@BCCCC@@5;><9>>>B>>AB=<)6=4:):9>>@@################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13754 4 * 0 0 * * 0 0 TCGCTGCTGTGATGTTGCGTTTTTATCAGCACAAAGGCGGTCAGGCCGAGGCCTATTTTTTCCGGATCCAGCAGGGCGACTTTGCCGATAAGGATACCGT CCCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCC@CCCCCDCCCCCCCDCCCC@ACCCC@>>CCD?>>>@@@ YT:Z:UU +SRR065390.13765 4 * 0 0 * * 0 0 CGTGGTCGTGCCGGTTACAAGCCTGCCGTGAAAAGCCGTTTCAGTAAGTCAGCCAATAGCAAATTCTCCCATACTATCGCTTTTGCCTGATCCTGAACTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDCCACCCCCCCC@CCCCCCCCCACCCBB@?CBDABDDADB<=ABBB@B@BB@ YT:Z:UU +SRR065390.13778 4 * 0 0 * * 0 0 TTTTATACCAACAAAAAACGGAAAGCAGATAACCCAGCAGCCCGAGTAACAGTATCCGGGCATCCAGGCCAAAAGCTAACAGAGCCGCGATAAAATCCCA 
CCCCCCCCCCCCCCBBBBBCCCCCCCBCCCCCCCCCBCCCCCCCCCDCCCCCACCCCCCCCCDCCCCDCBCA@AC>@=@CC?B>CBBCC>=?8A8=?>66 YT:Z:UU +SRR065390.13779 4 * 0 0 * * 0 0 ATAATGGACAACTTTAATGGCAATCACTAAATCAACTCCGGCACCATTAACCGGTGGGACGTTATGGTGCGTCACTATTGCATTGTCATTAGCGACATTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCC=CC?BCCBCCACBABCCDCDBDADBBCDCBD>DBBD==BAA:>5<> YT:Z:UU +SRR065390.13802 4 * 0 0 * * 0 0 AAGGCGTTTATTATATACACTCGCATGGCTTTTCTTCTGAAAATGTAGAATAATTGAGTAATTTTTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG ??B:??????BBB>B99;;;>B>>>:BBBBB;;B=70///0-/01/BA>BABB>B>BBB@BB>>ABABA>BA>6BB88183,<8;<4>:@><>A>> YT:Z:UU +SRR065390.13808 4 * 0 0 * * 0 0 GTTTGCTGACTGGCCAGCCAGCTCAAGGCATCAAAAGCGTCTTTGAGAGGGAAAGGAATATCGATAACCCGAATGCCGGGTGGCGCCGGAATCTCTTGCG CCCCBCCCBCCC@CCC@CCCAA:A=BB?BBBB@>B>A#################### YT:Z:UU +SRR065390.13853 4 * 0 0 * * 0 0 CTGGTACGTCACCACACGCCGCGATGGCGTCATCCACCGACTTCACCCACGTTACGCGATCGTCCGTACCCGGGTGACCGTTGGGGATAATATTTTTGCG #################################################################################################### YT:Z:UU +SRR065390.13861 4 * 0 0 * * 0 0 TTCAGAAACTGGATGAACAGTGCGCAGCCATCTGCAAATATGAATTAGTTCAAGTCACTCAAAAGCTATTTATTTGAATGGAAGAAATTTTTGAACTATA CCCCCCCCCCCBCCB@@CCCBCCBCCCCCCCCCCCCC?CCCC@CCC@C@CCC@CACCBCC?BBBC@C7CBCBCB@@ABCCBBBC=BABCCBBBBAB@@CA YT:Z:UU +SRR065390.13907 4 * 0 0 * * 0 0 CATTACCATTCAGTTGTATTGTTTGCGCACCAGAAAAATGAGACTGCACAGAATAAATTATACTGACCAGAAATTGTAAAATTCGTATATTCTTATTCAT 8998;9:;9;>9:9>?BABBAAA2A@@@@@>:3'3A################################################################ YT:Z:UU +SRR065390.13946 4 * 0 0 * * 0 0 TTTCCTCGAGTTCTTGATGAAATGGTCCATTATTTGTCAACCATTTATTTTTCCATATTTTTTCCAGGTAAGGCATGAATTCTGCAAGTTCCGGCAAAGA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCBBBBCACCCCCCB@BCCCCBC?CCCCACCBCBCCCCC@BBDCDDBCA4@@A YT:Z:UU +SRR065390.13956 4 * 0 0 * * 0 0 CGGCGCAACAATACTCAGCAGTTAATTGCAAAGGTATCGCACACCATTAAAAGCATTAAGCCGGGAGTCGAATTTGGTGTTAGCCCGGCAGGCGTGTGGC CCCCCCCCCCCBCCCCCCCCCDCCCCCCCCCCCC>CCCCCCCCCCCDCCCCBDCCDDCBDCC@?@BA@B@B>BBABAABB6?BB>B@?B??2?=+>->60 
YT:Z:UU +SRR065390.13964 4 * 0 0 * * 0 0 NTTGAGGTGCTCCAGTGGCTTCTGTTTCTATCAGCTGTCCCTCCTGTTCAGCTACTGACGGGGGGGTGCGCAACGGCAAAAGCACCGCCGGGCATCAGCG !))))++++*AAAAA8AAAA################################################################################ YT:Z:UU +SRR065390.13969 4 * 0 0 * * 0 0 CGGGCGATAGTCAAAAACTTATTTTCACAATTTTCGGCTAGGGAGTATATTTACAGTTAATTTGCGATGTGTTAGATCGGAAGAGCGGTTCAGCAGGAAT CCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCACCCCCCCCCCDCCCACCCCCDCCACBCCCCCCBCBBCDCBCC?BCBBCBCBC;A YT:Z:UU +SRR065390.13978 4 * 0 0 * * 0 0 AGACGGTAACTTTCAATTTGCACCCATGATTAAATTTTATGTTGATTAAAATAGAAGCAAAAATCATTACATTACACTACAAAATACGCCGAAATGTTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBACCCDCBCCCABCADCCAABC? YT:Z:UU +SRR065390.13985 4 * 0 0 * * 0 0 TAACCAAAAACTGGATTATGCAAATAACTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGTATGCCGTCTTCTGCTTGAAAAAAAAA CCCCCCCCCCCCCC?CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCBCCDCDCCD@CBD5>@=:=><9A@3>=B?BB>CBACB?BBA YT:Z:UU +SRR065390.14000 4 * 0 0 * * 0 0 TAGGTGAGAAAAGCGTTATTGGTCCGGTATACCTGCGAAGCGACAAAGCAATAAGGCAACAATGGCAGGTAATGCTGCTCAAAAAAGCGTTTACTGATCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCBADCCCB?@B>B@BADAAABBD@C5;B9?:?;ACABAB YT:Z:UU +SRR065390.14032 4 * 0 0 * * 0 0 GAAGGTCCAAGTGCCTTGAAGATAGAAAATTATAGCATTTCTCTTTAATTTCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGCATGA CCCACCCCCC>AAAACBCCCCCCCBCC?CCCCCCCCCCCCCCCCCCCCCCCB@CCCCCCBAC@CDCA/@B<;8=?@B>BC>?>?BB=:A########### YT:Z:UU +SRR065390.14061 4 * 0 0 * * 0 0 TGAAGCCGACAATTTGAGGCCAAACATCTTACATTCGACAGTAAATATTTGGGGATTAAGACTTATGTTAGATCGGAAGAGCGGTTCAGCAGGAATGCCG CCCCCCCCCCCCCCCCCCCCCCCCCCC=CCCCCCCCCCADCBCCCC=CCCCCCCBCCC=CBCCCCCCCABCCCCCCBACBC@CCBB;@B;?A@A@=?99A YT:Z:UU +SRR065390.14072 4 * 0 0 * * 0 0 TGAGTGAGGCTCAGGATTTTGAGTGAGGCTCAGGATTATGAGTGAGGTTGAAGAATTTGAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAG @B=@@BB@B@<@BB>BB>>@BB@==2;:;8BBBBBB@B@@@:@?1B@B@B@3@@@>3;@;@<@?>;@B@@##### YT:Z:UU +SRR065390.14100 4 * 0 0 * * 0 0 
AAGCCTGAGGGATAATTTTCGTCAAATTAAGGCAATTGCCGAGTGTTTCATCCCTGGCAAGCAGAACGGCTTTTTCGTTATTTATATCGGGAGAATTTAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCBCDCDCDCCCCDBBCDADBBBABDDBBBBBBBBDCBCD?BABB>B>AB>BCABAA>C YT:Z:UU +SRR065390.14105 4 * 0 0 * * 0 0 AAATTGTCCCCAAATAAAACAATTCCAGTGATCTTCCGATTCTAGGTGCCAAATAACCCAAATAGTCACTGCATTAGTTTTTATCTCACTTTTCTCCCCC #################################################################################################### YT:Z:UU +SRR065390.14107 4 * 0 0 * * 0 0 TGAAATTTCAAGAAAAATGTTAATTACCACCGTATTAAAAAAAAAAAACTTAAAATCAAAGATCGGAAAAGGGGTCAGGCAGGAATGCCAAAACCGACAC CCCCBCCCCCCCBCB>>>ACCCBCACCC?CCCCCCCCCCC?B########################################################## YT:Z:UU +SRR065390.14137 4 * 0 0 * * 0 0 CTGTGGCGTTTTTATCAAATTGGCAGAGCCACGTTCAGAGCTGAAAAAGCCACAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTTGGA BCCC@CCCCC?CCC>>CCBCCC?>C@CCCB=6?AA>=>3?>@?@86;86.@A@==378::68829>B9B############################### YT:Z:UU +SRR065390.14141 4 * 0 0 * * 0 0 GGTCACCAATCATAAGAGGAACAGCGACTGCACCTGCGTACATGACAAGGACGTGTTGCAGACCGAGTATGATCAGCTTTCCTGGTGATAGTATGCGCTC AAA@A?AA8:>A######################################################################################## YT:Z:UU +SRR065390.14162 4 * 0 0 * * 0 0 ATACTTCACCGGATGGTGGAATTAACGAAAACAACAACTGGTGTCACATCCCGCAGGCAAAAGAGGCAGCGGCTAACTAAGCGGCCTGCTGACTTTCTCG CCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCCBBCCCCCCDCDCCCCCCCCCC?C?CCCCCACD@CAD@AB<>@CB;6B#################### YT:Z:UU +SRR065390.14168 4 * 0 0 * * 0 0 TCGAGGGTGAGGGCGTCTGCCAGATCGGAAGAGCGGGTCAGCAGGAATGCCGAGACCGATATCGGATGCCGTCTCCTGCTGGACAAAAAATGAGAATGGG AACC@0@>@6:<>??>?BBBBB?+B6BBB>B?B=:?BBB=BBBBB>B######################################## YT:Z:UU +SRR065390.14173 4 * 0 0 * * 0 0 AAGAAACTCAACAAACCGGACTTGCAGGTGAAACTGATTCCGATTACCTCACAAAACCGTATTCCACTGCTGCAAAACGGCACTTTCGATTTTGAATGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=@CCCCCCCCAA?ABC@CCA=CCCABDCCAABDD?BB@BBA YT:Z:UU +SRR065390.14178 4 * 0 0 * * 0 0 
GCGCTTTGTTTACCTGATACGGAATTTCGTGGACGATAATGGTTTCACGACCGGTTTTGGCGTCAACTTCCACTTCTGCGCGAGCGCGGATATACACCTT BCCCCCCCCC@@CCCCCCCCCCCCCCCCCCA=AA>AAA=ACCBCCCCC?CCCCAB@ACCC?A?<CB?=CAB9B@BA################### YT:Z:UU +SRR065390.14182 4 * 0 0 * * 0 0 ATTTACTCTAATGTTCTGAAAAATAATTTACTCTAATGTTCTGCCAAATAATTTACTCTAACGTTCTGCCAAATAATTTACTCTAATGTTCTGCCAAATA CCCCCCCCCBCCCCCCCBCC@CCB@@@BCCCCCCC@CCCCCCBBCCCCCCBCCCC@CCC?CCC>>CCBCCCCA@CCCC;CBCCBDCCB@CCBCAACB@BB;B?B0B=8??9>??BB>B?@?B>A>A########## YT:Z:UU +SRR065390.14197 4 * 0 0 * * 0 0 GTACCTCGCCGTTGTTCTCGACCTGTTCGCAAGAAAACCAGTGGGCTGGGCCATGTCGTTCTCGCCGGACAGCAGGCTCACCATGCAAGCGCTGGAAATG CCCCCCCCCCCCCDCBB=B@?BB@BBBBBB@@@B@B==BBB9B@@@@B@=BBB@BBB=@BBABBB@@@BB<@BA@BBB=B;B?BBACA YT:Z:UU +SRR065390.14284 4 * 0 0 * * 0 0 CGGTGCATGATGCGGATTCCAGGAATCAACGTACAGCGTCGGGCTAAACCAGAACCAGCCAATAATGCACAGACCGACGACCGGAATAATAACCCCCCAC BCACCCCCCCCBCCBCCCCCCCBCC@CCCCCCCAC@CBACCACCCCC@CBCCCCCCCA8CC?A@9@AB@9CACC8=81B@CC9CCCCCCC,<8??CBC@BB?@C@ACBCB################################################## YT:Z:UU +SRR065390.14312 4 * 0 0 * * 0 0 ACAGTAACATTCAACGTTAAATATGTTAATAAGACGTTGCATTATTGTCCTGAAGTTGAAGATAGCAGGTATGGCGGTTGGATAGCACGGCGTTGGTTTA CCCCCCCCCCCCCCCCCCCCBCCCCCCBCCCCCCCCCCCCCCCCCCCBCCCCCDCCCCCCCCCACCBC@?CACBC######################### YT:Z:UU +SRR065390.14331 4 * 0 0 * * 0 0 GAATAATGAAGATGATGCGACGCGTCTGGCGCGTTTGAACGAACGCTTTAAACGCGAAGGTAAACCGGAGTTGAAGAAGCTGGATGATCTACCTAAAGAT CCCCCCCCCCCCCCBCCBCCCCCCBCCCC@CCCB@CCCCACCDDCBCC?CAC@B@DABA?BAB@@@?C?C@BC?9A::>=@@C;?############### YT:Z:UU +SRR065390.14335 4 * 0 0 * * 0 0 TCCATTTGATGAACCTGAAGTTTAAGTATTGACTTGAGAGGAAAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCT CCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC@BBA>CCCCCCCCBBCCBC=CBCCCB################### YT:Z:UU +SRR065390.14342 4 * 0 0 * * 0 0 AAGTTCATGAATTAAAGCCGACTCAAACACTCTGTTTAAAAACTGGATAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAGATCGTAGGCCGTC 0000079;9;AAAA?;;;>9>3>9BB8BBBB@############################ YT:Z:UU +SRR065390.14359 4 * 0 0 * * 0 0 
GCATCAGTACGATAAAACGCGTACCGAACTACTGAATGATGTCGCAGGGGCGCTGGCTCTTGATGACAAACTCGGACGTAGCACCAATCAACTTTCCGGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBABACCBB@BBBBBB>BDBBDBBB>B@@@>>?BCBAC?CBC?> YT:Z:UU +SRR065390.14364 4 * 0 0 * * 0 0 GGTCGCCGATCCGATTTGCACTTTAACCACTTTCGGTAAAGAAACCGTTGTTAGTGAAAGCGAAAAACGCACAACGACCACTGATGACCCGCTACAGGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCBCCCCCCCCCACCDACCBCACA@CACCAA=B=CBC=ACAAAC=)>? YT:Z:UU +SRR065390.14392 4 * 0 0 * * 0 0 GTTATCCTTTTCCGTGATATGTGCGGTACTGCAGCGTATGCCGGCAAGGGTTGCAAACGGTGGTAGTGTGCAGGTTGACTGTTGGTCGGATTCCTCCACC CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCDCCCCCCCCCBCCCA@CCB@AACB?CB?BA=A8@BACB?:===@C@-A6==?@@<@@AA##### YT:Z:UU +SRR065390.14393 4 * 0 0 * * 0 0 AGAAATTTACTGGCTCGCCGCAGCCAACTCCTCTTCTGACACCCCGGTAAAGCGCATGATGTCTGTAAGAGGGGCCCCGGATTCAAGCATTATTTTGGCT CCCCCCCCCCCCCCCBA9::<4A>AAAA:?A#################################### YT:Z:UU +SRR065390.14434 4 * 0 0 * * 0 0 GGTAGATTCCCATAAAAATCGCCAGCGGAATGGTGAACGCAACGGTATACGTTCCCCACGGGCTATGAGTCAGGGCTTTCACCACGATCATCGCCAGTAC DCACCCBCCCCCCCC>CBBCCCCCCCCCCCCCC?CCCCCCCCCCCACACCC@BCCCCBCD=ABB@BCBD?@@B6BC8B@B>BABCBB@AB=@2C###### YT:Z:UU diff --git a/test/test.pl b/test/test.pl index 2c4a1b1f4..1fb6112bd 100755 --- a/test/test.pl +++ b/test/test.pl @@ -762,6 +762,14 @@ sub test_index unlink("$$opts{tmp}/index.sam.gz.bai"); test_compare($opts,"$$opts{path}/test_index -b $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + # SAM DOS LINE ENDINGS (\r\n) + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 14 -x $$opts{tmp}/index.sam.gz.csi $$opts{path}/index_dos.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + unlink("$$opts{tmp}/index.bam.bai"); + test_compare($opts,"$$opts{path}/test_index -c $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 0 -x 
$$opts{tmp}/index.sam.gz.bai $$opts{path}/index_dos.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + unlink("$$opts{tmp}/index.sam.gz.bai"); + test_compare($opts,"$$opts{path}/test_index -b $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + # CRAM local $ENV{REF_PATH} = $$opts{m5_dir}; test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -C -x $$opts{tmp}/index.cram.crai $$opts{path}/index.sam > $$opts{tmp}/index.cram", "$$opts{tmp}/index.cram.crai", "$$opts{path}/index.cram.crai", gz=>1); From 49059d3cbb1ac145fadbafed18bce2a8eb5dc0d9 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Wed, 7 Oct 2020 14:53:44 +0100 Subject: [PATCH 009/114] Adjust the min offset of a bin when unmapped placed reads detected. --- hts.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/hts.c b/hts.c index afa71987f..278ecd36a 100644 --- a/hts.c +++ b/hts.c @@ -2552,6 +2552,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t bidx_t *bidx; uint64_t min_off, max_off; hts_itr_t *iter; + uint32_t unmapped = 0, rel_off; // It's possible to call this function with NULL idx iff // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE @@ -2585,13 +2586,20 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t return NULL; } + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) + unmapped = kh_val(bidx, k).list[1].v; + else + unmapped = 1; + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; iter->readrec = readrec; if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } + rel_off = beg>>idx->min_shift; // compute min_off - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + bin = hts_bin_first(idx->n_lvls) + rel_off; do { int first; k = kh_get(bin, bidx, bin); @@ -2602,10 +2610,28 @@ hts_itr_t *hts_itr_query(const 
hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } while (bin); if (bin == 0) k = kh_get(bin, bidx, bin); min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; + // min_off can be calculated more accurately if the + // linear index is available if (idx->lidx[tid].offset - && beg>>idx->min_shift < idx->lidx[tid].n - && min_off < idx->lidx[tid].offset[beg>>idx->min_shift]) - min_off = idx->lidx[tid].offset[beg>>idx->min_shift]; + && rel_off < idx->lidx[tid].n) { + if (min_off < idx->lidx[tid].offset[rel_off]) + min_off = idx->lidx[tid].offset[rel_off]; + if (unmapped) { + int tmp_off; + for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { + if (idx->lidx[tid].offset[tmp_off] < min_off) { + min_off = idx->lidx[tid].offset[tmp_off]; + break; + } + } + + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0)) + min_off = kh_val(bidx, k).list[0].u; + } + } else if (unmapped) { //CSI index + if (k != kh_end(bidx)) + min_off = kh_val(bidx, k).list[0].u; + } // compute max_off: a virtual offset from a bin to the right of end bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; @@ -2688,6 +2714,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) int tid; hts_pos_t beg, end; hts_reglist_t *curr_reg; + uint32_t unmapped = 0, rel_off; if (!idx || !iter || !iter->multi) return -1; @@ -2720,6 +2747,12 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx)) continue; + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) + unmapped = kh_val(bidx, k).list[1].v; + else + unmapped = 1; + for(j=0; jcount; j++) { hts_pair32_t *curr_intv = &curr_reg->intervals[j]; if (curr_intv->end < curr_intv->beg) @@ -2727,12 +2760,13 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) beg = curr_intv->beg; end = curr_intv->end; + rel_off = beg>>idx->min_shift; /* Compute 'min_off' by searching the lowest level bin containing 'beg'. 
If the computed bin is not in the index, try the next bin to the left, belonging to the same parent. If it is the first sibling bin, try the parent bin. */ - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + bin = hts_bin_first(idx->n_lvls) + rel_off; do { int first; k = kh_get(bin, bidx, bin); @@ -2747,9 +2781,25 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) // min_off can be calculated more accurately if the // linear index is available if (idx->lidx[tid].offset - && beg>>idx->min_shift < idx->lidx[tid].n - && min_off < idx->lidx[tid].offset[beg>>idx->min_shift]) - min_off = idx->lidx[tid].offset[beg>>idx->min_shift]; + && rel_off < idx->lidx[tid].n) { + if (min_off < idx->lidx[tid].offset[rel_off]) + min_off = idx->lidx[tid].offset[rel_off]; + if (unmapped) { + int tmp_off; + for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { + if (idx->lidx[tid].offset[tmp_off] < min_off) { + min_off = idx->lidx[tid].offset[tmp_off]; + break; + } + } + + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0)) + min_off = kh_val(bidx, k).list[0].u; + } + } else if (unmapped) { //CSI index + if (k != kh_end(bidx)) + min_off = kh_val(bidx, k).list[0].u; + } // compute max_off: a virtual offset from a bin to the right of end bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; From c9175183c42382f1030503e88ca7e60cb9c08536 Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Fri, 23 Oct 2020 15:43:26 +0200 Subject: [PATCH 010/114] Add bam_set1() function and unit tests (PR #1159) --- htslib/sam.h | 31 +++++ sam.c | 122 +++++++++++++++++++ test/sam.c | 322 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 475 insertions(+) diff --git a/htslib/sam.h b/htslib/sam.h index 9e595ae12..8fcf819bf 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1003,6 +1003,37 @@ bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) HTS_RESULT_USED; HTSLIB_EXPORT bam1_t *bam_dup1(const bam1_t *bsrc); +/// Sets all components 
of an alignment structure +/** + @param bam Target alignment structure. Must be initialized by a call to bam_init1(). + The data field will be reallocated automatically as needed. + @param l_qname Length of the query name. If set to 0, the placeholder query name "*" will be used. + @param qname Query name, may be NULL if l_qname = 0 + @param flag Bitwise flag, a combination of the BAM_F* constants. + @param tid Chromosome ID, defined by sam_hdr_t (a.k.a. RNAME). + @param pos 0-based leftmost coordinate. + @param mapq Mapping quality. + @param n_cigar Number of CIGAR operations. + @param cigar CIGAR data, may be NULL if n_cigar = 0. + @param mtid Chromosome ID of next read in template, defined by sam_hdr_t (a.k.a. RNEXT). + @param mpos 0-based leftmost coordinate of next read in template (a.k.a. PNEXT). + @param isize Observed template length ("insert size") (a.k.a. TLEN). + @param l_seq Length of the query sequence (read) and sequence quality string. + @param seq Sequence, may be NULL if l_seq = 0. + @param qual Sequence quality, may be NULL. + @param l_aux Length to be reserved for auxiliary field data, may be 0. + + @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure. 
+*/ +HTSLIB_EXPORT +int bam_set1(bam1_t *bam, + size_t l_qname, const char *qname, + uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, + size_t n_cigar, const uint32_t *cigar, + int32_t mtid, hts_pos_t mpos, hts_pos_t isize, + size_t l_seq, const char *seq, const char *qual, + size_t l_aux); + /// Calculate query length from CIGAR data /** @param n_cigar Number of items in @p cigar diff --git a/sam.c b/sam.c index 321787089..49b1a0f3f 100644 --- a/sam.c +++ b/sam.c @@ -485,6 +485,128 @@ static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, } } +static int subtract_check_underflow(size_t length, size_t *limit) +{ + if (length <= *limit) { + *limit -= length; + return 0; + } + + return -1; +} + +int bam_set1(bam1_t *bam, + size_t l_qname, const char *qname, + uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, + size_t n_cigar, const uint32_t *cigar, + int32_t mtid, hts_pos_t mpos, hts_pos_t isize, + size_t l_seq, const char *seq, const char *qual, + size_t l_aux) +{ + // use a default qname "*" if none is provided + if (l_qname == 0) { + l_qname = 1; + qname = "*"; + } + + // note: the qname is stored nul terminated and padded as described in the + // documentation for the bam1_t struct. + size_t qname_nuls = 4 - l_qname % 4; + + // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos(). + // can't use bam_endpos() directly as some fields not yet set up. 
+ hts_pos_t rlen = 0, qlen = 0; + if (!(flag & BAM_FUNMAP)) { + bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen); + } + if (rlen == 0) { + rlen = 1; + } + + // validate parameters + if (l_qname > 254) { + hts_log_error("Query name too long"); + errno = EINVAL; + return -1; + } + if (HTS_POS_MAX - rlen <= pos) { + hts_log_error("Read ends beyond highest supported position"); + errno = EINVAL; + return -1; + } + if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) { + hts_log_error("Mapped query must have a CIGAR"); + errno = EINVAL; + return -1; + } + if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) { + hts_log_error("CIGAR and query sequence are of different length"); + errno = EINVAL; + return -1; + } + + size_t limit = INT32_MAX; + int u = subtract_check_underflow(l_qname + qname_nuls, &limit); + u += subtract_check_underflow(n_cigar * 4, &limit); + u += subtract_check_underflow((l_seq + 1) / 2, &limit); + u += subtract_check_underflow(l_seq, &limit); + u += subtract_check_underflow(l_aux, &limit); + if (u != 0) { + hts_log_error("Size overflow"); + errno = EINVAL; + return -1; + } + + // re-allocate the data buffer as needed. 
+ size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq; + if (realloc_bam_data(bam, data_len + l_aux) < 0) { + return -1; + } + + bam->l_data = (int)data_len; + bam->core.pos = pos; + bam->core.tid = tid; + bam->core.bin = bam_reg2bin(pos, pos + rlen); + bam->core.qual = mapq; + bam->core.l_extranul = (uint8_t)(qname_nuls - 1); + bam->core.flag = flag; + bam->core.l_qname = (uint16_t)(l_qname + qname_nuls); + bam->core.n_cigar = (uint32_t)n_cigar; + bam->core.l_qseq = (int32_t)l_seq; + bam->core.mtid = mtid; + bam->core.mpos = mpos; + bam->core.isize = isize; + + uint8_t *cp = bam->data; + strncpy((char *)cp, qname, l_qname); + int i; + for (i = 0; i < qname_nuls; i++) { + cp[l_qname + i] = '\0'; + } + cp += l_qname + qname_nuls; + + if (n_cigar > 0) { + memcpy(cp, cigar, n_cigar * 4); + } + cp += n_cigar * 4; + + for (i = 0; i + 1 < l_seq; i += 2) { + *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; + } + for (; i < l_seq; i++) { + *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; + } + + if (qual) { + memcpy(cp, qual, l_seq); + } + else { + memset(cp, '\xff', l_seq); + } + + return (int)data_len; +} + hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) { int k; diff --git a/test/sam.c b/test/sam.c index 1c65c4556..8a6001840 100644 --- a/test/sam.c +++ b/test/sam.c @@ -69,6 +69,11 @@ static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) fail(const char *fmt, ...) 
status = EXIT_FAILURE; } +#define VERIFY(test, message) if (!(test)) { \ + fail("%s: %s", __func__, (message)); \ + goto cleanup; \ +} + uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type) { uint8_t *p = bam_aux_get(aln, tag); @@ -1837,6 +1842,312 @@ static void test_mempolicy(void) } } +static void test_bam_set1_minimal() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, 0); + // expected number of bytes written is qname: 4, cigar: 0, sequence: 0, qual: 0, aux: 0. + VERIFY(r == 4, "call to bam_set1() failed or did not write the correct number of bytes."); + + VERIFY(bam->core.l_qname == 4, "l_qname should include terminating null and be padded to the nearest 32-bit boundary."); + VERIFY(bam->core.l_extranul == 2, "l_extranul not set correctly"); + VERIFY(strcmp(bam_get_qname(bam), "*") == 0, "qname not set correctly."); + + VERIFY(bam->core.pos == 0, "pos not set correctly."); + VERIFY(bam->core.tid == -1, "tid not set correctly."); + VERIFY(bam->core.bin == hts_reg2bin(0, 1, 14, 5), "bin not set correctly."); + VERIFY(bam->core.qual == 0xff, "mapq not set correctly."); + VERIFY(bam->core.flag == BAM_FUNMAP, "flag not set correctly."); + VERIFY(bam->core.n_cigar == 0, "n_cigar not set correctly."); + VERIFY(bam->core.mtid == -1, "mtid not set correctly."); + VERIFY(bam->core.mpos == 0, "mpos not set correctly."); + VERIFY(bam->core.isize == 0, "isize not set correctly."); + VERIFY(bam->core.l_qseq == 0, "l_seq not set correctly."); + VERIFY(bam_get_l_aux(bam) == 0, "l_aux not set correctly."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_full() +{ + const char *qname = "!??AAA~~~~"; + const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + const 
char *qual = "DBBBB+=7=0"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, strlen(qname), qname, + BAM_FREVERSE, 1, 1000, 42, + sizeof(cigar) / 4, cigar, 2, 2000, 3000, + strlen(seq), seq, qual, 64); + // expected number of bytes written is qname: 12, cigar: 12, sequence: 5, qual: 10, aux: 0. + VERIFY(r == 39, "call to bam_set1() failed or did not write the correct number of bytes."); + + VERIFY(bam->core.l_qname == 12, "l_qname should include terminating null and be padded to the nearest 32-bit boundary."); + VERIFY(bam->core.l_extranul == 1, "l_extranul not set correctly"); + VERIFY(strcmp(bam_get_qname(bam), qname) == 0, "qname not set correctly."); + + VERIFY(bam->core.n_cigar == sizeof(cigar) / 4, "n_cigar not set correctly."); + VERIFY(memcmp(bam_get_cigar(bam), cigar, sizeof(cigar)) == 0, "cigar not set correctly."); + + VERIFY(bam->core.l_qseq == strlen(seq), "l_seq not set correctly."); + for (i = 0; i < strlen(seq); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq[i]], "seq not set correctly."); + } + VERIFY(memcmp(bam_get_qual(bam), qual, strlen(seq)) == 0, "qual not set correctly."); + + VERIFY(bam->core.pos == 1000, "pos not set correctly."); + VERIFY(bam->core.tid == 1, "tid not set correctly."); + VERIFY(bam->core.bin == hts_reg2bin(1000, 1010, 14, 5), "bin not set correctly."); + VERIFY(bam->core.qual == 42, "mapq not set correctly."); + VERIFY(bam->core.flag == BAM_FREVERSE, "flag not set correctly."); + VERIFY(bam->core.mtid == 2, "mtid not set correctly."); + VERIFY(bam->core.mpos == 2000, "mpos not set correctly."); + VERIFY(bam->core.isize == 3000, "isize not set correctly."); + VERIFY(bam_get_l_aux(bam) == 0, "l_aux not set correctly."); + VERIFY(bam->m_data - bam->l_data >= 64, "not enough memory allocated for aux data."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_even_and_odd_seq_len() 
+{ + const char *seq_even = "TGGACTACGA"; + const char *seq_odd = "TGGACTACGAC"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, 0, 0, 0, 0, NULL, 0, 0, 0, + strlen(seq_even), seq_even, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq_even), "l_seq not set correctly."); + for (i = 0; i < strlen(seq_even); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq_even[i]], "seq not set correctly."); + } + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, 0, 0, 0, 0, NULL, 0, 0, 0, + strlen(seq_odd), seq_odd, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq_odd), "l_seq not set correctly."); + for (i = 0; i < strlen(seq_odd); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq_odd[i]], "seq not set correctly."); + } + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_with_seq_but_no_qual() +{ + const char *seq = "TGGACTACGA"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, + BAM_FUNMAP, 0, 0, 0, + 0, NULL, 0, 0, 0, + strlen(seq), seq, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq), "l_seq not set correctly."); + for (i = 0; i < strlen(seq); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq[i]], "seq not set correctly."); + VERIFY(bam_get_qual(bam)[i] == 0xff, "qual not set correctly"); + } + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_qname() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // qname too long + const char too_long[255] = { 'A' }; + r = bam_set1(bam, sizeof(too_long), too_long, BAM_FUNMAP, -1, 0, 
0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_seq() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // seq too long + const char *sequence = "C"; + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, (size_t)INT32_MAX + 1, sequence, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_cigar() +{ + const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // mapped query must have a CIGAR + r = bam_set1(bam, 0, NULL, 0, -1, 0, 0xff, 0, NULL, -1, 0, 0, strlen(seq), seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // pos + ref len from CIGAR should be <= HTS_POS_MAX + r = bam_set1(bam, 0, NULL, 0, -1, HTS_POS_MAX - 10, 0xff, sizeof(cigar) / 4, cigar, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // query len from CIGAR should match the sequence length + r = bam_set1(bam, 0, NULL, 0, -1, 0, 0xff, sizeof(cigar) / 4, cigar, -1, 0, 0, strlen(seq), seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_size_limits() +{ + const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + + int r; + bam1_t *bam = 
NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // very long sequence. each base counts for 1/2 byte of sequence data and + // 1 byte of sequence quality data. the sum of all components may not exceed + // INT32_MAX, which is the maximum possible value that can be stored in l_data. + // In this case the 4 bytes of qname will cause it to overflow. + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 2 * (size_t)INT32_MAX / 3, seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // very long CIGAR + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, (size_t)INT32_MAX / 4, cigar, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // very long aux + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, INT32_MAX); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_write_and_read_back() +{ + const char *qname = "q1"; + const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + const char *qual = "DBBBB+=7=0"; + + int r; + htsFile *writer = NULL, *reader = NULL; + sam_hdr_t *w_header = NULL, *r_header = NULL; + bam1_t *w_bam = NULL, *r_bam = NULL; + kstring_t ks = KS_INITIALIZE; + + // open file for writing + writer = hts_open("test_bam_set1_write_and_read_back.bam", "wb"); + VERIFY(writer != NULL, "failed to open bam file for writing."); + + // write header + w_header = bam_hdr_init(); + VERIFY(w_header != NULL, "failed to initialize bam header."); + r = sam_hdr_add_line(w_header, "SQ", "SN", "t1", "LN", "5000", NULL); + VERIFY(r == 0, "failed to add SQ header 
line."); + r = sam_hdr_write(writer, w_header); + VERIFY(r == 0, "failed to write bam header."); + + // write alignments + w_bam = bam_init1(); + VERIFY(w_bam != NULL, "failed to initialize BAM struct."); + r = bam_set1(w_bam, strlen(qname), qname, + BAM_FREVERSE, 0, 1000, 42, + sizeof(cigar) / 4, cigar, 0, 2000, 3000, + strlen(seq), seq, qual, 64); + VERIFY(r >= 0, "call to bam_set1() failed."); + r = sam_write1(writer, w_header, w_bam); + VERIFY(r >= 0, "failed to write alignment."); + bam_destroy1(w_bam); + + // close file + r = hts_close(writer); + VERIFY(r == 0, "failed to close bam file for writing."); + sam_hdr_destroy(w_header); + + // open file for reading + reader = hts_open("test_bam_set1_write_and_read_back.bam", "rb"); + VERIFY(reader != NULL, "failed to open bam file for reading."); + + // read header + r_header = sam_hdr_read(reader); + VERIFY(r_header != NULL, "failed to read bam header."); + r = sam_hdr_find_tag_id(r_header, "SQ", NULL, NULL, "SN", &ks); + VERIFY(r == 0, "failed to read SQ/SN value"); + VERIFY(strcmp(ks_c_str(&ks), "t1") == 0, "expected reference sequence name in the header == 't1'"); + VERIFY(r_header->n_targets == 1, "expected number of reference sequences == 1"); + VERIFY(strcmp(r_header->target_name[0], "t1") == 0, "expected reference sequence name == 't1'"); + VERIFY(r_header->target_len[0] == 5000, "expected reference sequence length == 5000"); + + // read alignments + r_bam = bam_init1(); + VERIFY(r_bam != NULL, "failed to initialize BAM struct."); + r = sam_read1(reader, r_header, r_bam); + VERIFY(r >= 0, "failed to read alignment."); + VERIFY(strcmp(bam_get_qname(r_bam), qname) == 0, "qname does not match."); + VERIFY(r_bam->core.n_cigar == sizeof(cigar) / 4, "cigar length does not match."); + VERIFY(memcmp(bam_get_cigar(r_bam), cigar, sizeof(cigar)) == 0, "cigar data does not match."); + VERIFY(r_bam->core.l_qseq == strlen(seq), "sequence length does not match."); + + r = sam_read1(reader, r_header, r_bam); + VERIFY(r < 
0, "expected no more alignments."); + bam_destroy1(r_bam); + + // close file + r = hts_close(reader); + VERIFY(r == 0, "failed to close bam file for reading."); + sam_hdr_destroy(r_header); + +cleanup: + ks_free(&ks); +} + int main(int argc, char **argv) { int i; @@ -1864,5 +2175,16 @@ int main(int argc, char **argv) set_qname(); for (i = 1; i < argc; i++) faidx1(argv[i]); + hts_set_log_level(HTS_LOG_OFF); + test_bam_set1_minimal(); + test_bam_set1_full(); + test_bam_set1_even_and_odd_seq_len(); + test_bam_set1_with_seq_but_no_qual(); + test_bam_set1_validate_qname(); + test_bam_set1_validate_seq(); + test_bam_set1_validate_cigar(); + test_bam_set1_validate_size_limits(); + test_bam_set1_write_and_read_back(); + return status; } From 90f27abcb669373e8af7cfeca22951109e0e102b Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 23 Oct 2020 15:21:51 +0100 Subject: [PATCH 011/114] Use temporary filename for bam_set1() test file The bam_set1() tests create a temporary BAM file. Rename it to match test/*.tmp.* so that it is automatically git-ignored and cleaned. 
--- test/sam.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/sam.c b/test/sam.c index 8a6001840..af1dc34da 100644 --- a/test/sam.c +++ b/test/sam.c @@ -2075,6 +2075,7 @@ static void test_bam_set1_write_and_read_back() const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; const char *seq = "TGGACTACGA"; const char *qual = "DBBBB+=7=0"; + const char *temp_fname = "test/test_bam_set1_write_and_read_back.tmp.bam"; int r; htsFile *writer = NULL, *reader = NULL; @@ -2083,7 +2084,7 @@ static void test_bam_set1_write_and_read_back() kstring_t ks = KS_INITIALIZE; // open file for writing - writer = hts_open("test_bam_set1_write_and_read_back.bam", "wb"); + writer = hts_open(temp_fname, "wb"); VERIFY(writer != NULL, "failed to open bam file for writing."); // write header @@ -2112,7 +2113,7 @@ static void test_bam_set1_write_and_read_back() sam_hdr_destroy(w_header); // open file for reading - reader = hts_open("test_bam_set1_write_and_read_back.bam", "rb"); + reader = hts_open(temp_fname, "rb"); VERIFY(reader != NULL, "failed to open bam file for reading."); // read header From ca25304f29a5f94198c186b3a47eceab18416f63 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 2 Nov 2020 16:02:24 +0000 Subject: [PATCH 012/114] Update Windows image version in .appveyor.yml Mainly to fix pacman, which broke following changes to msys2 packaging (see https://www.msys2.org/news/#2020-06-29-new-packagers) Prior to this we were using the default image (Visual Studio 2015) which has not had the necessary updates to make pacman work. 
--- .appveyor.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 14145cf1b..6f94944bc 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -18,6 +18,9 @@ skip_tags: true # - docs/* # - '**/*.html' +# Appveyor Windows images are based on Visual studio version +image: Visual Studio 2019 + # We use Mingw/Msys, so use pacman for installs install: - set HOME=. From b211b4c04dccad6f0dc0b9865be7d0210f2a6957 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 2 Nov 2020 16:50:59 +0000 Subject: [PATCH 013/114] Fix a long-standing typo in hts_lrand48. The return type of the definition was wrong, causing callers using the function to get incorrect results in some cases, mainly depending on the platform and compiler options used. Note: this function is not used by htslib. It was only added for reasons of completeness when we were putting in hts_drand48, used by htslib/ksort.h. --- hts_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hts_os.c b/hts_os.c index 359392173..c26700975 100644 --- a/hts_os.c +++ b/hts_os.c @@ -49,7 +49,7 @@ HTSLIB_EXPORT double hts_drand48(void) { return drand48(); } HTSLIB_EXPORT -double hts_lrand48(void) { return lrand48(); } +long hts_lrand48(void) { return lrand48(); } #endif // // On Windows when using the MSYS or Cygwin terminals, isatty fails From 21b677e52ea98219c30f8a0c6bd3d973210d361f Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 5 Nov 2020 14:52:48 +0000 Subject: [PATCH 014/114] Add Cirrus-CI integration (PR #1175) --- .cirrus.yml | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 .cirrus.yml diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 000000000..6b5ff0b98 --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,166 @@ +# Note we have a maximum of 16 CPUs available, so adjust our +# builds so we can start all concurrently without needing to schedule. 
+ +# Sadly though there is still a finite limit to macOS of one instance. +# Can we cull our Mac test to just one instance? + +timeout_in: 10m + +#-------------------------------------------------- +# Template: build libdeflate dependency + +libdeflate_template: &LIBDEFLATE + libdeflate_script: | + if test "x$USE_LIBDEFLATE" == "xyes"; then + pushd "$HOME" + git clone --depth 1 https://github.com/ebiggers/libdeflate.git + pushd libdeflate + make -j 4 CFLAGS='-fPIC -O3' libdeflate.a + popd + popd + fi + +#-------------------------------------------------- +# Template: compile and test + +compile_template: &COMPILE + compile_script: | + if test "x$USE_LIBDEFLATE" = "xyes"; then + CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' + else + CONFIG_OPTS='--without-libdeflate' + fi + if test "$USE_CONFIG" = "yes"; then + MAKE_OPTS= + autoreconf + eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ + ( cat config.log; false ) + else + MAKE_OPTS=-e + fi + if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then + make maintainer-check + fi + make -j 4 $MAKE_OPTS + +test_template: &TEST + test_script: | + make test-shlib-exports + make test + + +#-------------------------------------------------- +# Task: linux builds. 
+ +# Debian + latest GCC +gcc_task: + name: debian-gcc + container: + image: gcc:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + DO_MAINTAINER_CHECKS: yes + USE_CONFIG: no + - environment: + USE_CONFIG: yes + CFLAGS: -std=c99 -pedantic + USE_LIBDEFLATE: yes + + << : *LIBDEFLATE + << : *COMPILE + << : *TEST + + +# Ubuntu + Clang +ubuntu_task: + name: ubuntu-clang + container: + #image: ubuntu:latest # use << : *LIBDEFLATE + image: ubuntu:devel + cpu: 2 + memory: 1G + + environment: + CC: clang + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + USE_CONFIG: yes + - environment: + USE_CONFIG: yes + CFLAGS: -g -Wall -O3 -fsanitize=address + LDFLAGS: -fsanitize=address + USE_LIBDEFLATE: yes + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates clang libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + libdeflate-dev + + << : *COMPILE + << : *TEST + + +# CentOS +centos_task: + name: centos-gcc + container: + image: centos:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + USE_CONFIG: yes + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. 
+ install_script: | + yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \ + bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \ + diffutils + + << : *COMPILE + << : *TEST + +#-------------------------------------------------- +# Task: macOS builds + +macosx_task: + name: macosx + clang + osx_instance: + image: catalina-base + + environment: + CC: clang + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + USE_CONFIG: no + - environment: + USE_CONFIG: yes + USE_LIBDEFLATE: yes + + package_install_script: + - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz + + << : *LIBDEFLATE + << : *COMPILE + << : *TEST + From 0ad12d7593ada731e0ff20c8bcd027d31dfb98fa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 28 Oct 2020 12:13:30 +0000 Subject: [PATCH 015/114] Fix assumption of pthread_t being a non-structure. Replaces #1154 See also #1153 It does this by the addition of a separate variable to do the boolean check on instead of fd->dispatcher itself. I deemed this easier to understand than overloading the interpretation of fd->h being set plus I'm unsure of the potential failure case in sam_hdr_fill_hrecs where fd->h has been set but we errored before creating the threads. Also improves error recovering in case of pthread creation failure, avoiding a false pthread_join later. Thanks to John Marshall and Anders Kaplan for identifying that issue. 
--- sam.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index 49b1a0f3f..9127e1ea4 100644 --- a/sam.c +++ b/sam.c @@ -2377,6 +2377,7 @@ typedef struct SAM_state { pthread_mutex_t lines_m; hts_tpool_process *q; pthread_t dispatcher; + int dispatcher_set; sp_lines *lines; sp_bams *bams; @@ -2462,7 +2463,7 @@ int sam_state_destroy(htsFile *fp) { if (fd->q) hts_tpool_wake_dispatch(fd->q); // unstick the reader - if (!fp->is_write && fd->q && fd->dispatcher) { + if (!fp->is_write && fd->q && fd->dispatcher_set) { for (;;) { // Avoid deadlocks with dispatcher if (fd->command == SAM_CLOSE_DONE) @@ -2502,7 +2503,8 @@ int sam_state_destroy(htsFile *fp) { } // Wait for it to acknowledge - pthread_join(fd->dispatcher, NULL); + if (fd->dispatcher_set) + pthread_join(fd->dispatcher, NULL); if (!ret) ret = -fd->errcode; } @@ -3110,8 +3112,10 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) return -2; // We can only do this once we've got a header - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, fp) != 0) + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, + fp) != 0) return -2; + fd->dispatcher_set = 1; } if (fd->h != h) { @@ -3299,13 +3303,15 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) // destroy it later on and sam_hdr_destroy takes non-const. // // We do this because some tools do sam_hdr_destroy; sam_close - // while others do sam_close; sam_hdr_destroy. The former is an - // issue as we need the header still when flushing. + // while others do sam_close; sam_hdr_destroy. The former is + // an issue as we need the header still when flushing. 
fd->h = (sam_hdr_t *)h; fd->h->ref_count++; - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, fp) != 0) + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, + fp) != 0) return -2; + fd->dispatcher_set = 1; } if (fd->h != h) { From b710781df0d9bb6f665756316653554fe32e5c68 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 4 Nov 2020 18:52:24 +0000 Subject: [PATCH 016/114] Fix memory leak on bad input in vcf_parse_info() Credit to OSS-Fuzz Fixes oss-fuzz 26968 --- vcf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vcf.c b/vcf.c index 89ee90c20..a1aa55525 100644 --- a/vcf.c +++ b/vcf.c @@ -2709,7 +2709,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; - return -1; + goto fail; } val = end = 0; c = *r; *r = 0; @@ -2736,7 +2736,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; - return -1; + goto fail; } } uint32_t y = kh_val(d, k).info[BCF_HL_INFO]; @@ -2757,7 +2757,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (!a_tmp) { hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // No appropriate code? 
- return -1; + goto fail; } a_val = a_tmp; max_n_val = n_val; @@ -2854,6 +2854,10 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p free(a_val); return 0; + + fail: + free(a_val); + return -1; } int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) From 7f8211cd76a59ba870566a430a69287ee657dd4e Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 23 Oct 2020 15:06:47 +0100 Subject: [PATCH 017/114] Prevent GCC -Wextra switch/case statement "fall through" warnings Add comments indicating intentional fall-throughs. Adjust comment in textutils_internal.h to match GCC's -Wimplicit-fallthrough[=3] pattern. Annotate several functions that call exit() as HTS_NORETURN so the compiler knows they won't actually fall through in any switch statements they appear in. (Clang understands only not-yet-standard attributes and annotations, not comment text, so there's little point in catering to its -Wimplicit- fallthrough warning, which isn't included in -Wall/-Wextra in Clang.) 
--- cram/cram_codecs.c | 28 ++++++++++++++-------------- cram/rANS_static.c | 9 ++++++--- hts.c | 5 +++-- tabix.c | 6 ++++-- test/test_index.c | 2 +- textutils_internal.h | 2 +- 6 files changed, 29 insertions(+), 23 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 9f339c723..80131b633 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -202,20 +202,20 @@ static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { } switch(nbits) { -// case 15: GET_BIT_MSB(block, val); -// case 14: GET_BIT_MSB(block, val); -// case 13: GET_BIT_MSB(block, val); -// case 12: GET_BIT_MSB(block, val); -// case 11: GET_BIT_MSB(block, val); -// case 10: GET_BIT_MSB(block, val); -// case 9: GET_BIT_MSB(block, val); - case 8: GET_BIT_MSB(block, val); - case 7: GET_BIT_MSB(block, val); - case 6: GET_BIT_MSB(block, val); - case 5: GET_BIT_MSB(block, val); - case 4: GET_BIT_MSB(block, val); - case 3: GET_BIT_MSB(block, val); - case 2: GET_BIT_MSB(block, val); +// case 15: GET_BIT_MSB(block, val); // fall through +// case 14: GET_BIT_MSB(block, val); // fall through +// case 13: GET_BIT_MSB(block, val); // fall through +// case 12: GET_BIT_MSB(block, val); // fall through +// case 11: GET_BIT_MSB(block, val); // fall through +// case 10: GET_BIT_MSB(block, val); // fall through +// case 9: GET_BIT_MSB(block, val); // fall through + case 8: GET_BIT_MSB(block, val); // fall through + case 7: GET_BIT_MSB(block, val); // fall through + case 6: GET_BIT_MSB(block, val); // fall through + case 5: GET_BIT_MSB(block, val); // fall through + case 4: GET_BIT_MSB(block, val); // fall through + case 3: GET_BIT_MSB(block, val); // fall through + case 2: GET_BIT_MSB(block, val); // fall through case 1: GET_BIT_MSB(block, val); break; diff --git a/cram/rANS_static.c b/cram/rANS_static.c index b58fe7189..584f8b561 100644 --- a/cram/rANS_static.c +++ b/cram/rANS_static.c @@ -157,9 +157,9 @@ unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size, 
RansEncInit(&rans3); switch (i=(in_size&3)) { - case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); - case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); - case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); + case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); // fall through + case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); // fall through + case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); // fall through case 0: break; } @@ -348,10 +348,13 @@ unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size, switch(out_sz&3) { case 3: out_buf[out_end+2] = D.R[RansDecGet(&R[2], TF_SHIFT)]; + // fall through case 2: out_buf[out_end+1] = D.R[RansDecGet(&R[1], TF_SHIFT)]; + // fall through case 1: out_buf[out_end] = D.R[RansDecGet(&R[0], TF_SHIFT)]; + // fall through default: break; } diff --git a/hts.c b/hts.c index 278ecd36a..1ab89400e 100644 --- a/hts.c +++ b/hts.c @@ -784,8 +784,8 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { // NB: Doesn't support floats, eg 1.5g // TODO: extend hts_parse_decimal? See also samtools sort. switch (*endp) { - case 'g': case 'G': o->val.i *= 1024; - case 'm': case 'M': o->val.i *= 1024; + case 'g': case 'G': o->val.i *= 1024; // fall through + case 'm': case 'M': o->val.i *= 1024; // fall through case 'k': case 'K': o->val.i *= 1024; break; case '\0': break; default: @@ -2731,6 +2731,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) switch (tid) { case HTS_IDX_NONE: iter->finished = 1; + // fall through case HTS_IDX_START: case HTS_IDX_REST: iter->curr_off = t_off; diff --git a/tabix.c b/tabix.c index 01cbf801c..3013aa550 100644 --- a/tabix.c +++ b/tabix.c @@ -52,7 +52,8 @@ typedef struct } args_t; -HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error(const char *format, ...) +static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error(const char *format, ...) 
{ va_list ap; fflush(stdout); @@ -63,7 +64,8 @@ HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error(const char *format, ...) exit(EXIT_FAILURE); } -HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error_errno(const char *format, ...) +static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error_errno(const char *format, ...) { va_list ap; int eno = errno; diff --git a/test/test_index.c b/test/test_index.c index 402879666..cc90f51fe 100644 --- a/test/test_index.c +++ b/test/test_index.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/sam.h" #include "../htslib/vcf.h" -void usage(FILE *fp) { +void HTS_NORETURN usage(FILE *fp) { fprintf(fp, "Usage: test_index [opts] in.{sam.gz,bam,cram}|in.{vcf.gz,bcf}\n\n"); fprintf(fp, " -b Use BAI index (BAM, SAM)\n"); fprintf(fp, " -c Use CSI index (BAM, SAM, VCF, BCF)\n"); diff --git a/textutils_internal.h b/textutils_internal.h index 7e8628658..4b120bdbc 100644 --- a/textutils_internal.h +++ b/textutils_internal.h @@ -355,7 +355,7 @@ static inline double hts_str2dbl(const char *in, char **end, int *failed) { case '0': if (v[1] != 'x' && v[1] != 'X') break; - // else fall through (hex number) + // else fall through - hex number default: // Non numbers, like NaN, Inf From 9a55e4e74829593722b1a08a5c2cc414a57e1a25 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 27 Oct 2020 13:45:12 +0000 Subject: [PATCH 018/114] Extract the CIGAR parsing logic into separate methods for each of the two most common use cases: independent uint32_t array and bam1_t record. Add documentation. --- htslib/sam.h | 23 ++++++++ sam.c | 125 ++++++++++++++++++++++++++++++++++++----- test/mpileup/mp_N2.sam | 2 +- 3 files changed, 134 insertions(+), 16 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 8fcf819bf..d08646f13 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1103,6 +1103,29 @@ char *bam_flag2str(int flag); /** The string must be freed by the user */ HTSLIB_EXPORT int bam_set_qname(bam1_t *b, const char *qname); +/*! 
@function + @abstract Parse a CIGAR string into a uint32_t array + @param in [in] pointer to the source string + @param end [out] address of the pointer to the new end of the input string + can be NULL + @param a_cigar [out] address of the destination uint32_t buffer + @param a_mem [in/out] address of the allocated number of buffer elements + @return number of processed CIGAR operators; 0 if error + */ +HTSLIB_EXPORT +size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem); + +/*! @function + @abstract Parse a CIGAR string into a bam1_t struct + @param in [in] pointer to the source string + @param end [out] address of the pointer to the new end of the input string + can be NULL + @param b [in/out] address of the destination bam1_t struct + @return number of processed CIGAR operators; 0 if error + */ +HTSLIB_EXPORT +size_t bam_parse_cigar(const char *in, char **end, bam1_t *b); + /************************* *** BAM/CRAM indexing *** *************************/ diff --git a/sam.c b/sam.c index 9127e1ea4..90180be66 100644 --- a/sam.c +++ b/sam.c @@ -2148,22 +2148,13 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) if (*p++ != '\t') goto err_ret; // cigar if (*p != '*') { - uint32_t *cigar; - size_t n_cigar = 0; - for (q = p; *p && *p != '\t'; ++p) - if (!isdigit_c(*p)) ++n_cigar; - if (*p++ != '\t') goto err_ret; - _parse_err(n_cigar == 0, "no CIGAR operations"); - _parse_err(n_cigar >= 2147483647, "too many CIGAR operations"); + uint32_t *cigar = NULL; + int old_l_data = b->l_data; + uint32_t n_cigar = bam_parse_cigar(p, &p, b); + if (!n_cigar || *p++ != '\t') goto err_ret; + cigar = (uint32_t *)(b->data + old_l_data); c->n_cigar = n_cigar; - _get_mem(uint32_t, &cigar, b, c->n_cigar * sizeof(uint32_t)); - for (i = 0; i < c->n_cigar; ++i) { - int op; - cigar[i] = hts_str2uint(q, &q, 28, &overflow)<flag&BAM_FUNMAP))? 
bam_cigar2rlen(c->n_cigar, cigar) : 1; if (cigreflen == 0) cigreflen = 1; @@ -2328,6 +2319,110 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) return -2; } +static uint32_t read_ncigar(const char *in) { + uint32_t n_cigar = 0; + char *q = (char *)in; + for (; *q && *q != '\t'; ++q) + if (!isdigit_c(*q)) ++n_cigar; + if (!n_cigar) { + hts_log_error("No CIGAR operations"); + return 0; + } + if (n_cigar >= 2147483647) { + hts_log_error("Too many CIGAR operations"); + return 0; + } + + return n_cigar; +} + +/*! @function + @abstract Parse a CIGAR string into preallocated a uint32_t array + @param in [in] pointer to the source string + @param a_cigar [out] address of the destination uint32_t buffer + @return number of processed input characters; 0 if error + */ +static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { + int i, overflow = 0; + char *p, *q = (char *)in; + for (i = 0; i < n_cigar; i++) { + uint32_t len; + int op; + p = q; + len = hts_str2uint(q, &q, 28, &overflow)< *a_mem) { + uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar)); + if (a_tmp) { + *a_cigar = a_tmp; + *a_mem = n_cigar; + } else { + hts_log_error("Memory allocation error"); + return 0; + } + } + + if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return 0; + if (end) *end = (char *)in+diff; + + return n_cigar; +} + +size_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { + size_t n_cigar = 0; + int diff; + + if (!in || !b) { + hts_log_error("NULL pointer arguments"); + return 0; + } + if (end) *end = (char *)in; + + n_cigar = read_ncigar(in); + if (!n_cigar) return 0; + if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) { + hts_log_error("Memory allocation error"); + return 0; + } + + if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return 0; + b->l_data += (n_cigar * sizeof(uint32_t)); + if (end) *end = (char *)in+diff; + + return n_cigar; +} + /* * 
----------------------------------------------------------------------------- * SAM threading diff --git a/test/mpileup/mp_N2.sam b/test/mpileup/mp_N2.sam index 292cfcb58..8ea072db9 100644 --- a/test/mpileup/mp_N2.sam +++ b/test/mpileup/mp_N2.sam @@ -40,7 +40,7 @@ @CO sD1 0 z 1 0 4M2I5D2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL sD2 0 z 1 0 4M1I1P5D1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL -sD3 0 z 1 0 4M1P1II5D1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL +sD3 0 z 1 0 4M1P1I5D1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL sN1 0 z 1 0 4M2I5N2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL sN2 0 z 1 0 4M1I1P5N1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL sN3 0 z 1 0 4M1P1I5N1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL From 51275bcbda6d1e0849d0e50d0edd13814d38ebd1 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 10 Nov 2020 12:38:36 +0000 Subject: [PATCH 019/114] Make bam_itr_next an alias for sam_itr_next. --- htslib/sam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/sam.h b/htslib/sam.h index d08646f13..1ccad8776 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1135,7 +1135,7 @@ size_t bam_parse_cigar(const char *in, char **end, bam1_t *b); #define bam_itr_destroy(iter) hts_itr_destroy(iter) #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end) #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region) -#define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) +#define bam_itr_next(htsfp, itr, r) sam_itr_next((htsfp), (itr), (r)) // Load/build .csi or .bai BAM index file. Does not work with CRAM. // It is recommended to use the sam_index_* functions below instead. From 8c7eccfc8670f4ea23122eb10f70fc491ccc805f Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Wed, 25 Nov 2020 16:07:33 +0000 Subject: [PATCH 020/114] Adjust parser for undefined CIGAR. 
--- htslib/sam.h | 11 ++++++----- sam.c | 30 +++++++++++++++++++----------- test/sam.c | 23 +++++++++++++++++++++++ 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 1ccad8776..bfd7855a6 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include "hts.h" #include "hts_endian.h" @@ -1108,12 +1109,12 @@ int bam_set_qname(bam1_t *b, const char *qname); @param in [in] pointer to the source string @param end [out] address of the pointer to the new end of the input string can be NULL - @param a_cigar [out] address of the destination uint32_t buffer + @param a_cigar [in/out] address of the destination uint32_t buffer @param a_mem [in/out] address of the allocated number of buffer elements - @return number of processed CIGAR operators; 0 if error + @return number of processed CIGAR operators; -1 on error */ HTSLIB_EXPORT -size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem); +ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem); /*! 
@function @abstract Parse a CIGAR string into a bam1_t struct @@ -1121,10 +1122,10 @@ size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t @param end [out] address of the pointer to the new end of the input string can be NULL @param b [in/out] address of the destination bam1_t struct - @return number of processed CIGAR operators; 0 if error + @return number of processed CIGAR operators; -1 on error */ HTSLIB_EXPORT -size_t bam_parse_cigar(const char *in, char **end, bam1_t *b); +ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b); /************************* *** BAM/CRAM indexing *** diff --git a/sam.c b/sam.c index 90180be66..467b270c3 100644 --- a/sam.c +++ b/sam.c @@ -2150,8 +2150,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) if (*p != '*') { uint32_t *cigar = NULL; int old_l_data = b->l_data; - uint32_t n_cigar = bam_parse_cigar(p, &p, b); - if (!n_cigar || *p++ != '\t') goto err_ret; + int n_cigar = bam_parse_cigar(p, &p, b); + if (n_cigar < 1 || *p++ != '\t') goto err_ret; cigar = (uint32_t *)(b->data + old_l_data); c->n_cigar = n_cigar; @@ -2340,7 +2340,7 @@ static uint32_t read_ncigar(const char *in) { @abstract Parse a CIGAR string into preallocated a uint32_t array @param in [in] pointer to the source string @param a_cigar [out] address of the destination uint32_t buffer - @return number of processed input characters; 0 if error + @return number of processed input characters; 0 on error */ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { int i, overflow = 0; @@ -2370,16 +2370,20 @@ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { return q-in; } -size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem) { +ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) { size_t n_cigar = 0; int diff; if (!in || !a_cigar || !a_mem) { hts_log_error("NULL pointer arguments"); - return 0; + return -1; 
} if (end) *end = (char *)in; + if (*in == '*') { + if (end) (*end)++; + return 0; + } n_cigar = read_ncigar(in); if (!n_cigar) return 0; if (n_cigar > *a_mem) { @@ -2389,34 +2393,38 @@ size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem = n_cigar; } else { hts_log_error("Memory allocation error"); - return 0; + return -1; } } - if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return 0; + if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1; if (end) *end = (char *)in+diff; return n_cigar; } -size_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { +ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { size_t n_cigar = 0; int diff; if (!in || !b) { hts_log_error("NULL pointer arguments"); - return 0; + return -1; } if (end) *end = (char *)in; + if (*in == '*') { + if (end) (*end)++; + return 0; + } n_cigar = read_ncigar(in); if (!n_cigar) return 0; if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) { hts_log_error("Memory allocation error"); - return 0; + return -1; } - if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return 0; + if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1; b->l_data += (n_cigar * sizeof(uint32_t)); if (end) *end = (char *)in+diff; diff --git a/test/sam.c b/test/sam.c index af1dc34da..b6f6c0e04 100644 --- a/test/sam.c +++ b/test/sam.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include // Suppress message for faidx_fetch_nseq(), which we're intentionally testing #include "../htslib/hts_defs.h" @@ -2149,6 +2150,27 @@ static void test_bam_set1_write_and_read_back() ks_free(&ks); } +static void test_cigar_api(void) +{ + uint32_t *buf = NULL; + char *cig = "*"; + char *end; + size_t m = 0; + int n; + n = sam_parse_cigar(cig, &end, &buf, &m); + VERIFY(n == 0 && m == 0 && (end-cig) == 1, "failed to parse undefined CIGAR"); + cig = "2M3X1I10M5D"; + n = sam_parse_cigar(cig, &end, &buf, &m); + VERIFY(n == 5 && m > 0 && (end-cig) == 11, "failed to parse CIGAR string: 2M3X1I10M5D"); + n = sam_parse_cigar("722M15D187217376188323783284M67I", NULL, &buf, &m); + VERIFY(n == -1, "failed to flag CIGAR string with long op length: 722M15D187217376188323783284M67I"); + n = sam_parse_cigar("53I722MD8X", NULL, &buf, &m); + VERIFY(n == -1, "failed to flag CIGAR string with no op length: 53I722MD8X"); + +cleanup: + free(buf); +} + int main(int argc, char **argv) { int i; @@ -2186,6 +2208,7 @@ int main(int argc, char **argv) test_bam_set1_validate_cigar(); test_bam_set1_validate_size_limits(); test_bam_set1_write_and_read_back(); + test_cigar_api(); return status; } From 2056490488b81169bf69beb5ac835cd22dc74a42 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 26 Nov 2020 10:26:03 +0000 Subject: [PATCH 021/114] Recode to use const char pointers (cherry picked from commit a12d4d447cd3ca089db05dcc35144818339a4911) --- sam.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sam.c b/sam.c index 467b270c3..eb2712965 100644 --- a/sam.c +++ b/sam.c @@ -2319,9 +2319,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) return -2; } -static uint32_t read_ncigar(const char *in) { +static uint32_t read_ncigar(const char *q) { uint32_t n_cigar = 0; - char *q = (char *)in; for (; *q && *q != '\t'; ++q) if (!isdigit_c(*q)) ++n_cigar; if (!n_cigar) { @@ -2344,12 +2343,12 @@ static uint32_t 
read_ncigar(const char *in) { */ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { int i, overflow = 0; - char *p, *q = (char *)in; + const char *p = in; for (i = 0; i < n_cigar; i++) { uint32_t len; int op; - p = q; - len = hts_str2uint(q, &q, 28, &overflow)< Date: Tue, 1 Dec 2020 10:24:33 +0000 Subject: [PATCH 022/114] Fix the bgzf_idx_flush assertion. (PR #1168) The assumption is that we call bgzf_idx_flush once per outgoing multi-threaded block, and the block numbers will match. The assertion is to validate we're not indexing out of order. However, the test in the assertion was the wrong way round. Very long records (eg with huge aux tags or long seqs) can mean a single record spans multiple blocks and we inherently then skip blocks in bgzf_idx_push calls. What it actually needs to catch is cases where the blocks have been written out before the associated index entries have been added. Fixes samtools/samtools#1328 --- bgzf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bgzf.c b/bgzf.c index 09f18a643..ded3884e4 100644 --- a/bgzf.c +++ b/bgzf.c @@ -273,7 +273,7 @@ static int bgzf_idx_flush(BGZF *fp) { hts_idx_cache_entry *e = mt->idx_cache.e; int i; - assert(mt->idx_cache.nentries == 0 || mt->block_written >= e[0].block_number); + assert(mt->idx_cache.nentries == 0 || mt->block_written <= e[0].block_number); for (i = 0; i < mt->idx_cache.nentries && e[i].block_number == mt->block_written; i++) { if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end, From 7ad1ec2b4d1de9f04cfe827539446b6de8548a54 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 6 Dec 2020 20:12:16 +0000 Subject: [PATCH 023/114] Use $(ALL_CPPFLAGS), similarly to samtools's Makefile This extra level of indirection simplifies making additions to the CPPFLAGS used during compilation without interfering with the user's setting for $CPPFLAGS. Fix related typo in the MSYS/MinGW part of configure.ac. 
--- Makefile | 6 ++++-- config.mk.in | 2 +- configure.ac | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 245b7a16c..2adc21d25 100644 --- a/Makefile +++ b/Makefile @@ -103,6 +103,8 @@ BUILT_THRASH_PROGRAMS = \ all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \ htslib_static.mk htslib-uninstalled.pc +ALL_CPPFLAGS = -I. $(CPPFLAGS) + HTSPREFIX = include htslib_vars.mk @@ -133,10 +135,10 @@ show-version: .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) -I. $(CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) -I. $(CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ diff --git a/config.mk.in b/config.mk.in index 976e557df..f8decf0a2 100644 --- a/config.mk.in +++ b/config.mk.in @@ -94,7 +94,7 @@ pluginpath = @pluginpath@ LIBHTS_OBJS += plugin.o PLUGIN_OBJS += $(plugin_OBJS) -plugin.o plugin.pico: CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\" +plugin.o plugin.pico: ALL_CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\" # When built as separate plugins, these record their version themselves. hfile_gcs.o hfile_gcs.pico: version.h diff --git a/configure.ac b/configure.ac index ec137b75d..9bd1642d7 100644 --- a/configure.ac +++ b/configure.ac @@ -157,7 +157,7 @@ case $host_alias in # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPCFLAGS -D_XOPEN_SOURCE=600" + CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" ;; *) host_result="plain .so" From b581944da4015182a6a9a9bc325ee15b002cba61 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Mon, 7 Dec 2020 15:30:07 +0000 Subject: [PATCH 024/114] Set lines to be skipped independently. 
--- tabix.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tabix.c b/tabix.c index 3013aa550..36c414798 100644 --- a/tabix.c +++ b/tabix.c @@ -498,6 +498,7 @@ int main(int argc, char *argv[]) memset(&args,0,sizeof(args_t)); args.cache_megs = 10; args.download_index = 1; + int32_t new_line_skip = -1; static const struct option loptions[] = { @@ -570,7 +571,7 @@ int main(int argc, char *argv[]) detect = 0; break; case 'S': - conf.line_skip = strtol(optarg,&tmp,10); + new_line_skip = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: -S %s\n", optarg); detect = 0; break; @@ -605,6 +606,9 @@ int main(int argc, char *argv[]) } } + if (new_line_skip >= 0) + conf.line_skip = new_line_skip; + if ( optind==argc ) return usage(stderr, EXIT_FAILURE); if ( list_chroms ) From 7ca2b49fc9200171c3ffdb2a8c403c845a286e9d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 5 Nov 2020 11:26:09 +0000 Subject: [PATCH 025/114] Don't link plugins with libhts.dylib on macOS PR #1072 changed plugin linking so that plugins are linked back to the dynamic libhts.so/.dylib, to facilitate use when libhts is itself dynamically dlopen()ed with RTLD_LOCAL, e.g., by the Python runtime which uses default dlopen() flags which on Linux means RTLD_LOCAL. This broke plugin loading on macOS when opening plugins in an executable in which libhts.a has been statically linked, as there were then two copies of the library globals (notably hfile.c::schemes), one from the executable's libhts.a and one from the plugin's libhts.NN.dylib. (The Linux loading model does not suffer from this issue.) The default dlopen() flag on macOS is RTLD_GLOBAL, so this can be fixed by reverting the change (on macOS only) and depending on the symbols supplied by a static libhts.a, a dynamically linked libhts.NN.dylib, or a RTLD_GLOBALly dlopen()ed libhts.NN.dylib. 
This rebreaks the case of dlopen()ing libhts on macOS while explicitly specifying RTLD_LOCAL, but this is not a common case. Fixes #1176. Disable the `plugins-dlhts -l` test case on macOS. Add a test of accessing plugins from an executable with a statically linked libhts.a (namely, htsfile) to test/test.pl. --- Makefile | 12 ++++++++++-- test/plugins-dlhts.c | 18 +++++++++++++++--- test/test.pl | 21 +++++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 2adc21d25..bd637eacc 100644 --- a/Makefile +++ b/Makefile @@ -309,11 +309,19 @@ hts-$(LIBHTS_SOVERSION).dll hts.dll.a: $(LIBHTS_OBJS) hts-object-files: $(LIBHTS_OBJS) touch $@ +# On Unix dlopen("libhts.so.NN", RTLD_LAZY) may default to RTLD_LOCAL. +# Hence plugins need to link to (shared) libhts.so.NN themselves, as they +# may not be able to access libhts symbols via the main program's libhts +# if that was dynamically loaded without an explicit RTLD_GLOBAL. %.so: %.pico libhts.so $(CC) -shared -Wl,-E $(LDFLAGS) -o $@ $< libhts.so $(LIBS) -lpthread -%.bundle: %.o libhts.dylib - $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< libhts.dylib $(LIBS) +# For programs *statically* linked to libhts.a, on macOS loading a plugin +# linked to a shared libhts.NN.dylib would lead to conflicting duplicate +# symbols. Fortunately macOS dlopen() defaults to RTLD_GLOBAL so there +# is less need for plugins to link back to libhts themselves. 
+%.bundle: %.o + $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< $(LIBS) %.cygdll: %.o libhts.dll.a $(CC) -shared $(LDFLAGS) -o $@ $< libhts.dll.a $(LIBS) diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c index aa98ef3f5..f90e3bd74 100644 --- a/test/plugins-dlhts.c +++ b/test/plugins-dlhts.c @@ -101,6 +101,7 @@ void verbose_log(const char *message) int main(int argc, char **argv) { int dlflags = RTLD_NOW; + int skip = 0; int c; while ((c = getopt(argc, argv, "glv")) >= 0) @@ -133,15 +134,26 @@ int main(int argc, char **argv) hclose_abruptly_p = (hclose_abruptly_func *) func(htslib, "hclose_abruptly"); test_hopen("bad-scheme:unsupported", 0); + +#ifdef __APPLE__ + /* Skip -l tests as we don't link plugins back to libhts on macOS, as this + would conflict with a statically linked libhts.a on this platform. */ + skip = (dlflags & RTLD_LOCAL) != 0; +#endif + + if (! skip) { #ifdef HAVE_LIBCURL - test_hopen("https://localhost:99999/invalid_port", 1); + test_hopen("https://localhost:99999/invalid_port", 1); #endif #ifdef ENABLE_GCS - test_hopen("gs:invalid", 1); + test_hopen("gs:invalid", 1); #endif #ifdef ENABLE_S3 - test_hopen("s3:invalid", 1); + test_hopen("s3:invalid", 1); #endif + } + else + verbose_log("Skipping most tests"); verbose_log("Calling hts_lib_shutdown()"); (func(htslib, "hts_lib_shutdown"))(); diff --git a/test/test.pl b/test/test.pl index 1fb6112bd..5db7b2320 100755 --- a/test/test.pl +++ b/test/test.pl @@ -57,6 +57,7 @@ test_convert_padded_header($opts); test_rebgzip($opts); test_logging($opts); +test_plugin_loading($opts); test_realn($opts); print "\nNumber of tests:\n"; @@ -936,6 +937,26 @@ sub test_logging else { passed($opts,$test); } } +sub test_plugin_loading { + my ($opts) = @_; + + my $test = "test_plugin_loading"; + + unless (-e "$$opts{bin}/hfile_libcurl.so" || -e "$$opts{bin}/hfile_libcurl.bundle") { + print "$test: .. 
skipping\n\n"; + return; + } + + # Test that plugins can be loaded from an executable statically linked to libhts.a + my $url = "https://localhost:99999/invalid_port"; + my $cmd = "HTS_PATH=$$opts{bin} $$opts{path}/with-shlib.sh $$opts{bin}/htsfile $url"; + print "$test:\n\t$cmd\n"; + my ($ret, $out) = _cmd("$cmd 2>&1"); + if ($ret == 0) { failed($opts, $test, "successful exit status"); } + elsif ($out =~ /couldn't register/i || $out =~ /not supported/i) { failed($opts, $test, $out); } + else { passed($opts, $test); } +} + sub test_realn { my ($opts) = @_; From 246c146f3f46d184b1dc3877ca35b16d13ee220a Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Thu, 17 Dec 2020 21:48:04 +1100 Subject: [PATCH 026/114] docs: fix simple typo, seperated -> separated There is a small typo in htslib/vcf.h. Should read `separated` rather than `seperated`. --- htslib/vcf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 659ddc7c5..6e476187a 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -182,7 +182,7 @@ typedef struct bcf_dec_t { int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change int n_flt; // Number of FILTER fields int *flt; // FILTER keys in the dictionary - char *id, *als; // ID and REF+ALT block (\0-seperated) + char *id, *als; // ID and REF+ALT block (\0-separated) char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated bcf_info_t *info; // INFO bcf_fmt_t *fmt; // FORMAT and individual sample From 78441c964312aba387d50dbe144f0ff6975820ae Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 1 Jan 2021 10:37:22 +0000 Subject: [PATCH 027/114] Don't set $host_alias as that confuses autoconf 2.70 Autoconf 2.70 is more careful about cross compilation, so with this version using AC_FUNC_MMAP implies AC_CANONICAL_HOST and hence computes $build/build_alias/host/host_alias/etc. Setting $host_alias ourselves interferes with that. 
Hat tip Matthias Klose (via debbug#978835). As autoconf 2.70 implicitly uses AC_CANONICAL_HOST, it requires (and its autoreconf --install installs) config.guess and config.sub. Ignore those, and ignore install-sh as well for good measure. --- .gitignore | 3 +++ configure.ac | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 363698f1c..76be407b5 100644 --- a/.gitignore +++ b/.gitignore @@ -11,12 +11,15 @@ autom4te.cache config.cache +config.guess config.h config.h.in config.log config.mk config.status +config.sub configure +install-sh hfile_*.bundle hfile_*.cygdll diff --git a/configure.ac b/configure.ac index 9bd1642d7..f473c97e5 100644 --- a/configure.ac +++ b/configure.ac @@ -137,9 +137,9 @@ AC_ARG_ENABLE([s3], [support Amazon AWS S3 URLs])], [], [enable_s3=check]) -test -n "$host_alias" || host_alias=unknown-`uname -s` -AC_MSG_CHECKING([shared library type for $host_alias]) -case $host_alias in +basic_host=${host_alias:-unknown-`uname -s`} +AC_MSG_CHECKING([shared library type for $basic_host]) +case $basic_host in *-cygwin* | *-CYGWIN*) host_result="Cygwin DLL" PLATFORM=CYGWIN From 44787d9800bba41e463e4db11c2280fac3cdbb7f Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Fri, 23 Oct 2020 21:00:41 +0200 Subject: [PATCH 028/114] Replaced bam_construct_seq() with bam_set1(). 
--- Makefile | 2 - cram/cram_decode.c | 41 +++++++-------- cram/cram_samtools.c | 123 ------------------------------------------- cram/cram_samtools.h | 23 -------- htslib.mk | 1 - 5 files changed, 18 insertions(+), 172 deletions(-) delete mode 100644 cram/cram_samtools.c diff --git a/Makefile b/Makefile index bd637eacc..e189ecb28 100644 --- a/Makefile +++ b/Makefile @@ -174,7 +174,6 @@ LIBHTS_OBJS = \ cram/cram_external.o \ cram/cram_index.o \ cram/cram_io.o \ - cram/cram_samtools.o \ cram/cram_stats.o \ cram/mFILE.o \ cram/open_trace_file.o \ @@ -367,7 +366,6 @@ cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) -cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h) $(sam_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 6aeb0aa37..ad09fb757 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2839,10 +2839,10 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, */ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, cram_record *cr, int rec, bam_seq_t **bam) { - int bam_idx, rg_len; + 
int ret, rg_len; char name_a[1024], *name; int name_len; - char *aux, *aux_orig; + char *aux; char *seq, *qual; sam_hrecs_t *bfd = sh->hrecs; @@ -2887,7 +2887,6 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, cr->len = 0; } - if (fd->required_fields & SAM_QUAL) { if (!BLOCK_DATA(s->qual_blk)) return -1; @@ -2896,41 +2895,37 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, qual = NULL; } - bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len, - name, name_len, - cr->flags, - cr->ref_id, - cr->apos, - cr->aend, - cr->mqual, - cr->ncigar, &s->cigar[cr->cigar], - cr->mate_ref_id, - cr->mate_pos, - cr->tlen, - cr->len, - seq, - qual); - if (bam_idx == -1) - return -1; + ret = bam_set1(*bam, + name_len, name, + cr->flags, cr->ref_id, cr->apos - 1, cr->mqual, + cr->ncigar, &s->cigar[cr->cigar], + cr->mate_ref_id, cr->mate_pos - 1, cr->tlen, + cr->len, seq, qual, + cr->aux_size + rg_len); + if (ret < 0) { + return ret; + } - aux = aux_orig = (char *)bam_aux(*bam); + aux = (char *)bam_aux(*bam); /* Auxiliary strings */ if (cr->aux_size != 0) { memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size); aux += cr->aux_size; + (*bam)->l_data += cr->aux_size; } /* RG:Z: */ - if (cr->rg != -1) { - int len = bfd->rg[cr->rg].name_len; + if (rg_len > 0) { *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z'; + int len = bfd->rg[cr->rg].name_len; memcpy(aux, bfd->rg[cr->rg].name, len); aux += len; *aux++ = 0; + (*bam)->l_data += rg_len; } - return bam_idx + (aux - aux_orig); + return (*bam)->l_data; } /* diff --git a/cram/cram_samtools.c b/cram/cram_samtools.c deleted file mode 100644 index 890a7fad4..000000000 --- a/cram/cram_samtools.c +++ /dev/null @@ -1,123 +0,0 @@ -/* -Copyright (c) 2010-2013, 2017-2019 Genome Research Ltd. -Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. 
Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include - -#include "cram.h" -#include "../htslib/sam.h" -#include "../sam_internal.h" - -/*--------------------------------------------------------------------------- - * Samtools compatibility portion - */ -int bam_construct_seq(bam_seq_t **bp, size_t extra_len, - const char *qname, size_t qname_len, - int flag, - int rname, // Ref ID - int64_t pos, - int64_t end, // aligned start/end coords - int mapq, - uint32_t ncigar, const uint32_t *cigar, - int mrnm, // Mate Ref ID - int64_t mpos, - int64_t isize, - int len, - const char *seq, - const char *qual) { - static const char L[256] = { - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15, - 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15, - 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 - }; - bam1_t *b = (bam1_t *)*bp; - uint8_t *cp; - int i, qname_nuls, bam_len; - - //b->l_aux = extra_len; // we fill this out later - - qname_nuls = 4 - qname_len%4; - bam_len = qname_len + qname_nuls + ncigar*4 + (len+1)/2 + len + extra_len; - if (realloc_bam_data(b, bam_len) < 0) - return -1; - b->l_data = bam_len; - - b->core.tid = rname; - b->core.pos = pos-1; - b->core.bin = bam_reg2bin(pos-1, end); - b->core.qual = mapq; - b->core.l_qname = 
qname_len+qname_nuls; - b->core.l_extranul = qname_nuls-1; - b->core.flag = flag; - b->core.n_cigar = ncigar; - b->core.l_qseq = len; - b->core.mtid = mrnm; - b->core.mpos = mpos-1; - b->core.isize = isize; - - cp = b->data; - - strncpy((char *)cp, qname, qname_len); - for (i = 0; i < qname_nuls; i++) - cp[qname_len+i] = '\0'; - cp += qname_len+qname_nuls; - if (ncigar > 0) memcpy(cp, cigar, ncigar*4); - cp += ncigar*4; - - for (i = 0; i+1 < len; i+=2) { - *cp++ = (L[(uc)seq[i]]<<4) + L[(uc)seq[i+1]]; - } - if (i < len) - *cp++ = L[(uc)seq[i]]<<4; - - if (qual) - memcpy(cp, qual, len); - else - memset(cp, '\xff', len); - - return bam_len; -} diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index 115a96550..34c1db40e 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -72,27 +72,4 @@ enum cigar_op { typedef bam1_t bam_seq_t; -#ifdef __cplusplus -extern "C" { -#endif - -int bam_construct_seq(bam_seq_t **bp, size_t extra_len, - const char *qname, size_t qname_len, - int flag, - int rname, // Ref ID - int64_t pos, - int64_t end, // aligned start/end coords - int mapq, - uint32_t ncigar, const uint32_t *cigar, - int mrnm, // Mate Ref ID - int64_t mpos, - int64_t isize, - int len, - const char *seq, - const char *qual); - -#ifdef __cplusplus -} -#endif - #endif /* CRAM_SAMTOOLS_H */ diff --git a/htslib.mk b/htslib.mk index b750869c8..ceb9bf3c4 100644 --- a/htslib.mk +++ b/htslib.mk @@ -127,7 +127,6 @@ HTSLIB_ALL = \ $(HTSDIR)/cram/cram_index.h \ $(HTSDIR)/cram/cram_io.c \ $(HTSDIR)/cram/cram_io.h \ - $(HTSDIR)/cram/cram_samtools.c \ $(HTSDIR)/cram/cram_samtools.h \ $(HTSDIR)/cram/cram_stats.c \ $(HTSDIR)/cram/cram_stats.h \ From 999d1819e0b4083e6fe3339188a41cca7be1c508 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 23 Nov 2020 12:22:21 +0000 Subject: [PATCH 029/114] Add a general SAM/BAM/CRAM input filter option (HTS_OPT_SAM_FILTER). 
This uses an expression language and SAM format fields to specify rules which must pass (so filter-in rather than filter-out). The code structure is that expr.[ch] implements a generic expression language which accepts a function callback for performing variable lookup, with sam.c:bam_sym_lookup() being that function. The sam_read1 function has been changed to keep looping until the filter passes, or we run out of data. This works for all file formats, but does so by adding a new member to htsFile. Technically it's an ABI change, but at least our own usage of it never uses arrays of them and it's stated in the header to be an opaque type which in theory should also ban that (albeit without any compiler checks). If this is a problem, we could add the same member to BGZF, cram_fd and hFILE as they're all private, but that feels like needless hoop jumping. Syntax supports Grouping: (, ), eg "(1+2)*3" Values: integers, floats, strings or variables Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) Math ops: *, /, % [TODO: add // for floor division?] Math ops: +, - Bit-wise: &, |, ^ [NB as 3 precedence levels, in that order] Conditionals: >, >=, <, <= Equality: ==, !=, =~, !~ Boolean: &&, || Data types are numerics (integers or doubles, eg 10, -0x2B, 10.2e-7) and strings "str" with backslash as an escape char within strings.
The keywords currently supported are: cigar string flag int flag.paired int flag.proper_pair int flag.unmap int flag.munmap int flag.reverse int flag.mreverse int flag.read1 int flag.read2 int flag.secondary int flag.qcfail int flag.dup int flag.supplementary int mapq int mpos / pnext int mrefid int mrname / rnext string ncigar int pos int qlen int qname string refid int rlen int rname string tlen int [XX] tag int / string For example, to return only records where the mate reference name differs from the reference name of the read itself and the read is aligned with high mapping quality: test_view -i 'sam_filter=mapq >= 30 && mrname != rname' in.bam Filters to find significant deletions: cigar =~ "[0-9]{2}D" rlen - qlen > 10 Duplicates that aren't part of a "proper pair" (equivalent to "samtools view -F 2 -f 0x400") sam_filter=flag.dup && !flag.proper_pair --- Makefile | 10 +- configure.ac | 3 + expr.c | 582 ++++++++++++++++++++++++++++++++++++ expr.h | 64 ++++ hts.c | 26 ++ htslib/hts.h | 12 + sam.c | 289 +++++++++++++++++- test/sam_filter/filter.sh | 34 +++ test/sam_filter/filter.tst | 46 +++ test/sam_filter/int1.out | 1 + test/sam_filter/int2.out | 1 + test/sam_filter/int3.out | 1 + test/sam_filter/string1.out | 6 + test/sam_filter/string2.out | 6 + test/sam_filter/string3.out | 7 + test/sam_filter/string4.out | 8 + test/test_expr.c | 233 +++++++++++++++ 17 files changed, 1313 insertions(+), 16 deletions(-) create mode 100644 expr.c create mode 100644 expr.h create mode 100755 test/sam_filter/filter.sh create mode 100644 test/sam_filter/filter.tst create mode 100644 test/sam_filter/int1.out create mode 100644 test/sam_filter/int2.out create mode 100644 test/sam_filter/int3.out create mode 100644 test/sam_filter/string1.out create mode 100644 test/sam_filter/string2.out create mode 100644 test/sam_filter/string3.out create mode 100644 test/sam_filter/string4.out create mode 100644 test/test_expr.c diff --git a/Makefile b/Makefile index e189ecb28..bd091ae36 100644 --- a/Makefile +++ 
b/Makefile @@ -77,6 +77,7 @@ BUILT_TEST_PROGRAMS = \ test/plugins-dlhts \ test/sam \ test/test_bgzf \ + test/test_expr \ test/test_kfunc \ test/test_kstring \ test/test_realn \ @@ -148,6 +149,7 @@ LIBHTS_OBJS = \ bcf_sr_sort.o \ bgzf.o \ errmod.o \ + expr.o \ faidx.o \ header.o \ hfile.o \ @@ -331,6 +333,7 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) +expr.o expr.pico: expr.c expr.h config.h $(htslib_kstring_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) @@ -340,7 +343,7 @@ hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h expr.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h 
$(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) @@ -402,6 +405,7 @@ maintainer-check: # MSYS2_ARG_CONV_EXCL="*" make check check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) test/hts_endian + test/test_expr test/test_kfunc test/test_kstring test/test_str2int @@ -411,6 +415,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR) test/test_bgzf test/bgziptest.txt test/test-parse-reg -t test/colons.bam + cd test/sam_filter && ./filter.sh filter.tst cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq @@ -441,6 +446,9 @@ test/sam: test/sam.o libhts.a test/test_bgzf: test/test_bgzf.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread +test/test_expr: test/test_expr.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread + test/test_kfunc: test/test_kfunc.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread diff --git a/configure.ac b/configure.ac index f473c97e5..e5e64496b 100644 --- a/configure.ac +++ b/configure.ac @@ -399,6 +399,9 @@ dnl Only need to add to static_LIBS if not building as a plugin fi fi +dnl Look for regcomp in various libraries (needed on windows/mingw). +AC_SEARCH_LIBS(regcomp, regex, [libregex=needed], []) + dnl Look for PTHREAD_MUTEX_RECURSIVE. dnl This is normally in pthread.h except on some broken glibc implementations. 
AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include ]]) diff --git a/expr.c b/expr.c new file mode 100644 index 000000000..73df14dd1 --- /dev/null +++ b/expr.c @@ -0,0 +1,582 @@ +/* expr.c -- filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +// TODO: +// - add maths functions. pow, sqrt, log, min, max, ? +// - ?: operator for conditionals? + +#include + +#include +#include +#include +#include +#include +#include +#include // may need configure rule for this + +#include "expr.h" +#include "textutils_internal.h" + +// Could also cache fexpr_t stack here for kstring reuse? 
+#define MAX_REGEX 10 +struct sam_filter_t { + char *str; + int parsed; + int curr_regex, max_regex; + regex_t preg[MAX_REGEX]; +}; + +/* + * This is designed to be mostly C like with mostly same the precedence rules, + * with the exception of bit operators (widely considered as a mistake in C). + * It's not full C (eg no bit-shifting), but good enough for our purposes. + * + * Supported syntax, in order of precedence: + * + * Grouping: (, ), eg "(1+2)*3" + * Values: integers, floats, strings or variables + * Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) + * Math ops: *, /, % [TODO: add // for floor division?] + * Math ops: +, - + * Bit-wise: &, |, ^ [NB as 3 precedence levels, in that order] + * Conditionals: >, >=, <, <=, + * Equality: ==, !=, =~ !~ + * Boolean: &&, || + */ + +// Skip to start of term +static char *ws(char *str) { + while (*str && (*str == ' ' || *str == '\t')) + str++; + return str; +} + +static int expression(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res); + +/* + * simple_expr + * : identifier + * | constant + * // | string ? + * | '(' expression ')' +*/ +static int simple_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + // Main recursion step + str = ws(str); + if (*str == '(') { + if (expression(filt, data, fn, str+1, end, res)) return -1; + str = ws(*end); + if (*str != ')') { + fprintf(stderr, "Missing ')'\n"); + return -1; + } + *end = str+1; + + return 0; + } + + // Otherwise a basic element. + int fail = 0; + double d = hts_str2dbl(str, end, &fail); + if (str != *end) { + res->is_str = 0; + res->d = d; + } else { + // Not valid floating point syntax. 
+ // TODO: add function call names in here; len(), sqrt(), pow(), etc + if (*str == '"') { + res->is_str = 1; + char *e = str+1; + int backslash = 0; + while (*e && *e != '"') { + if (*e == '\\') + backslash=1, e+=1+(e[1]!='\0'); + else + e++; + } + + kputsn(str+1, e-(str+1), ks_clear(&res->s)); + if (backslash) { + size_t i, j; + for (i = j = 0; i < res->s.l; i++) { + res->s.s[j++] = res->s.s[i]; + if (res->s.s[i] == '\\') { + switch (res->s.s[++i]) { + case '"': res->s.s[j-1] = '"'; break; + case '\\':res->s.s[j-1] = '\\'; break; + case 't': res->s.s[j-1] = '\t'; break; + case 'n': res->s.s[j-1] = '\n'; break; + case 'r': res->s.s[j-1] = '\r'; break; + default: res->s.s[j++] = res->s.s[i]; + } + } + } + res->s.s[j] = 0; + res->s.l = j; + } + if (*e != '"') + return -1; + *end = e+1; + } else if (fn) + // Look up variable. + return fn(data, str, end, res); + else + return -1; + } + + return 0; +} + +/* + * unary_expr + * : simple_expr + * | '+' simple_expr + * | '-' simple_expr + * | '!' unary_expr // higher precedence + * | '~' unary_expr // higher precedence + */ +static int unary_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + int err; + str = ws(str); + if (*str == '+') { + err = simple_expr(filt, data, fn, str+1, end, res); + err |= res->is_str; + res->is_true = res->d != 0; + } else if (*str == '-') { + err = simple_expr(filt, data, fn, str+1, end, res); + err |= res->is_str; + res->d = -res->d; + res->is_true = res->d != 0; + } else if (*str == '!') { + err = unary_expr(filt, data, fn, str+1, end, res); + if (res->is_str) { + res->is_str = 0; + res->d = 0; + res->is_true = !res->is_true; + } else { + res->d = !(int64_t)res->d; + res->is_true = res->d != 0; + } + } else if (*str == '~') { + err = unary_expr(filt, data, fn, str+1, end, res); + err |= res->is_str; + res->d = ~(int64_t)res->d; + res->is_true = res->d != 0; + } else { + err = simple_expr(filt, data, fn, str, end, res); + } + return err ? 
-1 : 0; +} + + +/* + * mul_expr + * : unary_expr ( + * unary_expr '*' unary_expr + * | unary_expr '/' unary_expr + * | unary_expr '%' unary_expr + * )* + */ +static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (unary_expr(filt, data, fn, str, end, res)) + return -1; + + str = *end; + fexpr_t val = FEXPR_INIT; + while (*str) { + str = ws(str); + if (*str == '*' || *str == '/' || *str == '%') { + if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; + if (val.is_str || res->is_str) { + fexpr_free(&val); + return -1; // arith on strings + } + } + + if (*str == '*') + res->d *= val.d; + else if (*str == '/') + res->d /= val.d; + else if (*str == '%') + res->d = (int64_t)res->d % (int64_t)val.d; + else + break; + + str = *end; + } + fexpr_free(&val); + + return 0; +} + +/* + * add_expr + * : mul_expr ( + * mul_expr '+' mul_expr + * | mul_expr '-' mul_expr + * )* + */ +static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (mul_expr(filt, data, fn, str, end, res)) + return -1; + + str = *end; + fexpr_t val = FEXPR_INIT; + while (*str) { + str = ws(str); + if (*str == '+' || *str == '-') { + if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; + if (val.is_str || res->is_str) { + fexpr_free(&val); + return -1; // arith on strings + } + } + + if (*str == '+') + res->d += val.d; + else if (*str == '-') + res->d -= val.d; + else + break; + + str = *end; + } + fexpr_free(&val); + + return 0; +} + +/* + * bitand_expr + * : add_expr + * | bitand_expr '&' add_expr + */ +static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (add_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (*str == '&' && str[1] != '&') { + if (add_expr(filt, data, fn, str+1, end, &val)) return -1; + if (res->is_str || val.is_str) { + fexpr_free(&val); + 
return -1; + } + res->is_true = res->d = (int64_t)res->d & (int64_t)val.d; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +/* + * bitxor_expr + * : bitand_expr + * | bitxor_expr '^' bitand_expr + */ +static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (bitand_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (*str == '^') { + if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; + if (res->is_str || val.is_str) { + fexpr_free(&val); + return -1; + } + res->is_true = res->d = (int64_t)res->d ^ (int64_t)val.d; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +/* + * bitor_expr + * : xor_expr + * | bitor_expr '|' xor_expr + */ +static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (bitxor_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (*str == '|' && str[1] != '|') { + if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; + if (res->is_str || val.is_str) { + fexpr_free(&val); + return -1; + } + res->is_true = res->d = (int64_t)res->d | (int64_t)val.d; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +/* + * cmp_expr + * : bitor_expr + * | cmp_expr '<=' bitor_expr + * | cmp_expr '<' bitor_expr + * | cmp_expr '>=' bitor_expr + * | cmp_expr '>' bitor_expr + */ +static int cmp_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (bitor_expr(filt, data, fn, str, end, res)) return -1; + + str = ws(*end); + fexpr_t val = FEXPR_INIT; + int err = 0; + + if (*str == '>' && str[1] == '=') { + err = cmp_expr(filt, data, fn, str+2, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? 
strcmp(res->s.s, val.s.s) >= 0 + : !res->is_str && !val.is_str && res->d >= val.d; + res->is_str = 0; + } else if (*str == '>') { + err = cmp_expr(filt, data, fn, str+1, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) > 0 + : !res->is_str && !val.is_str && res->d > val.d; + res->is_str = 0; + } else if (*str == '<' && str[1] == '=') { + err = cmp_expr(filt, data, fn, str+2, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) <= 0 + : !res->is_str && !val.is_str && res->d <= val.d; + res->is_str = 0; + } else if (*str == '<') { + err = cmp_expr(filt, data, fn, str+1, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) < 0 + : !res->is_str && !val.is_str && res->d < val.d; + res->is_str = 0; + } + fexpr_free(&val); + + return err ? -1 : 0; +} + +/* + * eq_expr + * : cmp_expr + * | eq_expr '==' cmp_expr + * | eq_expr '!=' cmp_expr + * | eq_expr '=~' cmp_expr + * | eq_expr '!~' cmp_expr + */ +static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (cmp_expr(filt, data, fn, str, end, res)) return -1; + + str = ws(*end); + + int err = 0; + fexpr_t val = FEXPR_INIT; + + // numeric vs numeric comparison is as expected + // string vs string comparison is as expected + // numeric vs string is false + if (str[0] == '=' && str[1] == '=') { + if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { + res->is_true = res->d = 0; + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s)==0 : 0) + : !res->is_str && !val.is_str && res->d == val.d; + } + res->is_str = 0; + + } else if (str[0] == '!' && str[1] == '=') { + if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { + res->is_true = res->d = 0; + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ? 
strcmp(res->s.s, val.s.s) != 0 : 1) + : res->is_str != val.is_str || res->d != val.d; + } + res->is_str = 0; + + } else if ((str[0] == '=' && str[1] == '~') || + (str[0] == '!' && str[1] == '~')) { + err = eq_expr(filt, data, fn, str+2, end, &val); + if (!val.is_str || !res->is_str) { + fexpr_free(&val); + return -1; + } + if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) { + regex_t preg_, *preg; + if (filt->curr_regex >= filt->max_regex) { + // Compile regex if not seen before + if (filt->curr_regex >= MAX_REGEX) { + preg = &preg_; + } else { + preg = &filt->preg[filt->curr_regex]; + filt->max_regex++; + } + + int ec = regcomp(preg, val.s.s, REG_EXTENDED | REG_NOSUB); + if (ec != 0) { + char errbuf[1024]; + regerror(ec, preg, errbuf, 1024); + fprintf(stderr, "Failed regex: %.1024s\n", errbuf); + fexpr_free(&val); + return -1; + } + } else { + preg = &filt->preg[filt->curr_regex]; + } + res->is_true = res->d = regexec(preg, res->s.s, 0, NULL, 0) == 0 + ? *str == '=' // matcn + : *str == '!'; // no-match + if (preg == &preg_) + regfree(preg); + + filt->curr_regex++; + } else { + // nul regexp or input is considered false + res->is_true = 0; + } + res->is_str = 0; + } + fexpr_free(&val); + + return err ? 
-1 : 0; +} + +/* + * and_expr + * : eq_expr + * | and_expr 'and' eq_expr + * | and_expr 'or' eq_expr + */ +static int and_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (eq_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (str[0] == '&' && str[1] == '&') { + if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; + res->is_true = res->d = + (res->is_true || (res->is_str && res->s.s) || res->d) && + (val.is_true || (val.is_str && val.s.s) || val.d); + res->is_str = 0; + } else if (str[0] == '|' && str[1] == '|') { + if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; + res->is_true = res->d = + res->is_true || (res->is_str && res->s.s) || res->d || + val.is_true || (val.is_str && val.s.s ) || val.d; + res->is_str = 0; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +static int expression(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + return and_expr(filt, data, fn, str, end, res); +} + +sam_filter_t *sam_filter_init(const char *str) { + sam_filter_t *f = calloc(1, sizeof(*f)); + if (!f) return NULL; + + // Oversize to permit faster comparisons with memcmp over strcmp + size_t len = strlen(str)+100; + if (!(f->str = malloc(len))) { + free(f); + return NULL; + } + strcpy(f->str, str); + return f; +} + +void sam_filter_free(sam_filter_t *filt) { + if (!filt) + return; + + int i; + for (i = 0; i < filt->max_regex; i++) + regfree(&filt->preg[i]); + + free(filt->str); + free(filt); +} + +int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *fn, + fexpr_t *res) { + char *end = NULL; + + memset(res, 0, sizeof(*res)); + + filt->curr_regex = 0; + if (expression(filt, data, fn, filt->str, &end, res)) + return -1; + + if (end && *ws(end)) { + fprintf(stderr, "Unable to parse expression at %s\n", filt->str); + return -1; + } + + // Strings evaluate to true. 
An empty string is also true, but an + // absent (null) string is false. An empty string has kstring length + // of zero, but a pointer as it's nul-terminated. + if (res->is_str) + res->is_true = res->d = res->s.s != NULL; + else + res->is_true |= res->d != 0; + + return 0; +} diff --git a/expr.h b/expr.h new file mode 100644 index 000000000..c4674fc41 --- /dev/null +++ b/expr.h @@ -0,0 +1,64 @@ +/* expr.c -- filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTS_EXPR_H +#define HTS_EXPR_H + +#include +#include + +// fexpr_t is our return type and the type for elements within the expr. +// Note we cope with zero-but-true in order to implement a basic +// "exists(something)" check where "something" may even be zero. +// +// Eg in the aux tag searching syntax, "[NM]" should return true if +// NM tag exists even if zero. +// Take care when negating this. 
"[NM] != 0" will be true when +// [NM] is absent, thus consider "[NM] && [NM] != 0". +typedef struct { + char is_str; // Use .s vs .d + char is_true; // Force true if even zero + kstring_t s; // is_str and empty s permitted (eval as false) + double d; // otherwise this +} fexpr_t; + +#define FEXPR_INIT {0, 0, KS_INITIALIZE, 0} + +// Create a SAM filter for expression "str". +// +// Returns a pointer on success, +// NULL on failure +sam_filter_t *sam_filter_init(const char *str); + +// Frees a sam_filter_t created via sam_filter_init +void sam_filter_free(sam_filter_t *filt); + +typedef int (sym_func)(void *data, char *str, char **end, fexpr_t *res); +int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *f, fexpr_t *res); + +static inline void fexpr_free(fexpr_t *f) { + ks_free(&f->s); +} + +#endif /* HTS_EXPR_H */ diff --git a/hts.c b/hts.c index 1ab89400e..519b5b809 100644 --- a/hts.c +++ b/hts.c @@ -50,6 +50,7 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_internal.h" #include "hfile_internal.h" #include "sam_internal.h" +#include "expr.h" #include "htslib/hts_os.h" // drand48 #include "htslib/khash.h" @@ -824,6 +825,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "LEVEL") == 0) o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0); + else if (strcmp(o->arg, "sam_filter") == 0 || + strcmp(o->arg, "SAM_FILTER") == 0) + o->opt = HTS_OPT_SAM_FILTER, o->val.s = val; + else { hts_log_error("Unknown option '%s'", o->arg); free(o->arg); @@ -863,6 +868,7 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { // fall through case CRAM_OPT_VERSION: case CRAM_OPT_PREFIX: + case HTS_OPT_SAM_FILTER: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) return -1; break; @@ -1231,6 +1237,7 @@ int hts_close(htsFile *fp) save = errno; sam_hdr_destroy(fp->bam_header); hts_idx_destroy(fp->idx); + sam_filter_free(fp->filter); free(fp->fn); free(fp->fn_aux); free(fp->line.s); @@ -1335,6 +1342,13 @@ int hts_set_opt(htsFile *fp, enum 
hts_fmt_option opt, ...) { fp->fp.bgzf->compress_level = level; } + case HTS_OPT_SAM_FILTER: { + va_start(args, opt); + char *expr = va_arg(args, char *); + va_end(args); + return hts_set_filter_expression(fp, expr); + } + default: break; } @@ -1396,6 +1410,18 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux) return 0; } +int hts_set_filter_expression(htsFile *fp, const char *expr) +{ + if (fp->filter) + sam_filter_free(fp->filter); + + if (!expr) + return 0; + + return (fp->filter = sam_filter_init(expr)) + ? 0 : -1; +} + hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname) { int pid = (int) getpid(); diff --git a/htslib/hts.h b/htslib/hts.h index 7a85ca38c..9ccbb986b 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -224,6 +224,7 @@ typedef struct htsFormat { struct hts_idx_t; typedef struct hts_idx_t hts_idx_t; +typedef struct sam_filter_t sam_filter_t; /** * @brief File handle returned by hts_open() etc. @@ -256,6 +257,7 @@ typedef struct htsFile { hts_idx_t *idx; const char *fnidx; struct sam_hdr_t *bam_header; + sam_filter_t *filter; } htsFile; // A combined thread pool and queue allocation size. @@ -321,6 +323,7 @@ enum hts_fmt_option { HTS_OPT_THREAD_POOL, HTS_OPT_CACHE_SIZE, HTS_OPT_BLOCK_SIZE, + HTS_OPT_SAM_FILTER, }; // For backwards compatibility @@ -607,6 +610,15 @@ HTSLIB_EXPORT int hts_set_fai_filename(htsFile *fp, const char *fn_aux); +/*! + @abstract Sets a filter expression + @return 0 for success, negative on failure + @discussion + To clear an existing filter, specifying expr as NULL. +*/ +HTSLIB_EXPORT +int hts_set_filter_expression(htsFile *fp, const char *expr); + /*! @abstract Determine whether a given htsFile contains a valid EOF block @return 3 for a non-EOF checkable filetype; diff --git a/sam.c b/sam.c index eb2712965..93a9a994a 100644 --- a/sam.c +++ b/sam.c @@ -50,6 +50,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hfile.h" #include "htslib/hts_endian.h" #include "header.h" +#include "expr.h" #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) @@ -3154,40 +3155,283 @@ int sam_set_threads(htsFile *fp, int nthreads) { return 0; } +// Bam record pointer and SAM header combined +typedef struct { + const sam_hdr_t *h; + const bam1_t *b; +} hb_pair; + +// Looks up variable names in str and replaces them with their value. +// Also supports aux tags. +// +// Note the expression parser deliberately overallocates str size so it +// is safe to use memcmp over strcmp. +static int bam_sym_lookup(void *data, char *str, char **end, fexpr_t *res) { + hb_pair *hb = (hb_pair *)data; + const bam1_t *b = hb->b; + + res->is_str = 0; + switch(*str) { + case 'c': + if (memcmp(str, "cigar", 5) == 0) { + *end = str+5; + res->is_str = 1; + ks_clear(&res->s); + uint32_t *cigar = bam_get_cigar(b); + int i, n = b->core.n_cigar, r = 0; + for (i = 0; i < n; i++) { + r |= kputw (bam_cigar_oplen(cigar[i]), &res->s); + r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s); + } + kputs("", &res->s); + return r ? 
0 : -1; + } + break; + + case 'f': + if (memcmp(str, "flag", 4) == 0) { + str = *end = str+4; + if (*str != '.') { + res->d = b->core.flag; + return 0; + } else { + str++; + if (!memcmp(str, "paired", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FPAIRED; + return 0; + } else if (!memcmp(str, "proper_pair", 11)) { + *end = str+11; + res->d = b->core.flag & BAM_FPROPER_PAIR; + return 0; + } else if (!memcmp(str, "unmap", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FUNMAP; + return 0; + } else if (!memcmp(str, "munmap", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FMUNMAP; + return 0; + } else if (!memcmp(str, "reverse", 7)) { + *end = str+7; + res->d = b->core.flag & BAM_FREVERSE; + return 0; + } else if (!memcmp(str, "mreverse", 8)) { + *end = str+8; + res->d = b->core.flag & BAM_FMREVERSE; + return 0; + } else if (!memcmp(str, "read1", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD1; + return 0; + } else if (!memcmp(str, "read2", 6)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD2; + return 0; + } else if (!memcmp(str, "secondary", 9)) { + *end = str+9; + res->d = b->core.flag & BAM_FSECONDARY; + return 0; + } else if (!memcmp(str, "qcfail", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FQCFAIL; + return 0; + } else if (!memcmp(str, "dup", 3)) { + *end = str+3; + res->d = b->core.flag & BAM_FDUP; + return 0; + } else if (!memcmp(str, "supplementary", 13)) { + *end = str+13; + res->d = b->core.flag & BAM_FSUPPLEMENTARY; + return 0; + } else { + hts_log_error("Unrecognised flag string"); + return -1; + } + } + } + break; + + case 'm': + if (memcmp(str, "mapq", 4) == 0) { + *end = str+4; + res->d = b->core.qual; + return 0; + } else if (memcmp(str, "mpos", 4) == 0) { + *end = str+4; + res->d = b->core.mpos+1; + return 0; + } else if (memcmp(str, "mrname", 6) == 0) { + *end = str+6; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? 
rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "mrefid", 6) == 0) { + *end = str+6; + res->d = b->core.mtid; + return 0; + } + break; + + case 'n': + if (memcmp(str, "ncigar", 6) == 0) { + *end = str+6; + res->d = b->core.n_cigar; + return 0; + } + break; + + case 'p': + if (memcmp(str, "pos", 3) == 0) { + *end = str+3; + res->d = b->core.pos+1; + return 0; + } else if (memcmp(str, "pnext", 5) == 0) { + *end = str+5; + res->d = b->core.mpos+1; + return 0; + } + break; + + case 'q': + if (memcmp(str, "qlen", 4) == 0) { + *end = str+4; + res->d = b->core.l_qseq; + return 0; + } else if (memcmp(str, "qname", 5) == 0) { + *end = str+5; + res->is_str = 1; + kputs(bam_get_qname(b), ks_clear(&res->s)); + return 0; + } + break; + + case 'r': + if (memcmp(str, "rlen", 4) == 0) { + *end = str+4; + res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + return 0; + } else if (memcmp(str, "rname", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "rnext", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "refid", 5) == 0) { + *end = str+5; + res->d = b->core.tid; + return 0; + } + break; + + case 't': + if (memcmp(str, "tlen", 4) == 0) { + *end = str+4; + res->d = b->core.isize; + return 0; + } + break; + + case '[': + if (*str == '[' && str[1] && str[2] && str[3] == ']') { + /* aux tags */ + *end = str+4; + + uint8_t *aux = bam_aux_get(b, str+1); + if (aux) { + // we define the truth of a tag to be its presence, even if 0. 
+ res->is_true = 1; + switch (*aux) { + case 'Z': + case 'H': + res->is_str = 1; + kputs((char *)aux+1, ks_clear(&res->s)); + break; + + case 'A': + res->is_str = 1; + kputsn((char *)aux+1, 1, ks_clear(&res->s)); + break; + + case 'i': case 'I': + case 's': case 'S': + case 'c': case 'C': + res->is_str = 0; + res->d = bam_aux2i(aux); + break; + + case 'f': + case 'd': + res->is_str = 0; + res->d = bam_aux2f(aux); + break; + + default: + hts_log_error("Aux type '%c not yet supported by filters", + *aux); + return -1; + } + return 0; + + } else { + // hence absent tags are always false (and strings) + res->is_str = 1; + res->s.l = 0; + res->d = 0; + res->is_true = 0; + return 0; + } + } + break; + } + + // All successful matches in switch should return 0. + // So if we didn't match, it's a parse error. + return -1; +} + // Returns 0 on success, // -1 on EOF, // <-1 on error int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) { + int ret; + + filtered: switch (fp->format.format) { - case bam: { - int r = bam_read1(fp->fp.bgzf, b); - if (h && r >= 0) { + case bam: + ret = bam_read1(fp->fp.bgzf, b); + if (h && ret >= 0) { if (b->core.tid >= h->n_targets || b->core.tid < -1 || b->core.mtid >= h->n_targets || b->core.mtid < -1) { errno = ERANGE; return -3; } } - return r; - } + break; - case cram: { - int ret = cram_get_bam_seq(fp->fp.cram, &b); + case cram: + ret = cram_get_bam_seq(fp->fp.cram, &b); if (ret < 0) return cram_eof(fp->fp.cram) ? 
-1 : -2; if (bam_tag2cigar(b, 1, 1) < 0) return -2; - return ret; - } + break; case sam: { // Consume 1st line after header parsing as it wasn't using peek if (fp->line.l != 0) { - int ret = sam_parse1(&fp->line, h, b); + ret = sam_parse1(&fp->line, h, b); fp->line.l = 0; - return ret; + break; } if (fp->state) { @@ -3255,12 +3499,10 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) fd->curr_idx = 0; } - return 0; + ret = 0; } else { - int ret; err_recover: - ret = hts_getline(fp, KS_SEP_LINE, &fp->line); if (ret < 0) return ret; @@ -3270,8 +3512,8 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) hts_log_warning("Parse error at line %lld", (long long)fp->lineno); if (h->ignore_sam_err) goto err_recover; } - return ret; } + break; } case empty_format: @@ -3282,6 +3524,23 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) errno = EFTYPE; return -3; } + + if (ret >= 0 && fp->filter) { + // Process on-the-fly filter rules + hb_pair hb = {h, b}; + fexpr_t res; + if (sam_filter_eval(fp->filter, &hb, bam_sym_lookup, &res)) { + hts_log_error("Couldn't process filter expression"); + fexpr_free(&res); + return -1; + } + int t = res.is_true; + fexpr_free(&res); + if (!t) + goto filtered; + } + + return ret; } diff --git a/test/sam_filter/filter.sh b/test/sam_filter/filter.sh new file mode 100755 index 000000000..575cb13ae --- /dev/null +++ b/test/sam_filter/filter.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# +# Copyright (C) 2020 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing sam_filter..." + +tv="../test_view" + +test_driver $@ + +exit $? diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst new file mode 100644 index 000000000..f5558a41b --- /dev/null +++ b/test/sam_filter/filter.tst @@ -0,0 +1,46 @@ +# Copyright (C) 2020 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. 
$tv is replaced with the path to test_view + +# String matches +P string1.out $tv -i 'sam_filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam +P string2.out $tv -i 'sam_filter=rname=="CHROMOSOME_II"' ../ce#5b.sam +P string3.out $tv -i 'sam_filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam +P string4.out $tv -i 'sam_filter=cigar=~"D"' ../ce#1000.sam + +# Integer ops +P int1.out $tv -i 'sam_filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' +P int2.out $tv -i 'sam_filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' + +# Aux tags +P int3.out $tv -i 'sam_filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' diff --git a/test/sam_filter/int1.out b/test/sam_filter/int1.out new file mode 100644 index 000000000..6529ff889 --- /dev/null +++ b/test/sam_filter/int1.out @@ -0,0 +1 @@ +98 diff --git a/test/sam_filter/int2.out b/test/sam_filter/int2.out new file mode 100644 index 000000000..e522732c7 --- /dev/null +++ b/test/sam_filter/int2.out @@ -0,0 +1 @@ +38 diff --git a/test/sam_filter/int3.out b/test/sam_filter/int3.out new file mode 100644 index 000000000..ea70ce013 --- /dev/null +++ b/test/sam_filter/int3.out @@ -0,0 +1 @@ +72 diff --git a/test/sam_filter/string1.out b/test/sam_filter/string1.out new file mode 100644 index 000000000..7ba8527b2 --- /dev/null +++ b/test/sam_filter/string1.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.14978392 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 diff --git a/test/sam_filter/string2.out b/test/sam_filter/string2.out new file mode 100644 index 000000000..be94b071e --- /dev/null +++ 
b/test/sam_filter/string2.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +II.14978392 16 CHROMOSOME_II 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU diff --git a/test/sam_filter/string3.out b/test/sam_filter/string3.out new file mode 100644 index 000000000..2424115de --- /dev/null +++ b/test/sam_filter/string3.out @@ -0,0 +1,7 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +II.14978392 16 CHROMOSOME_II 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU +III 16 CHROMOSOME_III 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU diff --git a/test/sam_filter/string4.out b/test/sam_filter/string4.out new file mode 100644 index 000000000..386300c53 --- /dev/null +++ b/test/sam_filter/string4.out @@ -0,0 +1,8 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.14978392 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 
#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.32874267 0 CHROMOSOME_I 75 1 13M1D87M * 0 0 CTAAGCCTAAGCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAG DCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCA>/=;=9>:/5AA############# AS:i:-8 XS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 YT:Z:UU MD:Z:13^T87 NM:i:1 +SRR065390.723611 0 CHROMOSOME_I 155 1 5M1D95M * 0 0 AAGCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCDCCBCABCDADBACDDDBABBDB?AB@@<>;B>B?DB?=@@?@: AS:i:-8 XS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 YT:Z:UU MD:Z:5^T95 NM:i:1 diff --git a/test/test_expr.c b/test/test_expr.c new file mode 100644 index 000000000..7ad83fcd2 --- /dev/null +++ b/test/test_expr.c @@ -0,0 +1,233 @@ +/* test-expr.c -- Testing: filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <string.h>
+#include "../expr.h"
+
+// NULL-safe view of a C string, for use with puts() and "%s".
+static const char *str_or_empty(const char *s) {
+    return s ? s : "";
+}
+
+// Symbol lookup callback handed to sam_filter_eval().
+//
+// Recognises a fixed set of test symbols at the start of "str":
+//   foo, a, b, c  - numeric values 15551, 1, 2 and 3
+//   magic         - the non-empty string "plugh"
+//   empty         - the empty string
+//   null          - a string value with nothing written to it
+//
+// On a match, *end is set just past the symbol and the value is stored
+// in *res.  "data" is unused by this test callback.
+//
+// Returns 0 on success,
+//        -1 if the symbol is not recognised
+int lookup(void *data, char *str, char **end, fexpr_t *res) {
+    int foo = 15551; // my favourite palindromic prime
+    int a = 1;
+    int b = 2;
+    int c = 3;
+    res->is_str = 0;
+    if (strncmp(str, "foo", 3) == 0) {
+        *end = str+3;
+        res->d = foo;
+    } else if (*str == 'a') {
+        *end = str+1;
+        res->d = a;
+    } else if (*str == 'b') {
+        *end = str+1;
+        res->d = b;
+    } else if (*str == 'c') {
+        *end = str+1;
+        res->d = c;
+    } else if (strncmp(str, "magic", 5) == 0) {
+        // non-empty string
+        *end = str+5;
+        res->is_str = 1;
+        kputs("plugh", ks_clear(&res->s));
+    } else if (strncmp(str, "empty", 5) == 0) {
+        // empty string
+        *end = str+5;
+        res->is_str = 1;
+        kputs("", ks_clear(&res->s));
+    } else if (strncmp(str, "null", 4) == 0) {
+        // null string (eg aux:Z tag is absent)
+        *end = str+4;
+        res->is_str = 1;
+        ks_clear(&res->s);
+
+    } else {
+        return -1;
+    }
+
+    return 0;
+}
+
+// One test case: expression "str" must evaluate to number "dval", and
+// additionally to string "sval" when the result is a string.
+typedef struct {
+    double dval;
+    char *sval;
+    char *str;
+} test_ev;
+
+// Evaluates every expression in the table below against lookup().
+//
+// Returns 0 if all expressions give the expected value,
+//         1 on the first failure (a diagnostic is written to stderr)
+int test(void) {
+    // These are all valid expressions that should work
+    test_ev tests[] = {
+        {  1, NULL, "1"},
+        {  1, NULL, "+1"},
+        { -1, NULL, "-1"},
+        {  0, NULL, "!7"},
+        {  1, NULL, "!0"},
+        {  1, NULL, "!(!7)"},
+        {  1, NULL, "!!7"},
+
+        {  5, NULL, "2+3"},
+        { -1, NULL, "2+-3"},
+        {  6, NULL, "1+2+3"},
+        {  1, NULL, "-2+3"},
+
+        {  6, NULL, "2*3"},
+        {  6, NULL, "1*2*3"},
+        {  0, NULL, "2*0"},
+
+        {  7, NULL, "(7)"},
+        {  7, NULL, "((7))"},
+        { 21, NULL, "(1+2)*(3+4)"},
+        { 14, NULL, "(4*5)-(-2*-3)"},
+
+        {  1, NULL, "(1+2)*3==9"},
+        {  1, NULL, "(1+2)*3!=8"},
+        {  0, NULL, "(1+2)*3!=9"},
+        {  0, NULL, "(1+2)*3==8"},
+
+        {  0, NULL, "1>2"},
+        {  1, NULL, "1<2"},
+        {  0, NULL, "3<3"},
+        {  0, NULL, "3>3"},
+        {  1, NULL, "9<=9"},
+        {  1, NULL, "9>=9"},
+        {  1, NULL, "2*4==8"},
+        {  1, NULL, "16==0x10"},
+        {  1, NULL, "15<0x10"},
+        {  1, NULL, "17>0x10"},
+        {  0, NULL, "2*4!=8"},
+        {  1, NULL, "4+2<3+4"},
+        {  0, NULL, "4*2<3+4"},
+        {  8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4
+
+        {  1, NULL, "(1<2) == (3>2)"},
+        {  1, NULL, "1<2 == 3>2"},
+
+        {  1, NULL, "2 && 1"},
+        {  0, NULL, "2 && 0"},
+        {  0, NULL, "0 && 2"},
+        {  1, NULL, "2 || 1"},
+        {  1, NULL, "2 || 0"},
+        {  1, NULL, "0 || 2"},
+        {  1, NULL, "1 || 2 && 3"},
+        {  1, NULL, "2 && 3 || 1"},
+        {  1, NULL, "0 && 3 || 2"},
+        {  0, NULL, "0 && 3 || 0"},
+
+        {  1, NULL, "3 & 1"},
+        {  2, NULL, "3 & 2"},
+        {  3, NULL, "1 | 2"},
+        {  3, NULL, "1 | 3"},
+        {  7, NULL, "1 | 6"},
+        {  2, NULL, "1 ^ 3"},
+
+        {  1, NULL, "(1^0)&(4^3)"},
+        {  2, NULL, "1 ^(0&4)^ 3"},
+        {  2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^
+
+        {  6, NULL, "(1|0)^(4|3)"},
+        {  7, NULL, "1 |(0^4)| 3"},
+        {  7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before |
+
+        {  1, NULL, "4 & 2 || 1"},
+        {  1, NULL, "(4 & 2) || 1"},
+        {  0, NULL, "4 & (2 || 1)"},
+        {  1, NULL, "1 || 4 & 2"},
+        {  1, NULL, "1 || (4 & 2)"},
+        {  0, NULL, "(1 || 4) & 2"},
+
+        {  1, NULL, " (2*3)&7 > 4"},
+        {  0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv
+        {  1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv
+        {  1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"},
+
+        {  1, "plugh", "magic"},
+        {  1, "", "empty"},
+        {  1, NULL, "magic == \"plugh\""},
+        {  1, NULL, "magic != \"xyzzy\""},
+
+        {  1, NULL, "\"abc\" < \"def\""},
+        {  1, NULL, "\"abc\" <= \"abc\""},
+        {  0, NULL, "\"abc\" < \"ab\""},
+        {  0, NULL, "\"abc\" <= \"ab\""},
+
+        {  0, NULL, "\"abc\" > \"def\""},
+        {  1, NULL, "\"abc\" >= \"abc\""},
+        {  1, NULL, "\"abc\" > \"ab\""},
+        {  1, NULL, "\"abc\" >= \"ab\""},
+
+        {  1, NULL, "\"abbc\" =~ \"^a+b+c+$\""},
+        {  0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""},
+        {  1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""},
+        {  1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"},
+    };
+
+    size_t i;
+    fexpr_t r;
+    for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
+        int failed = 0;
+        sam_filter_t *filt = sam_filter_init(tests[i].str);
+        if (!filt)
+            return 1;
+
+        if (sam_filter_eval(filt, NULL, lookup, &r)) {
+            fprintf(stderr, "Failed to parse filter string %s\n",
+                    tests[i].str);
+            failed = 1;
+        } else if (r.is_str) {
+            // A string result for a numeric-only test (sval == NULL) is a
+            // failure in its own right; never hand NULL to strcmp or "%s".
+            if (!tests[i].sval
+                || strcmp(str_or_empty(r.s.s), tests[i].sval) != 0
+                || r.d != tests[i].dval) {
+                fprintf(stderr, "Failed test: %s == %s, got %s, %f\n",
+                        tests[i].str, str_or_empty(tests[i].sval),
+                        str_or_empty(r.s.s), r.d);
+                failed = 1;
+            }
+        } else if (r.d != tests[i].dval) {
+            fprintf(stderr, "Failed test: %s == %f, got %f\n",
+                    tests[i].str, tests[i].dval, r.d);
+            failed = 1;
+        }
+
+        // Release per-test resources on the failure paths too.
+        fexpr_free(&r);
+        sam_filter_free(filt);
+        if (failed)
+            return 1;
+    }
+
+    return 0;
+}
+
+// With an argument: evaluate it as an expression and print the result.
+// Without: run the built-in test table.
+//
+// Returns 0 on success, 1 on failure.
+int main(int argc, char **argv) {
+    if (argc > 1) {
+        fexpr_t v;
+        sam_filter_t *filt = sam_filter_init(argv[1]);
+        if (!filt)
+            return 1;
+
+        if (sam_filter_eval(filt, NULL, lookup, &v)) {
+            fexpr_free(&v);
+            sam_filter_free(filt);
+            return 1;
+        }
+
+        if (v.is_str)
+            puts(str_or_empty(v.s.s)); // "null" may leave the buffer unset
+        else
+            printf("%g\n", v.d);
+
+        fexpr_free(&v);
+        sam_filter_free(filt);
+        return 0;
+    }
+
+    return test();
+}
From 2c75c104d824b0469463c9a62f8daf1594f22ebc Mon Sep 17 00:00:00 2001
From: James Bonfield
Date: Tue, 24 Nov 2020 17:27:39 +0000
Subject: [PATCH 030/114] Filter tidy ups from review.

- Rename all the sam_* types and functions in expr.c to hts_*. This file is intended to be file format agnostic. A few key functions here are now externally visible too.
- Renamed fexpr_t to hts_expr_val_t. Similarly the other bits named after fexpr. It was a poor choice as it's not a filter expression, but a *value*.
- Renamed expr.[ch] to hts_expr.[ch]. The header is now public too, but included by hts.h so it does not need to be explicitly included.
- Fixed a bug where the multi-region CRAM iterator wasn't using the filter (it calls cram_readrec). All other iterators, both single and multi, worked fine.
- Added a simpler sam_passes_filter function that combines hts_filter_eval with the internal bam_sym_lookup. This is an externally visible function.
- Rename sym_func to hts_expr_sym_func.
- Add more documentation.
- Renamed HTS_OPT_SAM_FILTER back to HTS_OPT_FILTER. It doesn't yet check if you attempt to specify this on e.g. a VCF file, but neither does it break as it's simply ignored. - Restructured sam_read1 into a series of internal format-specific functions. This makes the body of sam_read1 itself far simpler. This now means we can do sensible looping without concern over indenting an enormous chunk of code, and the loop is now small enough to see the logic in a single page. --- Makefile | 7 +- expr.h | 64 --- hts.c | 18 +- expr.c => hts_expr.c | 113 ++--- htslib/hts.h | 6 +- htslib/hts_expr.h | 97 +++++ htslib/sam.h | 9 + htslib_vars.mk | 3 +- sam.c | 865 +++++++++++++++++++------------------ test/sam_filter/filter.tst | 14 +- test/test_expr.c | 24 +- 11 files changed, 651 insertions(+), 569 deletions(-) delete mode 100644 expr.h rename expr.c => hts_expr.c (83%) create mode 100644 htslib/hts_expr.h diff --git a/Makefile b/Makefile index bd091ae36..659936e2b 100644 --- a/Makefile +++ b/Makefile @@ -149,12 +149,12 @@ LIBHTS_OBJS = \ bcf_sr_sort.o \ bgzf.o \ errmod.o \ - expr.o \ faidx.o \ header.o \ hfile.o \ hfile_net.o \ hts.o \ + hts_expr.o \ hts_os.o\ md5.o \ multipart.o \ @@ -333,7 +333,6 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) -expr.o expr.pico: expr.c expr.h config.h $(htslib_kstring_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) @@ -343,7 +342,8 @@ hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) 
$(htslib_knetfile_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h expr.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts_expr.o hts_expr.pico: hts_expr.c $(htslib_hts_expr_h) config.h $(htslib_kstring_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) @@ -493,6 +493,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) test/plugins-dlhts.o: test/plugins-dlhts.c config.h test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_hts_log_h) test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) +test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) test/test_kstring.o: 
test/test_kstring.c config.h $(htslib_kstring_h) test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h) diff --git a/expr.h b/expr.h deleted file mode 100644 index c4674fc41..000000000 --- a/expr.h +++ /dev/null @@ -1,64 +0,0 @@ -/* expr.c -- filter expression parsing and processing. - - Copyright (C) 2020 Genome Research Ltd. - - Author: James Bonfield - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notices and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#ifndef HTS_EXPR_H -#define HTS_EXPR_H - -#include -#include - -// fexpr_t is our return type and the type for elements within the expr. -// Note we cope with zero-but-true in order to implement a basic -// "exists(something)" check where "something" may even be zero. -// -// Eg in the aux tag searching syntax, "[NM]" should return true if -// NM tag exists even if zero. -// Take care when negating this. "[NM] != 0" will be true when -// [NM] is absent, thus consider "[NM] && [NM] != 0". 
-typedef struct { - char is_str; // Use .s vs .d - char is_true; // Force true if even zero - kstring_t s; // is_str and empty s permitted (eval as false) - double d; // otherwise this -} fexpr_t; - -#define FEXPR_INIT {0, 0, KS_INITIALIZE, 0} - -// Create a SAM filter for expression "str". -// -// Returns a pointer on success, -// NULL on failure -sam_filter_t *sam_filter_init(const char *str); - -// Frees a sam_filter_t created via sam_filter_init -void sam_filter_free(sam_filter_t *filt); - -typedef int (sym_func)(void *data, char *str, char **end, fexpr_t *res); -int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *f, fexpr_t *res); - -static inline void fexpr_free(fexpr_t *f) { - ks_free(&f->s); -} - -#endif /* HTS_EXPR_H */ diff --git a/hts.c b/hts.c index 519b5b809..f854ed725 100644 --- a/hts.c +++ b/hts.c @@ -50,7 +50,7 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_internal.h" #include "hfile_internal.h" #include "sam_internal.h" -#include "expr.h" +#include "htslib/hts_expr.h" #include "htslib/hts_os.h" // drand48 #include "htslib/khash.h" @@ -825,9 +825,9 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "LEVEL") == 0) o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0); - else if (strcmp(o->arg, "sam_filter") == 0 || - strcmp(o->arg, "SAM_FILTER") == 0) - o->opt = HTS_OPT_SAM_FILTER, o->val.s = val; + else if (strcmp(o->arg, "filter") == 0 || + strcmp(o->arg, "FILTER") == 0) + o->opt = HTS_OPT_FILTER, o->val.s = val; else { hts_log_error("Unknown option '%s'", o->arg); @@ -868,7 +868,7 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { // fall through case CRAM_OPT_VERSION: case CRAM_OPT_PREFIX: - case HTS_OPT_SAM_FILTER: + case HTS_OPT_FILTER: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) return -1; break; @@ -1237,7 +1237,7 @@ int hts_close(htsFile *fp) save = errno; sam_hdr_destroy(fp->bam_header); hts_idx_destroy(fp->idx); - sam_filter_free(fp->filter); + hts_filter_free(fp->filter); free(fp->fn); 
free(fp->fn_aux); free(fp->line.s); @@ -1342,7 +1342,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { fp->fp.bgzf->compress_level = level; } - case HTS_OPT_SAM_FILTER: { + case HTS_OPT_FILTER: { va_start(args, opt); char *expr = va_arg(args, char *); va_end(args); @@ -1413,12 +1413,12 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux) int hts_set_filter_expression(htsFile *fp, const char *expr) { if (fp->filter) - sam_filter_free(fp->filter); + hts_filter_free(fp->filter); if (!expr) return 0; - return (fp->filter = sam_filter_init(expr)) + return (fp->filter = hts_filter_init(expr)) ? 0 : -1; } diff --git a/expr.c b/hts_expr.c similarity index 83% rename from expr.c rename to hts_expr.c index 73df14dd1..3d00ed339 100644 --- a/expr.c +++ b/hts_expr.c @@ -1,4 +1,4 @@ -/* expr.c -- filter expression parsing and processing. +/* hts_expr.c -- filter expression parsing and processing. Copyright (C) 2020 Genome Research Ltd. @@ -36,12 +36,12 @@ DEALINGS IN THE SOFTWARE. */ #include #include // may need configure rule for this -#include "expr.h" +#include "htslib/hts_expr.h" #include "textutils_internal.h" -// Could also cache fexpr_t stack here for kstring reuse? +// Could also cache hts_expr_val_t stack here for kstring reuse? #define MAX_REGEX 10 -struct sam_filter_t { +struct hts_filter_t { char *str; int parsed; int curr_regex, max_regex; @@ -73,8 +73,8 @@ static char *ws(char *str) { return str; } -static int expression(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res); +static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res); /* * simple_expr @@ -83,8 +83,8 @@ static int expression(sam_filter_t *filt, void *data, sym_func *fn, * // | string ? 
* | '(' expression ')' */ -static int simple_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { // Main recursion step str = ws(str); if (*str == '(') { @@ -159,8 +159,8 @@ static int simple_expr(sam_filter_t *filt, void *data, sym_func *fn, * | '!' unary_expr // higher precedence * | '~' unary_expr // higher precedence */ -static int unary_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { int err; str = ws(str); if (*str == '+') { @@ -202,19 +202,19 @@ static int unary_expr(sam_filter_t *filt, void *data, sym_func *fn, * | unary_expr '%' unary_expr * )* */ -static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (unary_expr(filt, data, fn, str, end, res)) return -1; str = *end; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); if (*str == '*' || *str == '/' || *str == '%') { if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; if (val.is_str || res->is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; // arith on strings } } @@ -230,7 +230,7 @@ static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, str = *end; } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -242,19 +242,19 @@ static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, * | mul_expr '-' mul_expr * )* */ -static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char 
*str, char **end, hts_expr_val_t *res) { if (mul_expr(filt, data, fn, str, end, res)) return -1; str = *end; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); if (*str == '+' || *str == '-') { if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; if (val.is_str || res->is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; // arith on strings } } @@ -268,7 +268,7 @@ static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, str = *end; } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -278,17 +278,17 @@ static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, * : add_expr * | bitand_expr '&' add_expr */ -static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (add_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '&' && str[1] != '&') { if (add_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } res->is_true = res->d = (int64_t)res->d & (int64_t)val.d; @@ -296,7 +296,7 @@ static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -306,17 +306,17 @@ static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, * : bitand_expr * | bitxor_expr '^' bitand_expr */ -static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (bitand_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = 
HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '^') { if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } res->is_true = res->d = (int64_t)res->d ^ (int64_t)val.d; @@ -324,7 +324,7 @@ static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -334,17 +334,17 @@ static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, * : xor_expr * | bitor_expr '|' xor_expr */ -static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (bitxor_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '|' && str[1] != '|') { if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } res->is_true = res->d = (int64_t)res->d | (int64_t)val.d; @@ -352,7 +352,7 @@ static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -365,12 +365,12 @@ static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, * | cmp_expr '>=' bitor_expr * | cmp_expr '>' bitor_expr */ -static int cmp_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (bitor_expr(filt, data, fn, str, end, res)) return -1; str = ws(*end); - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; int err = 0; if (*str == '>' && str[1] == '=') { @@ -398,7 +398,7 @@ static int cmp_expr(sam_filter_t 
*filt, void *data, sym_func *fn, : !res->is_str && !val.is_str && res->d < val.d; res->is_str = 0; } - fexpr_free(&val); + hts_expr_val_free(&val); return err ? -1 : 0; } @@ -411,14 +411,14 @@ static int cmp_expr(sam_filter_t *filt, void *data, sym_func *fn, * | eq_expr '=~' cmp_expr * | eq_expr '!~' cmp_expr */ -static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (cmp_expr(filt, data, fn, str, end, res)) return -1; str = ws(*end); int err = 0; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; // numeric vs numeric comparison is as expected // string vs string comparison is as expected @@ -447,7 +447,7 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, (str[0] == '!' && str[1] == '~')) { err = eq_expr(filt, data, fn, str+2, end, &val); if (!val.is_str || !res->is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) { @@ -466,7 +466,7 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, char errbuf[1024]; regerror(ec, preg, errbuf, 1024); fprintf(stderr, "Failed regex: %.1024s\n", errbuf); - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } } else { @@ -485,7 +485,7 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, } res->is_str = 0; } - fexpr_free(&val); + hts_expr_val_free(&val); return err ? 
-1 : 0; } @@ -496,11 +496,11 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, * | and_expr 'and' eq_expr * | and_expr 'or' eq_expr */ -static int and_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (eq_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (str[0] == '&' && str[1] == '&') { @@ -519,18 +519,18 @@ static int and_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } -static int expression(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { return and_expr(filt, data, fn, str, end, res); } -sam_filter_t *sam_filter_init(const char *str) { - sam_filter_t *f = calloc(1, sizeof(*f)); +hts_filter_t *hts_filter_init(const char *str) { + hts_filter_t *f = calloc(1, sizeof(*f)); if (!f) return NULL; // Oversize to permit faster comparisons with memcmp over strcmp @@ -543,7 +543,7 @@ sam_filter_t *sam_filter_init(const char *str) { return f; } -void sam_filter_free(sam_filter_t *filt) { +void hts_filter_free(hts_filter_t *filt) { if (!filt) return; @@ -555,8 +555,9 @@ void sam_filter_free(sam_filter_t *filt) { free(filt); } -int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *fn, - fexpr_t *res) { +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { char *end = NULL; memset(res, 0, sizeof(*res)); diff --git a/htslib/hts.h b/htslib/hts.h index 9ccbb986b..8360e2cf1 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "hts_defs.h" #include "hts_log.h" +#include "hts_expr.h" #include "kstring.h" #include "kroundup.h" @@ -224,7 +225,6 @@ typedef struct htsFormat { struct hts_idx_t; typedef struct hts_idx_t hts_idx_t; -typedef struct sam_filter_t sam_filter_t; /** * @brief File handle returned by hts_open() etc. @@ -257,7 +257,7 @@ typedef struct htsFile { hts_idx_t *idx; const char *fnidx; struct sam_hdr_t *bam_header; - sam_filter_t *filter; + hts_filter_t *filter; } htsFile; // A combined thread pool and queue allocation size. @@ -323,7 +323,7 @@ enum hts_fmt_option { HTS_OPT_THREAD_POOL, HTS_OPT_CACHE_SIZE, HTS_OPT_BLOCK_SIZE, - HTS_OPT_SAM_FILTER, + HTS_OPT_FILTER, }; // For backwards compatibility diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h new file mode 100644 index 000000000..c628d2e6f --- /dev/null +++ b/htslib/hts_expr.h @@ -0,0 +1,97 @@ +/* hts_expr.h -- filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTS_EXPR_H +#define HTS_EXPR_H + +#include +#include "hts_defs.h" + +/// Holds a filter variable. This is also used to return the results. +/** + * Note we cope with zero-but-true in order to implement a basic + * "exists(something)" check where "something" may even be zero. + * + * Eg in the aux tag searching syntax, "[NM]" should return true if + * NM tag exists even if zero. + * Take care when negating this. "[NM] != 0" will be true when + * [NM] is absent, thus consider "[NM] && [NM] != 0". + */ +typedef struct { + char is_str; // Use .s vs .d + char is_true; // Force true even if zero + kstring_t s; // is_str and empty s permitted (eval as false) + double d; // otherwise this +} hts_expr_val_t; + +/// Frees an hts_expr_val_t. +static inline void hts_expr_val_free(hts_expr_val_t *f) { + ks_free(&f->s); +} + +/// Opaque hts_filter_t type. Definition in hts_expr.c +typedef struct hts_filter_t hts_filter_t; + +/// For static initialisation of hts_expr_val_t values +#define HTS_EXPR_VAL_INIT {0, 0, KS_INITIALIZE, 0} + +/// Creates a filter for expression "str". +/** @param str The filter expression + * @return A pointer on success, NULL on failure + */ +HTSLIB_EXPORT +hts_filter_t *hts_filter_init(const char *str); + +/// Frees an hts_filter_t created via hts_filter_init +/** @param filt The filter pointer. + */ +HTSLIB_EXPORT +void hts_filter_free(hts_filter_t *filt); + +/// Type for expression symbol lookups; name -> value.
+typedef int (hts_expr_sym_func)(void *data, char *str, char **end, + hts_expr_val_t *res); + +/// Evaluates a filter expression and returns the value +/** @param filt The filter, produced by hts_filter_init + * @param data Arbitrary caller data, passed into sym_func + * @param sym_func Callback function to lookup variables. + * @param res Filled out with the result of the filter evaluation + * @return Returns 0 on success, -1 on failure + * + * sym_func and data may be NULL if the caller does not need its own data + * pointer or if it has no variables to lookup. + * + * The type of the returned result may be numeric or string, as defined by + * the is_str member. It can also be explicitly defined to be true even + * for a null value. This may be used to check for the existence of + * something, irrespective of whether that something evaluates to zero. + */ +HTSLIB_EXPORT +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *sym_func, + hts_expr_val_t *res); + + +#endif /* HTS_EXPR_H */ diff --git a/htslib/sam.h b/htslib/sam.h index bfd7855a6..3c7b9b693 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1408,6 +1408,15 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; +/// sam_passes_filter - Checks whether a record passes an hts_filter. +/** @param h Pointer to the header structure previously read + * @param b Pointer to the BAM record to be checked + * @param filt Pointer to the filter, created from hts_filter_init. + * @return 1 if passes, 0 if not, and <0 on error.
+ */ +HTSLIB_EXPORT +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt); + /************************************* *** Manipulating auxiliary fields *** *************************************/ diff --git a/htslib_vars.mk b/htslib_vars.mk index f22c7c22b..fa907f50f 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -29,9 +29,10 @@ htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h $(htslib_hts_defs_h) htslib_cram_h = $(HTSPREFIX)htslib/cram.h $(htslib_hts_defs_h) $(htslib_hts_h) $(htslib_sam_h) htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h $(htslib_hts_defs_h) $(htslib_hts_h) htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) -htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) +htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) $(htslib_hts_expr_h) htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h htslib_hts_endian_h = $(HTSPREFIX)htslib/hts_endian.h +htslib_hts_expr_h = $(HTSPREFIX)htslib/hts_expr.h htslib_hts_log_h = $(HTSPREFIX)htslib/hts_log.h $(htslib_hts_defs_h) htslib_hts_os_h = $(HTSPREFIX)htslib/hts_os.h $(htslib_hts_defs_h) htslib_kbitset_h = $(HTSPREFIX)htslib/kbitset.h diff --git a/sam.c b/sam.c index 93a9a994a..d044046eb 100644 --- a/sam.c +++ b/sam.c @@ -50,7 +50,6 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hfile.h" #include "htslib/hts_endian.h" #include "header.h" -#include "expr.h" #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) @@ -1099,20 +1098,288 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_po return ret; } +// Bam record pointer and SAM header combined +typedef struct { + const sam_hdr_t *h; + const bam1_t *b; +} hb_pair; + +// Looks up variable names in str and replaces them with their value. +// Also supports aux tags. 
+// +// Note the expression parser deliberately overallocates str size so it +// is safe to use memcmp over strcmp. +static int bam_sym_lookup(void *data, char *str, char **end, + hts_expr_val_t *res) { + hb_pair *hb = (hb_pair *)data; + const bam1_t *b = hb->b; + + res->is_str = 0; + switch(*str) { + case 'c': + if (memcmp(str, "cigar", 5) == 0) { + *end = str+5; + res->is_str = 1; + ks_clear(&res->s); + uint32_t *cigar = bam_get_cigar(b); + int i, n = b->core.n_cigar, r = 0; + for (i = 0; i < n; i++) { + r |= kputw (bam_cigar_oplen(cigar[i]), &res->s); + r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s); + } + kputs("", &res->s); + return r ? 0 : -1; + } + break; + + case 'f': + if (memcmp(str, "flag", 4) == 0) { + str = *end = str+4; + if (*str != '.') { + res->d = b->core.flag; + return 0; + } else { + str++; + if (!memcmp(str, "paired", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FPAIRED; + return 0; + } else if (!memcmp(str, "proper_pair", 11)) { + *end = str+11; + res->d = b->core.flag & BAM_FPROPER_PAIR; + return 0; + } else if (!memcmp(str, "unmap", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FUNMAP; + return 0; + } else if (!memcmp(str, "munmap", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FMUNMAP; + return 0; + } else if (!memcmp(str, "reverse", 7)) { + *end = str+7; + res->d = b->core.flag & BAM_FREVERSE; + return 0; + } else if (!memcmp(str, "mreverse", 8)) { + *end = str+8; + res->d = b->core.flag & BAM_FMREVERSE; + return 0; + } else if (!memcmp(str, "read1", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD1; + return 0; + } else if (!memcmp(str, "read2", 6)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD2; + return 0; + } else if (!memcmp(str, "secondary", 9)) { + *end = str+9; + res->d = b->core.flag & BAM_FSECONDARY; + return 0; + } else if (!memcmp(str, "qcfail", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FQCFAIL; + return 0; + } else if (!memcmp(str, "dup", 3)) { + *end = str+3; + res->d = 
b->core.flag & BAM_FDUP; + return 0; + } else if (!memcmp(str, "supplementary", 13)) { + *end = str+13; + res->d = b->core.flag & BAM_FSUPPLEMENTARY; + return 0; + } else { + hts_log_error("Unrecognised flag string"); + return -1; + } + } + } + break; + + case 'm': + if (memcmp(str, "mapq", 4) == 0) { + *end = str+4; + res->d = b->core.qual; + return 0; + } else if (memcmp(str, "mpos", 4) == 0) { + *end = str+4; + res->d = b->core.mpos+1; + return 0; + } else if (memcmp(str, "mrname", 6) == 0) { + *end = str+6; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "mrefid", 6) == 0) { + *end = str+6; + res->d = b->core.mtid; + return 0; + } + break; + + case 'n': + if (memcmp(str, "ncigar", 6) == 0) { + *end = str+6; + res->d = b->core.n_cigar; + return 0; + } + break; + + case 'p': + if (memcmp(str, "pos", 3) == 0) { + *end = str+3; + res->d = b->core.pos+1; + return 0; + } else if (memcmp(str, "pnext", 5) == 0) { + *end = str+5; + res->d = b->core.mpos+1; + return 0; + } + break; + + case 'q': + if (memcmp(str, "qlen", 4) == 0) { + *end = str+4; + res->d = b->core.l_qseq; + return 0; + } else if (memcmp(str, "qname", 5) == 0) { + *end = str+5; + res->is_str = 1; + kputs(bam_get_qname(b), ks_clear(&res->s)); + return 0; + } + break; + + case 'r': + if (memcmp(str, "rlen", 4) == 0) { + *end = str+4; + res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + return 0; + } else if (memcmp(str, "rname", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "rnext", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? 
rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "refid", 5) == 0) { + *end = str+5; + res->d = b->core.tid; + return 0; + } + break; + + case 't': + if (memcmp(str, "tlen", 4) == 0) { + *end = str+4; + res->d = b->core.isize; + return 0; + } + break; + + case '[': + if (*str == '[' && str[1] && str[2] && str[3] == ']') { + /* aux tags */ + *end = str+4; + + uint8_t *aux = bam_aux_get(b, str+1); + if (aux) { + // we define the truth of a tag to be its presence, even if 0. + res->is_true = 1; + switch (*aux) { + case 'Z': + case 'H': + res->is_str = 1; + kputs((char *)aux+1, ks_clear(&res->s)); + break; + + case 'A': + res->is_str = 1; + kputsn((char *)aux+1, 1, ks_clear(&res->s)); + break; + + case 'i': case 'I': + case 's': case 'S': + case 'c': case 'C': + res->is_str = 0; + res->d = bam_aux2i(aux); + break; + + case 'f': + case 'd': + res->is_str = 0; + res->d = bam_aux2f(aux); + break; + + default: + hts_log_error("Aux type '%c not yet supported by filters", + *aux); + return -1; + } + return 0; + + } else { + // hence absent tags are always false (and strings) + res->is_str = 1; + res->s.l = 0; + res->d = 0; + res->is_true = 0; + return 0; + } + } + break; + } + + // All successful matches in switch should return 0. + // So if we didn't match, it's a parse error. + return -1; +} + +// Returns 1 when accepted by the filter, 0 if not, -1 on error. +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) +{ + hb_pair hb = {h, b}; + hts_expr_val_t res; + if (hts_filter_eval(filt, &hb, bam_sym_lookup, &res)) { + hts_log_error("Couldn't process filter expression"); + hts_expr_val_free(&res); + return -1; + } + + int t = res.is_true; + hts_expr_val_free(&res); + + return t; +} + static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = fpv; bam1_t *b = bv; - int ret = cram_get_bam_seq(fp->fp.cram, &b); - if (ret < 0) - return cram_eof(fp->fp.cram) ? 
-1 : -2; + int filtered, ret; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; + do { + ret = cram_get_bam_seq(fp->fp.cram, &b); + if (ret < 0) + return cram_eof(fp->fp.cram) ? -1 : -2; - *tid = b->core.tid; - *beg = b->core.pos; - *end = bam_endpos(b); + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; + + *tid = b->core.tid; + *beg = b->core.pos; + *end = bam_endpos(b); + + filtered = sam_passes_filter(fp->bam_header, b, fp->filter); + if (filtered < 0) + return -2; + } while (filtered == 0); return ret; } @@ -3096,451 +3363,221 @@ static void *sam_format_worker(void *arg) { } pthread_mutex_unlock(&fd->lines_m); - return gl; - - err: - // Possible race between this and fd->curr_bam. - // Easier to not free and leave it on the input list so it - // gets freed there instead? - // sam_free_sp_bams(gb); - if (gl) { - free(gl->data); - free(gl); - } - return NULL; -} - -int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { - if (fp->state) - return 0; - - if (!(fp->state = sam_state_create(fp))) - return -1; - SAM_state *fd = (SAM_state *)fp->state; - - pthread_mutex_init(&fd->lines_m, NULL); - pthread_mutex_init(&fd->command_m, NULL); - pthread_cond_init(&fd->command_c, NULL); - fd->p = p->pool; - int qsize = p->qsize; - if (!qsize) - qsize = 2*hts_tpool_size(fd->p); - fd->q = hts_tpool_process_init(fd->p, qsize, 0); - if (!fd->q) { - sam_state_destroy(fp); - return -1; - } - - if (fp->format.compression == bgzf) - return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); - - return 0; -} - -int sam_set_threads(htsFile *fp, int nthreads) { - if (nthreads <= 0) - return 0; - - htsThreadPool p; - p.pool = hts_tpool_init(nthreads); - p.qsize = nthreads*2; - - int ret = sam_set_thread_pool(fp, &p); - if (ret < 0) - return ret; - - SAM_state *fd = (SAM_state *)fp->state; - fd->own_pool = 1; - - return 0; -} - -// Bam record pointer and SAM header combined -typedef struct { - const sam_hdr_t *h; - const bam1_t *b; -} hb_pair; - -// Looks up variable names in str and replaces 
them with their value. -// Also supports aux tags. -// -// Note the expression parser deliberately overallocates str size so it -// is safe to use memcmp over strcmp. -static int bam_sym_lookup(void *data, char *str, char **end, fexpr_t *res) { - hb_pair *hb = (hb_pair *)data; - const bam1_t *b = hb->b; - - res->is_str = 0; - switch(*str) { - case 'c': - if (memcmp(str, "cigar", 5) == 0) { - *end = str+5; - res->is_str = 1; - ks_clear(&res->s); - uint32_t *cigar = bam_get_cigar(b); - int i, n = b->core.n_cigar, r = 0; - for (i = 0; i < n; i++) { - r |= kputw (bam_cigar_oplen(cigar[i]), &res->s); - r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s); - } - kputs("", &res->s); - return r ? 0 : -1; - } - break; - - case 'f': - if (memcmp(str, "flag", 4) == 0) { - str = *end = str+4; - if (*str != '.') { - res->d = b->core.flag; - return 0; - } else { - str++; - if (!memcmp(str, "paired", 6)) { - *end = str+6; - res->d = b->core.flag & BAM_FPAIRED; - return 0; - } else if (!memcmp(str, "proper_pair", 11)) { - *end = str+11; - res->d = b->core.flag & BAM_FPROPER_PAIR; - return 0; - } else if (!memcmp(str, "unmap", 5)) { - *end = str+5; - res->d = b->core.flag & BAM_FUNMAP; - return 0; - } else if (!memcmp(str, "munmap", 6)) { - *end = str+6; - res->d = b->core.flag & BAM_FMUNMAP; - return 0; - } else if (!memcmp(str, "reverse", 7)) { - *end = str+7; - res->d = b->core.flag & BAM_FREVERSE; - return 0; - } else if (!memcmp(str, "mreverse", 8)) { - *end = str+8; - res->d = b->core.flag & BAM_FMREVERSE; - return 0; - } else if (!memcmp(str, "read1", 5)) { - *end = str+5; - res->d = b->core.flag & BAM_FREAD1; - return 0; - } else if (!memcmp(str, "read2", 6)) { - *end = str+5; - res->d = b->core.flag & BAM_FREAD2; - return 0; - } else if (!memcmp(str, "secondary", 9)) { - *end = str+9; - res->d = b->core.flag & BAM_FSECONDARY; - return 0; - } else if (!memcmp(str, "qcfail", 6)) { - *end = str+6; - res->d = b->core.flag & BAM_FQCFAIL; - return 0; - } else if (!memcmp(str, "dup", 
3)) { - *end = str+3; - res->d = b->core.flag & BAM_FDUP; - return 0; - } else if (!memcmp(str, "supplementary", 13)) { - *end = str+13; - res->d = b->core.flag & BAM_FSUPPLEMENTARY; - return 0; - } else { - hts_log_error("Unrecognised flag string"); - return -1; - } - } - } - break; - - case 'm': - if (memcmp(str, "mapq", 4) == 0) { - *end = str+4; - res->d = b->core.qual; - return 0; - } else if (memcmp(str, "mpos", 4) == 0) { - *end = str+4; - res->d = b->core.mpos+1; - return 0; - } else if (memcmp(str, "mrname", 6) == 0) { - *end = str+6; - res->is_str = 1; - const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); - kputs(rn ? rn : "*", ks_clear(&res->s)); - return 0; - } else if (memcmp(str, "mrefid", 6) == 0) { - *end = str+6; - res->d = b->core.mtid; - return 0; - } - break; + return gl; - case 'n': - if (memcmp(str, "ncigar", 6) == 0) { - *end = str+6; - res->d = b->core.n_cigar; - return 0; - } - break; + err: + // Possible race between this and fd->curr_bam. + // Easier to not free and leave it on the input list so it + // gets freed there instead? 
+ // sam_free_sp_bams(gb); + if (gl) { + free(gl->data); + free(gl); + } + return NULL; +} - case 'p': - if (memcmp(str, "pos", 3) == 0) { - *end = str+3; - res->d = b->core.pos+1; - return 0; - } else if (memcmp(str, "pnext", 5) == 0) { - *end = str+5; - res->d = b->core.mpos+1; - return 0; - } - break; +int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { + if (fp->state) + return 0; - case 'q': - if (memcmp(str, "qlen", 4) == 0) { - *end = str+4; - res->d = b->core.l_qseq; - return 0; - } else if (memcmp(str, "qname", 5) == 0) { - *end = str+5; - res->is_str = 1; - kputs(bam_get_qname(b), ks_clear(&res->s)); - return 0; - } - break; + if (!(fp->state = sam_state_create(fp))) + return -1; + SAM_state *fd = (SAM_state *)fp->state; - case 'r': - if (memcmp(str, "rlen", 4) == 0) { - *end = str+4; - res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); - return 0; - } else if (memcmp(str, "rname", 5) == 0) { - *end = str+5; - res->is_str = 1; - const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); - kputs(rn ? rn : "*", ks_clear(&res->s)); - return 0; - } else if (memcmp(str, "rnext", 5) == 0) { - *end = str+5; - res->is_str = 1; - const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); - kputs(rn ? 
rn : "*", ks_clear(&res->s)); - return 0; - } else if (memcmp(str, "refid", 5) == 0) { - *end = str+5; - res->d = b->core.tid; - return 0; - } - break; + pthread_mutex_init(&fd->lines_m, NULL); + pthread_mutex_init(&fd->command_m, NULL); + pthread_cond_init(&fd->command_c, NULL); + fd->p = p->pool; + int qsize = p->qsize; + if (!qsize) + qsize = 2*hts_tpool_size(fd->p); + fd->q = hts_tpool_process_init(fd->p, qsize, 0); + if (!fd->q) { + sam_state_destroy(fp); + return -1; + } - case 't': - if (memcmp(str, "tlen", 4) == 0) { - *end = str+4; - res->d = b->core.isize; - return 0; - } - break; + if (fp->format.compression == bgzf) + return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); - case '[': - if (*str == '[' && str[1] && str[2] && str[3] == ']') { - /* aux tags */ - *end = str+4; + return 0; +} - uint8_t *aux = bam_aux_get(b, str+1); - if (aux) { - // we define the truth of a tag to be its presence, even if 0. - res->is_true = 1; - switch (*aux) { - case 'Z': - case 'H': - res->is_str = 1; - kputs((char *)aux+1, ks_clear(&res->s)); - break; +int sam_set_threads(htsFile *fp, int nthreads) { + if (nthreads <= 0) + return 0; - case 'A': - res->is_str = 1; - kputsn((char *)aux+1, 1, ks_clear(&res->s)); - break; + htsThreadPool p; + p.pool = hts_tpool_init(nthreads); + p.qsize = nthreads*2; - case 'i': case 'I': - case 's': case 'S': - case 'c': case 'C': - res->is_str = 0; - res->d = bam_aux2i(aux); - break; + int ret = sam_set_thread_pool(fp, &p); + if (ret < 0) + return ret; - case 'f': - case 'd': - res->is_str = 0; - res->d = bam_aux2f(aux); - break; + SAM_state *fd = (SAM_state *)fp->state; + fd->own_pool = 1; - default: - hts_log_error("Aux type '%c not yet supported by filters", - *aux); - return -1; - } - return 0; + return 0; +} - } else { - // hence absent tags are always false (and strings) - res->is_str = 1; - res->s.l = 0; - res->d = 0; - res->is_true = 0; - return 0; - } +// Internal component of sam_read1 below +static int sam_read1_bam(htsFile 
*fp, sam_hdr_t *h, bam1_t *b) { + int ret = bam_read1(fp->fp.bgzf, b); + if (h && ret >= 0) { + if (b->core.tid >= h->n_targets || b->core.tid < -1 || + b->core.mtid >= h->n_targets || b->core.mtid < -1) { + errno = ERANGE; + return -3; } - break; } + return ret; +} - // All successful matches in switch should return 0. - // So if we didn't match, it's a parse error. - return -1; +// Internal component of sam_read1 below +static int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { + int ret = cram_get_bam_seq(fp->fp.cram, b); + if (ret < 0) + return cram_eof(fp->fp.cram) ? -1 : -2; + + if (bam_tag2cigar(*b, 1, 1) < 0) + return -2; + + return ret; } -// Returns 0 on success, -// -1 on EOF, -// <-1 on error -int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) -{ +// Internal component of sam_read1 below +static int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret; - filtered: - switch (fp->format.format) { - case bam: - ret = bam_read1(fp->fp.bgzf, b); - if (h && ret >= 0) { - if (b->core.tid >= h->n_targets || b->core.tid < -1 || - b->core.mtid >= h->n_targets || b->core.mtid < -1) { - errno = ERANGE; - return -3; + // Consume 1st line after header parsing as it wasn't using peek + if (fp->line.l != 0) { + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + return ret; + } + + if (fp->state) { + SAM_state *fd = (SAM_state *)fp->state; + + if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { + // We don't support multi-threaded SAM parsing with seeks yet. + int ret; + if ((ret = sam_state_destroy(fp)) < 0) { + errno = -ret; + return -2; } + if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) + return -1; + fp->fp.bgzf->seeked = 0; + goto err_recover; } - break; - - case cram: - ret = cram_get_bam_seq(fp->fp.cram, &b); - if (ret < 0) - return cram_eof(fp->fp.cram) ? 
-1 : -2; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; - break; + if (!fd->h) { + fd->h = h; + fd->h->ref_count++; + // Ensure hrecs is initialised now as we don't want multiple + // threads trying to do this simultaneously. + if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) + return -2; - case sam: { - // Consume 1st line after header parsing as it wasn't using peek - if (fp->line.l != 0) { - ret = sam_parse1(&fp->line, h, b); - fp->line.l = 0; - break; + // We can only do this once we've got a header + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, + fp) != 0) + return -2; + fd->dispatcher_set = 1; } - if (fp->state) { - SAM_state *fd = (SAM_state *)fp->state; + if (fd->h != h) { + hts_log_error("SAM multi-threaded decoding does not support changing header"); + return -1; + } - if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { - // We don't support multi-threaded SAM parsing with seeks yet. - int ret; - if ((ret = sam_state_destroy(fp)) < 0) { - errno = -ret; - return -2; - } - if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) - return -1; - fp->fp.bgzf->seeked = 0; - goto err_recover; + sp_bams *gb = fd->curr_bam; + if (!gb) { + if (fd->errcode) { + // In case reader failed + errno = fd->errcode; + return -2; } + hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); + if (!r) + return -2; + fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); + hts_tpool_delete_result(r, 0); + } + if (!gb) + return fd->errcode ? -2 : -1; + bam1_t *b_array = (bam1_t *)gb->bams; + if (fd->curr_idx < gb->nbams) + if (!bam_copy1(b, &b_array[fd->curr_idx++])) + return -2; + if (fd->curr_idx == gb->nbams) { + pthread_mutex_lock(&fd->lines_m); + gb->next = fd->bams; + fd->bams = gb; + pthread_mutex_unlock(&fd->lines_m); - if (!fd->h) { - fd->h = h; - fd->h->ref_count++; - // Ensure hrecs is initialised now as we don't want multiple - // threads trying to do this simultaneously. 
- if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) - return -2; + fd->curr_bam = NULL; + fd->curr_idx = 0; + } - // We can only do this once we've got a header - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, - fp) != 0) - return -2; - fd->dispatcher_set = 1; - } + ret = 0; - if (fd->h != h) { - hts_log_error("SAM multi-threaded decoding does not support changing header"); - return -1; - } + } else { + err_recover: + ret = hts_getline(fp, KS_SEP_LINE, &fp->line); + if (ret < 0) return ret; - sp_bams *gb = fd->curr_bam; - if (!gb) { - if (fd->errcode) { - // In case reader failed - errno = fd->errcode; - return -2; - } - hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); - if (!r) - return -2; - fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); - hts_tpool_delete_result(r, 0); - } - if (!gb) - return fd->errcode ? -2 : -1; - bam1_t *b_array = (bam1_t *)gb->bams; - if (fd->curr_idx < gb->nbams) - if (!bam_copy1(b, &b_array[fd->curr_idx++])) - return -2; - if (fd->curr_idx == gb->nbams) { - pthread_mutex_lock(&fd->lines_m); - gb->next = fd->bams; - fd->bams = gb; - pthread_mutex_unlock(&fd->lines_m); + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + if (ret < 0) { + hts_log_warning("Parse error at line %lld", (long long)fp->lineno); + if (h->ignore_sam_err) goto err_recover; + } + } - fd->curr_bam = NULL; - fd->curr_idx = 0; - } + return ret; +} - ret = 0; +// Returns 0 on success, +// -1 on EOF, +// <-1 on error +int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) +{ + int ret, pass_filter; - } else { - err_recover: - ret = hts_getline(fp, KS_SEP_LINE, &fp->line); - if (ret < 0) return ret; + do { + switch (fp->format.format) { + case bam: + ret = sam_read1_bam(fp, h, b); + break; - ret = sam_parse1(&fp->line, h, b); - fp->line.l = 0; - if (ret < 0) { - hts_log_warning("Parse error at line %lld", (long long)fp->lineno); - if (h->ignore_sam_err) goto err_recover; - } - } - break; - } + case cram: + ret = sam_read1_cram(fp, h, 
&b); + break; - case empty_format: - errno = EPIPE; - return -3; + case sam: { + ret = sam_read1_sam(fp, h, b); + break; + } - default: - errno = EFTYPE; - return -3; - } + case empty_format: + errno = EPIPE; + return -3; - if (ret >= 0 && fp->filter) { - // Process on-the-fly filter rules - hb_pair hb = {h, b}; - fexpr_t res; - if (sam_filter_eval(fp->filter, &hb, bam_sym_lookup, &res)) { - hts_log_error("Couldn't process filter expression"); - fexpr_free(&res); - return -1; + default: + errno = EFTYPE; + return -3; } - int t = res.is_true; - fexpr_free(&res); - if (!t) - goto filtered; - } - return ret; + pass_filter = (ret >= 0 && fp->filter) + ? sam_passes_filter(h, b, fp->filter) + : 1; + } while (pass_filter == 0); + + return pass_filter < 0 ? -2 : ret; } diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index f5558a41b..4831c8df8 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -33,14 +33,14 @@ # Command to execute. $tv is replaced with the path to test_view # String matches -P string1.out $tv -i 'sam_filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam -P string2.out $tv -i 'sam_filter=rname=="CHROMOSOME_II"' ../ce#5b.sam -P string3.out $tv -i 'sam_filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam -P string4.out $tv -i 'sam_filter=cigar=~"D"' ../ce#1000.sam +P string1.out $tv -i 'filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam +P string2.out $tv -i 'filter=rname=="CHROMOSOME_II"' ../ce#5b.sam +P string3.out $tv -i 'filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam +P string4.out $tv -i 'filter=cigar=~"D"' ../ce#1000.sam # Integer ops -P int1.out $tv -i 'sam_filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' -P int2.out $tv -i 'sam_filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' +P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' +P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' # Aux tags -P int3.out $tv -i 'sam_filter=[NM]>=10 || [MD]=~"A.*A.*A"' 
-t4 ../ce#1000.sam |egrep -cv '^@' +P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' diff --git a/test/test_expr.c b/test/test_expr.c index 7ad83fcd2..258548b88 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -26,9 +26,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include "../expr.h" +#include "../htslib/hts_expr.h" -int lookup(void *data, char *str, char **end, fexpr_t *res) { +int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { int foo = 15551; // my favourite palindromic prime int a = 1; int b = 2; @@ -183,12 +183,12 @@ int test(void) { }; int i; - fexpr_t r; + hts_expr_val_t r; for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { - sam_filter_t *filt = sam_filter_init(tests[i].str); + hts_filter_t *filt = hts_filter_init(tests[i].str); if (!filt) return 1; - if (sam_filter_eval(filt, NULL, lookup, &r)) { + if (hts_filter_eval(filt, NULL, lookup, &r)) { fprintf(stderr, "Failed to parse filter string %s\n", tests[i].str); return 1; @@ -205,8 +205,8 @@ int test(void) { return 1; } - fexpr_free(&r); - sam_filter_free(filt); + hts_expr_val_free(&r); + hts_filter_free(filt); } return 0; @@ -214,9 +214,9 @@ int test(void) { int main(int argc, char **argv) { if (argc > 1) { - fexpr_t v; - sam_filter_t *filt = sam_filter_init(argv[1]); - if (sam_filter_eval(filt, NULL, lookup, &v)) + hts_expr_val_t v; + hts_filter_t *filt = hts_filter_init(argv[1]); + if (hts_filter_eval(filt, NULL, lookup, &v)) return 1; if (v.is_str) @@ -224,8 +224,8 @@ int main(int argc, char **argv) { else printf("%g\n", v.d); - fexpr_free(&v); - sam_filter_free(filt); + hts_expr_val_free(&v); + hts_filter_free(filt); return 0; } From fc2d03e70f60bf553303a2fbb6211fbfa59822dd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 25 Nov 2020 16:10:22 +0000 Subject: [PATCH 031/114] Add missing return to HTS_OPT_COMPRESSION_LEVEL option. 
--- hts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hts.c b/hts.c index f854ed725..9e50290cf 100644 --- a/hts.c +++ b/hts.c @@ -1340,6 +1340,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_end(args); if (fp->is_bgzf) fp->fp.bgzf->compress_level = level; + return 0; } case HTS_OPT_FILTER: { From c7c433fd314d70b650e31af4c6b07f004bf17962 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 30 Nov 2020 16:43:50 +0000 Subject: [PATCH 032/114] Filter "qlen" tweak, add seq/qual/library, and add functions. "seq", "qual" and "library" are string variables, so we can now match them with e.g. seq =~ "(AT){10}". We have basic functions that operate on strings, interpreting them as arrays of characters. (This could be updated for B aux arrays and larger objects in the future.) Functions are length, min, max and avg. For example avg(qual) or length(seq). Note qual here is binary, so NOT the ASCII phred+33. I wouldn't advise trying to match it in a regexp. "qlen" was previously length of sequence, and not the value of cigar2qlen. This differs when we have CIGAR but seq is "*". Use the cigar2qlen definition so it is symmetric with our "rlen" variable. To permit the previous definition, we can now do "length(seq)". Note that min and max of empty string is NAN. This isn't treated as an error. This permits us to do "avg(qual) > 10" where some entries have seq and qual both as "*" (ie zero length). 
--- Makefile | 4 +- hts_expr.c | 154 ++++++++++++++++++++++++++++++++---- htslib.mk | 2 + htslib/hts.h | 4 +- htslib/hts_expr.h | 6 +- htslib/sam.h | 6 +- htslib_vars.mk | 4 +- sam.c | 87 +++++++++++++++++--- test/sam_filter/filter.tst | 9 +++ test/sam_filter/func1.out | 1 + test/sam_filter/func2.out | 1 + test/sam_filter/func3.out | 1 + test/sam_filter/func4.out | 1 + test/sam_filter/string5.out | 6 ++ test/sam_filter/string6.out | 9 +++ test/sam_filter/string7.out | 11 +++ test/test_expr.c | 2 +- 17 files changed, 270 insertions(+), 38 deletions(-) create mode 100644 test/sam_filter/func1.out create mode 100644 test/sam_filter/func2.out create mode 100644 test/sam_filter/func3.out create mode 100644 test/sam_filter/func4.out create mode 100644 test/sam_filter/string5.out create mode 100644 test/sam_filter/string6.out create mode 100644 test/sam_filter/string7.out diff --git a/Makefile b/Makefile index 659936e2b..840d9a2f8 100644 --- a/Makefile +++ b/Makefile @@ -343,10 +343,10 @@ hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_kn hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) -hts_expr.o hts_expr.pico: hts_expr.c $(htslib_hts_expr_h) config.h $(htslib_kstring_h) +hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) 
$(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) -sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) +sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) diff --git a/hts_expr.c b/hts_expr.c index 3d00ed339..1a1b9c52d 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -23,9 +23,10 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // TODO: -// - add maths functions. pow, sqrt, log, min, max, ? +// - add maths functions. pow, sqrt, log, ? // - ?: operator for conditionals? +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -34,7 +35,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include // may need configure rule for this +#include +#include #include "htslib/hts_expr.h" #include "textutils_internal.h" @@ -60,9 +62,9 @@ struct hts_filter_t { * Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) * Math ops: *, /, % [TODO: add // for floor division?] 
* Math ops: +, - - * Bit-wise: &, |, ^ [NB as 3 precedence levels, in that order] + * Bit-wise: &, ^, | [NB as 3 precedence levels, in that order] * Conditionals: >, >=, <, <=, - * Equality: ==, !=, =~ !~ + * Equality: ==, !=, =~, !~ * Boolean: &&, || */ @@ -76,11 +78,127 @@ static char *ws(char *str) { static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res); +/* + * Simple functions operating on strings only. + * length, min, max, avg. + * + * All return 0 on success, + * -1 on failure + */ +static int expr_func_length(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + res->is_str = 0; + res->d = res->s.l; + return 0; +} + +static int expr_func_min(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + int v = INT_MAX; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + if (v > x[l]) + v = x[l]; + + res->is_str = 0; + res->d = v == INT_MAX ? NAN : v; + + return 0; +} + +static int expr_func_max(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + int v = INT_MIN; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + if (v < x[l]) + v = x[l]; + + res->is_str = 0; + res->d = v == INT_MIN ? NAN : v; + + return 0; +} + +static int expr_func_avg(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + double v = 0; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + v += x[l]; + if (l) + v /= l; + + res->is_str = 0; + res->d = v; + + return 0; +} + +/* + * functions: FUNC(expr). + * Note for simplicity of parsing, the "(" must immediately follow FUNC, + * so "FUNC (x)" is invalid. 
+ */ +static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + int func_ok = -1; + switch (*str) { + case 'a': + if (strncmp(str, "avg(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_avg(res); + } + break; + + case 'l': + if (strncmp(str, "length(", 7) == 0) { + if (expression(filt, data, fn, str+7, end, res)) return -1; + func_ok = expr_func_length(res); + } + break; + + case 'm': + if (strncmp(str, "min(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_min(res); + } else if (strncmp(str, "max(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_max(res); + } + break; + } + + if (func_ok < 0) + return -1; + + str = ws(*end); + if (*str != ')') { + fprintf(stderr, "Missing ')'\n"); + return -1; + } + *end = str+1; + + return 0; +} + /* * simple_expr * : identifier * | constant - * // | string ? + * | string + * | func_expr * | '(' expression ')' */ static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, @@ -141,11 +259,15 @@ static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (*e != '"') return -1; *end = e+1; - } else if (fn) - // Look up variable. 
- return fn(data, str, end, res); - else + } else if (fn) { + // Try lookup as variable, if not as function + if (fn(data, str, end, res) == 0) + return 0; + else + return func_expr(filt, data, fn, str, end, res); + } else { return -1; + } } return 0; @@ -197,9 +319,9 @@ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, /* * mul_expr * : unary_expr ( - * unary_expr '*' unary_expr - * | unary_expr '/' unary_expr - * | unary_expr '%' unary_expr + * '*' unary_expr + * | '/' unary_expr + * | '%' unary_expr * )* */ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, @@ -238,8 +360,8 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, /* * add_expr * : mul_expr ( - * mul_expr '+' mul_expr - * | mul_expr '-' mul_expr + * '+' mul_expr + * | '-' mul_expr * )* */ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, @@ -331,8 +453,8 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, /* * bitor_expr - * : xor_expr - * | bitor_expr '|' xor_expr + * : bitxor_expr + * | bitor_expr '|' bitxor_expr */ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { diff --git a/htslib.mk b/htslib.mk index ceb9bf3c4..8cd659fc3 100644 --- a/htslib.mk +++ b/htslib.mk @@ -54,6 +54,7 @@ HTSLIB_PUBLIC_HEADERS = \ $(HTSDIR)/htslib/hts.h \ $(HTSDIR)/htslib/hts_defs.h \ $(HTSDIR)/htslib/hts_endian.h \ + $(HTSDIR)/htslib/hts_expr.h \ $(HTSDIR)/htslib/hts_log.h \ $(HTSDIR)/htslib/hts_os.h \ $(HTSDIR)/htslib/kbitset.h \ @@ -92,6 +93,7 @@ HTSLIB_ALL = \ $(HTSDIR)/hfile_s3.c \ $(HTSDIR)/hfile_s3_write.c \ $(HTSDIR)/hts.c \ + $(HTSDIR)/hts_expr.c \ $(HTSDIR)/hts_internal.h \ $(HTSDIR)/hts_os.c \ $(HTSDIR)/kfunc.c \ diff --git a/htslib/hts.h b/htslib/hts.h index 8360e2cf1..f4e06efdd 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -34,7 +34,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "hts_defs.h" #include "hts_log.h" -#include "hts_expr.h" #include "kstring.h" #include "kroundup.h" @@ -225,6 +224,7 @@ typedef struct htsFormat { struct hts_idx_t; typedef struct hts_idx_t hts_idx_t; +struct hts_filter_t; /** * @brief File handle returned by hts_open() etc. @@ -257,7 +257,7 @@ typedef struct htsFile { hts_idx_t *idx; const char *fnidx; struct sam_hdr_t *bam_header; - hts_filter_t *filter; + struct hts_filter_t *filter; } htsFile; // A combined thread pool and queue allocation size. diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h index c628d2e6f..d66a8edd8 100644 --- a/htslib/hts_expr.h +++ b/htslib/hts_expr.h @@ -25,7 +25,7 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTS_EXPR_H #define HTS_EXPR_H -#include +#include "kstring.h" #include "hts_defs.h" /// Holds a filter variable. This is also used to return the results. @@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */ * Take care when negating this. "[NM] != 0" will be true when * [NM] is absent, thus consider "[NM] && [NM] != 0". */ -typedef struct { +typedef struct hts_expr_val_t { char is_str; // Use .s vs .d char is_true; // Force true if even zero kstring_t s; // is_str and empty s permitted (eval as false) @@ -85,7 +85,7 @@ typedef int (hts_expr_sym_func)(void *data, char *str, char **end, * * The type of the returned result may be numeric of string, as defined by * the is_str member. It can also be explicitly defined to be true even - * for a null value. This may be used to check for the existance of + * for a null value. This may be used to check for the existence of * something, irrespective of whether that something evaluates to zero. 
*/ HTSLIB_EXPORT diff --git a/htslib/sam.h b/htslib/sam.h index 3c7b9b693..97aa4acf2 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1408,6 +1408,9 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; +// Forward declaration, see hts_expr.h for full. +struct hts_filter_t; + /// sam_passes_filter - Checks whether a record passes an hts_filter. /** @param h Pointer to the header structure previously read * @param b Pointer to the BAM record to be checked @@ -1415,7 +1418,8 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, * @return 1 if passes, 0 if not, and <0 on error. */ HTSLIB_EXPORT -int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt); +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, + struct hts_filter_t *filt); /************************************* *** Manipulating auxiliary fields *** diff --git a/htslib_vars.mk b/htslib_vars.mk index fa907f50f..3c2275d4a 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -29,10 +29,10 @@ htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h $(htslib_hts_defs_h) htslib_cram_h = $(HTSPREFIX)htslib/cram.h $(htslib_hts_defs_h) $(htslib_hts_h) $(htslib_sam_h) htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h $(htslib_hts_defs_h) $(htslib_hts_h) htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) -htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) $(htslib_hts_expr_h) +htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h htslib_hts_endian_h = $(HTSPREFIX)htslib/hts_endian.h -htslib_hts_expr_h = $(HTSPREFIX)htslib/hts_expr.h +htslib_hts_expr_h = $(HTSPREFIX)htslib/hts_expr.h $(htslib_kstring_h) $(htslib_hts_defs_h) htslib_hts_log_h = $(HTSPREFIX)htslib/hts_log.h 
$(htslib_hts_defs_h) htslib_hts_os_h = $(HTSPREFIX)htslib/hts_os.h $(htslib_hts_defs_h) htslib_kbitset_h = $(HTSPREFIX)htslib/kbitset.h diff --git a/sam.c b/sam.c index d044046eb..dbfc42e56 100644 --- a/sam.c +++ b/sam.c @@ -49,6 +49,7 @@ DEALINGS IN THE SOFTWARE. */ #include "sam_internal.h" #include "htslib/hfile.h" #include "htslib/hts_endian.h" +#include "htslib/hts_expr.h" #include "header.h" #include "htslib/khash.h" @@ -1098,6 +1099,34 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_po return ret; } +// Internal (for now) func used by bam_sym_lookup. This is copied from +// samtools/bam.c. +static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) +{ + const char *rg; + kstring_t lib = { 0, 0, NULL }; + rg = (char *)bam_aux_get(b, "RG"); + + if (!rg) + return NULL; + else + rg++; + + if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) + return NULL; + + static char LB_text[1024]; + int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; + + memcpy(LB_text, lib.s, len); + LB_text[len] = 0; + + free(lib.s); + + return LB_text; +} + + // Bam record pointer and SAM header combined typedef struct { const sam_hdr_t *h; @@ -1196,6 +1225,16 @@ static int bam_sym_lookup(void *data, char *str, char **end, } break; + case 'l': + if (memcmp(str, "library", 7) == 0) { + *end = str+7; + res->is_str = 1; + const char *lib = bam_get_library(hb->h, b); + kputs(lib ? 
lib : "", ks_clear(&res->s)); + return 0; + } + break; + case 'm': if (memcmp(str, "mapq", 4) == 0) { *end = str+4; @@ -1241,13 +1280,22 @@ static int bam_sym_lookup(void *data, char *str, char **end, case 'q': if (memcmp(str, "qlen", 4) == 0) { *end = str+4; - res->d = b->core.l_qseq; + res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); return 0; } else if (memcmp(str, "qname", 5) == 0) { *end = str+5; res->is_str = 1; kputs(bam_get_qname(b), ks_clear(&res->s)); return 0; + } else if (memcmp(str, "qual", 4) == 0) { + *end = str+4; + ks_clear(&res->s); + if (ks_resize(&res->s, b->core.l_qseq+1) < 0) + return -1; + memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); + res->s.l = b->core.l_qseq; + res->is_str = 1; + return 0; } break; @@ -1275,6 +1323,20 @@ static int bam_sym_lookup(void *data, char *str, char **end, } break; + case 's': + if (memcmp(str, "seq", 3) == 0) { + *end = str+3; + ks_clear(&res->s); + if (ks_resize(&res->s, b->core.l_qseq+1) < 0) + return -1; + nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); + res->s.s[b->core.l_qseq] = 0; + res->s.l = b->core.l_qseq; + res->is_str = 1; + return 0; + } + break; + case 't': if (memcmp(str, "tlen", 4) == 0) { *end = str+4; @@ -1362,7 +1424,7 @@ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t { htsFile *fp = fpv; bam1_t *b = bv; - int filtered, ret; + int pass_filter, ret; do { ret = cram_get_bam_seq(fp->fp.cram, &b); @@ -1376,10 +1438,14 @@ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg = b->core.pos; *end = bam_endpos(b); - filtered = sam_passes_filter(fp->bam_header, b, fp->filter); - if (filtered < 0) - return -2; - } while (filtered == 0); + if (fp->filter) { + pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); + if (pass_filter < 0) + return -2; + } else { + pass_filter = 1; + } + } while (pass_filter == 0); return ret; } @@ -3423,7 +3489,7 @@ int sam_set_threads(htsFile *fp, int nthreads) { } // Internal 
component of sam_read1 below -static int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { +static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret = bam_read1(fp->fp.bgzf, b); if (h && ret >= 0) { if (b->core.tid >= h->n_targets || b->core.tid < -1 || @@ -3436,7 +3502,7 @@ static int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { } // Internal component of sam_read1 below -static int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { +static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { int ret = cram_get_bam_seq(fp->fp.cram, b); if (ret < 0) return cram_eof(fp->fp.cram) ? -1 : -2; @@ -3448,7 +3514,7 @@ static int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { } // Internal component of sam_read1 below -static int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { +static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret; // Consume 1st line after header parsing as it wasn't using peek @@ -3558,10 +3624,9 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) ret = sam_read1_cram(fp, h, &b); break; - case sam: { + case sam: ret = sam_read1_sam(fp, h, b); break; - } case empty_format: errno = EPIPE; diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index 4831c8df8..effb77a26 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -37,6 +37,9 @@ P string1.out $tv -i 'filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam P string2.out $tv -i 'filter=rname=="CHROMOSOME_II"' ../ce#5b.sam P string3.out $tv -i 'filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam P string4.out $tv -i 'filter=cigar=~"D"' ../ce#1000.sam +P string5.out $tv -i 'filter=seq =~ "(AT){2}"' ../ce#1000.sam +P string6.out $tv -i 'filter=library=="x"' ../xx#rg.sam +P string7.out $tv -i 'filter=library!="x"' ../xx#rg.sam # Integer ops P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' @@ -44,3 +47,9 @@ P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' 
../ce#1000.sam |egrep -cv ' # Aux tags P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' + +# Functions. +P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@' +P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@' +P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@' +P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@' diff --git a/test/sam_filter/func1.out b/test/sam_filter/func1.out new file mode 100644 index 000000000..d00491fd7 --- /dev/null +++ b/test/sam_filter/func1.out @@ -0,0 +1 @@ +1 diff --git a/test/sam_filter/func2.out b/test/sam_filter/func2.out new file mode 100644 index 000000000..d81cc0710 --- /dev/null +++ b/test/sam_filter/func2.out @@ -0,0 +1 @@ +42 diff --git a/test/sam_filter/func3.out b/test/sam_filter/func3.out new file mode 100644 index 000000000..0cfbf0888 --- /dev/null +++ b/test/sam_filter/func3.out @@ -0,0 +1 @@ +2 diff --git a/test/sam_filter/func4.out b/test/sam_filter/func4.out new file mode 100644 index 000000000..103a99dd8 --- /dev/null +++ b/test/sam_filter/func4.out @@ -0,0 +1 @@ +604 diff --git a/test/sam_filter/string5.out b/test/sam_filter/string5.out new file mode 100644 index 000000000..4be462117 --- /dev/null +++ b/test/sam_filter/string5.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.9154510 16 CHROMOSOME_I 56 0 100M * 0 0 TTCATATGGGCAGGGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA ##############################@B;@?>>B1?BCBBC@>CDB>B@CA@CCAC=AA>>AC;CCACCCCCCC=CCCCCCCCCCCCBCCCCCCCC AS:i:-28 XS:i:-28 XN:i:0 XM:i:14 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C0C0T1A0G0C0C0T0A0A0G0C0C0T85 NM:i:14 diff --git a/test/sam_filter/string6.out b/test/sam_filter/string6.out new file mode 100644 index 000000000..c2766bd73 --- 
/dev/null +++ b/test/sam_filter/string6.out @@ -0,0 +1,9 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:xx LN:20 AS:? SP:? UR:? M5:bbf4de6d8497a119dda6e074521643dc +@RG ID:x1 SM:x1 +@RG ID:x2 SM:x2 LB:x PG:foo:bar PI:1111 +@PG ID:emacs PN:emacs VN:23.1.1 +@CO also test +@CO other headers +b1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** RG:Z:x2 +b2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** RG:Z:x2 diff --git a/test/sam_filter/string7.out b/test/sam_filter/string7.out new file mode 100644 index 000000000..8efe886b3 --- /dev/null +++ b/test/sam_filter/string7.out @@ -0,0 +1,11 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:xx LN:20 AS:? SP:? UR:? M5:bbf4de6d8497a119dda6e074521643dc +@RG ID:x1 SM:x1 +@RG ID:x2 SM:x2 LB:x PG:foo:bar PI:1111 +@PG ID:emacs PN:emacs VN:23.1.1 +@CO also test +@CO other headers +a1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** RG:Z:x1 +c1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** +a2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** RG:Z:x1 +c2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** diff --git a/test/test_expr.c b/test/test_expr.c index 258548b88..606a9b3b5 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -158,7 +158,7 @@ int test(void) { { 1, NULL, " (2*3)&7 > 4"}, { 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv - { 1, NULL, "((2*3)&7) > 4"}, // Python precendece equiv + { 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv { 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, { 1, "plugh", "magic"}, From cecf738a4a200547250fd9b22e4365af1a9610fb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 7 Jan 2021 09:37:05 +0000 Subject: [PATCH 033/114] Remove knet. (PR #1200) There's little benefit to keeping it (basically FTP and unsecured http), while curl is already required for htsget, refget (CRAM ref cache), AWS S3 and GCS protocols. Curl is available on all platforms we support. The public API in htslib/knetfile.h is retained for compatibility. The knet_* functions have been replaced by stubs that use hFILE internally. 
htslib/knetfile.h has HTS_DEPRECATED markers for these functions, so anyone compiling against the new code will get a warning that we intend to delete them in the future. The knet_tell macro does direct access of knetFile::offset so we cannot simply wrap up htell as we'd like. Similarly knet_fileno directly queries knetFile::fd. However the only use I've found of this was in old copies of bgzf and bcftools main which then did fstat on it so it's only filled out for local files. (That's good because getting it out of curl isn't supported by our code.) --- INSTALL | 10 +- Makefile | 7 +- NEWS | 6 + hfile.c | 80 +++- hfile_internal.h | 3 - hfile_libcurl.c | 2 +- hfile_net.c | 112 ------ htslib.mk | 3 - htslib/knetfile.h | 62 ++- htslib_vars.mk | 1 - knetfile.c | 645 -------------------------------- test/header_syms.pl | 7 +- test/maintainer/check_spaces.pl | 2 - 13 files changed, 125 insertions(+), 815 deletions(-) delete mode 100644 hfile_net.c delete mode 100644 knetfile.c diff --git a/INSTALL b/INSTALL index 3ee36d0cf..7ae91da33 100644 --- a/INSTALL +++ b/INSTALL @@ -41,9 +41,10 @@ a development ('-dev' or '-devel') package separate from the main library. Disabling libbzip2 and liblzma will make some CRAM files unreadable, so is not recommended. -Using libcurl provides HTSlib with better network protocol support, for -example it enables the use of https:// URLs. It is also required if -direct access to Amazon S3 or Google Cloud Storage is enabled. +Using libcurl provides HTSlib with network protocol support, for +example it enables the use of ftp://, http://, and https:// URLs. +It is also required if direct access to Amazon S3 or Google Cloud +Storage is enabled. Amazon S3 support requires an HMAC function to calculate a message authentication code. 
On MacOS, the CCHmac function from the standard @@ -131,8 +132,7 @@ various features and specify further optional external requirements: --enable-libcurl Use libcurl () to implement network access to - remote files via FTP, HTTP, HTTPS, etc. By default, HTSlib uses its - own simple networking code to provide access via FTP and HTTP only. + remote files via FTP, HTTP, HTTPS, etc. --enable-gcs Implement network access to Google Cloud Storage. By default or with diff --git a/Makefile b/Makefile index 840d9a2f8..eac4b809d 100644 --- a/Makefile +++ b/Makefile @@ -144,7 +144,6 @@ show-version: LIBHTS_OBJS = \ kfunc.o \ - knetfile.o \ kstring.o \ bcf_sr_sort.o \ bgzf.o \ @@ -152,7 +151,6 @@ LIBHTS_OBJS = \ faidx.o \ header.o \ hfile.o \ - hfile_net.o \ hts.o \ hts_expr.o \ hts_os.o\ @@ -334,12 +332,10 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) -knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) -hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h 
$(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) @@ -544,8 +540,7 @@ test-shlib-exports: header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt ( echo "Error: Found unexported symbols (listed above)" ; false ) # Extract symbols that should be exported from public headers using ctags -# Filter out macros in htslib/hts_defs.h, and knet_win32_ functions that -# aren't needed on non-Windows platforms. +# Filter out macros in htslib/hts_defs.h. header-exports.txt: test/header_syms.pl htslib/*.h test/header_syms.pl htslib/*.h | sort -u -o $@ diff --git a/NEWS b/NEWS index 2fd012bf9..658b7cad2 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,13 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* The knet networking code has been removed. It only supported the http + and ftp protocols, and a better and safer alternative using libcurl + has been available since release 1.3. If you need access to ftp:// and + http:// URLs, HTSlib should be built with libcurl support. (#1200) +* The old htslib/knetfile.h interfaces have been marked as deprecated. Any + code still using them should be updated to use hFILE instead. 
(#1200) Noteworthy changes in release 1.11 (22nd September 2020) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/hfile.c b/hfile.c index 8bbba5cf9..218520a7c 100644 --- a/hfile.c +++ b/hfile.c @@ -1061,7 +1061,6 @@ static int load_hfile_plugins() hfile_add_scheme_handler("data", &data); hfile_add_scheme_handler("file", &file); hfile_add_scheme_handler("preload", &preload); - init_add_plugin(NULL, hfile_plugin_init_net, "knetfile"); init_add_plugin(NULL, hfile_plugin_init_mem, "mem"); init_add_plugin(NULL, hfile_plugin_init_crypt4gh_needed, "crypt4gh-needed"); @@ -1213,3 +1212,82 @@ char *haddextension(struct kstring_t *buffer, const char *filename, kputs(trailing, buffer) >= 0) return buffer->s; else return NULL; } + + +/* + * ---------------------------------------------------------------------- + * Minimal stub functions for knet, added after the removal of + * hfile_net.c and knetfile.c. + * + * They exist purely for ABI compatibility, but are simply wrappers to + * hFILE. API should be compatible except knet_fileno (unused?). + * + * CULL THESE and knetfile.h at the next .so version bump. + */ +typedef struct knetFile_s { + // As per htslib/knetfile.h. Duplicated here as we don't wish to + // have any dependence on the deprecated knetfile.h interface, plus + // it's hopefully only temporary. + int type, fd; + int64_t offset; + char *host, *port; + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; + int64_t file_size; + char *path, *http_host; + + // Our local addition + hFILE *hf; +} knetFile; + +HTSLIB_EXPORT +knetFile *knet_open(const char *fn, const char *mode) { + knetFile *fp = calloc(1, sizeof(*fp)); + if (!fp) return NULL; + if (!(fp->hf = hopen(fn, mode))) { + free(fp); + return NULL; // hopen() failed; fp was freed, do not touch it below + } + + // FD backend is the only one implementing knet_fileno + fp->fd = fp->hf->backend == &fd_backend + ?
((hFILE_fd *)fp->hf)->fd + : -1; + + return fp; +} + +HTSLIB_EXPORT +knetFile *knet_dopen(int fd, const char *mode) { + knetFile *fp = calloc(1, sizeof(*fp)); + if (!fp) return NULL; + if (!(fp->hf = hdopen(fd, mode))) { + free(fp); + return NULL; // hdopen() failed; fp was freed, do not touch it below + } + fp->fd = fd; + return fp; +} + +HTSLIB_EXPORT +ssize_t knet_read(knetFile *fp, void *buf, size_t len) { + ssize_t r = hread(fp->hf, buf, len); + fp->offset += r>0?r:0; + return r; +} + +HTSLIB_EXPORT +off_t knet_seek(knetFile *fp, off_t off, int whence) { + off_t r = hseek(fp->hf, off, whence); + if (r >= 0) + fp->offset = r; + return r; +} + +HTSLIB_EXPORT +int knet_close(knetFile *fp) { + int r = hclose(fp->hf); + free(fp); + return r; +} diff --git a/hfile_internal.h b/hfile_internal.h index 386689626..70cc99c57 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -179,9 +179,6 @@ extern int hfile_plugin_init_s3(struct hFILE_plugin *self); extern int hfile_plugin_init_s3_write(struct hFILE_plugin *self); #endif -/* This one is never built as a separate plugin. */ -extern int hfile_plugin_init_net(struct hFILE_plugin *self); - // Callback to allow headers to be set in http connections. Currently used // to allow s3 to renew tokens when seeking. Kept internal for now, // although we may consider exposing it in the API later. diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 090db348a..2de7ccbd9 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -1221,7 +1221,7 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) err = curl_easy_setopt(fp->easy, CURLOPT_PRIVATE, fp); // Avoid many repeated CWD calls with FTP, instead requesting the filename - // by full path (as done in knet, but not strictly compliant with RFC1738). + // by full path (but not strictly compliant with RFC1738).
err |= curl_easy_setopt(fp->easy, CURLOPT_FTP_FILEMETHOD, CURLFTPMETHOD_NOCWD); if (mode == 'r') { diff --git a/hfile_net.c b/hfile_net.c deleted file mode 100644 index 5443b2240..000000000 --- a/hfile_net.c +++ /dev/null @@ -1,112 +0,0 @@ -/* hfile_net.c -- network backend for low-level input/output streams. - - Copyright (C) 2013-2015 Genome Research Ltd. - - Author: John Marshall - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
*/ - -#include - -#include -#include - -#include "hfile_internal.h" - -#include "htslib/knetfile.h" - -typedef struct { - hFILE base; - knetFile *netfp; -} hFILE_net; - -static int net_inited = 0; - -#ifdef _WIN32 -static void net_exit(void) -{ - knet_win32_destroy(); -} -#endif - -static int net_init(void) -{ -#ifdef _WIN32 - if (knet_win32_init() != 0) return -1; - - // In the unlikely event atexit() fails, it's better to succeed here and - // carry on and do the I/O; then eventually when the program exits, we'll - // merely have failed to clean up properly, as if we had aborted. - (void) atexit(net_exit); -#endif - - net_inited = 1; - return 0; -} - -static ssize_t net_read(hFILE *fpv, void *buffer, size_t nbytes) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_read(fp->netfp, buffer, nbytes); -} - -static off_t net_seek(hFILE *fpv, off_t offset, int whence) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_seek(fp->netfp, offset, whence); -} - -static int net_close(hFILE *fpv) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_close(fp->netfp); -} - -static const struct hFILE_backend net_backend = -{ - net_read, NULL, net_seek, NULL, net_close -}; - -hFILE *hopen_net(const char *filename, const char *mode) -{ - hFILE_net *fp; - - // Do any networking initialisation if this is the first use. - if (! 
net_inited) { if (net_init() < 0) return NULL; } - - fp = (hFILE_net *) hfile_init(sizeof (hFILE_net), mode, 0); - if (fp == NULL) return NULL; - - fp->netfp = knet_open(filename, mode); - if (fp->netfp == NULL) { hfile_destroy((hFILE *) fp); return NULL; } - - fp->base.backend = &net_backend; - return &fp->base; -} - -int hfile_plugin_init_net(struct hFILE_plugin *self) -{ - static const struct hFILE_scheme_handler handler = - { hopen_net, hfile_always_remote, "knetfile", 0 }; - - self->name = "knetfile"; - hfile_add_scheme_handler("http", &handler); - hfile_add_scheme_handler("ftp", &handler); - return 0; -} diff --git a/htslib.mk b/htslib.mk index 8cd659fc3..254c25797 100644 --- a/htslib.mk +++ b/htslib.mk @@ -62,7 +62,6 @@ HTSLIB_PUBLIC_HEADERS = \ $(HTSDIR)/htslib/khash.h \ $(HTSDIR)/htslib/khash_str2int.h \ $(HTSDIR)/htslib/klist.h \ - $(HTSDIR)/htslib/knetfile.h \ $(HTSDIR)/htslib/kseq.h \ $(HTSDIR)/htslib/ksort.h \ $(HTSDIR)/htslib/kstring.h \ @@ -89,7 +88,6 @@ HTSLIB_ALL = \ $(HTSDIR)/hfile.c \ $(HTSDIR)/hfile_gcs.c \ $(HTSDIR)/hfile_libcurl.c \ - $(HTSDIR)/hfile_net.c \ $(HTSDIR)/hfile_s3.c \ $(HTSDIR)/hfile_s3_write.c \ $(HTSDIR)/hts.c \ @@ -97,7 +95,6 @@ HTSLIB_ALL = \ $(HTSDIR)/hts_internal.h \ $(HTSDIR)/hts_os.c \ $(HTSDIR)/kfunc.c \ - $(HTSDIR)/knetfile.c \ $(HTSDIR)/kstring.c \ $(HTSDIR)/md5.c \ $(HTSDIR)/multipart.c \ diff --git a/htslib/knetfile.h b/htslib/knetfile.h index 87fba4adc..598ed379a 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -50,19 +50,20 @@ #define KNF_TYPE_FTP 2 #define KNF_TYPE_HTTP 3 +// Kept for API/ABI compatibility only. Do not use directly! 
typedef struct knetFile_s { - int type, fd; - int64_t offset; - char *host, *port; - - // the following are for FTP only - int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; - char *response, *retr, *size_cmd; - int64_t seek_offset; // for lazy seek - int64_t file_size; - - // the following are for HTTP only - char *path, *http_host; + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; } knetFile; #define knet_tell(fp) ((fp)->offset) @@ -72,35 +73,30 @@ typedef struct knetFile_s { extern "C" { #endif -#ifdef _WIN32 - int knet_win32_init(); - void knet_win32_destroy(); -#endif - HTSLIB_EXPORT - knetFile *knet_open(const char *fn, const char *mode); + knetFile *knet_open(const char *fn, const char *mode) HTS_DEPRECATED("Please use hopen instead"); - /* - This only works with local files. - */ + /* + This only works with local files. + */ HTSLIB_EXPORT - knetFile *knet_dopen(int fd, const char *mode); + knetFile *knet_dopen(int fd, const char *mode) HTS_DEPRECATED("Please use hdopen instead"); - /* - If ->is_ready==0, this routine updates ->fd; otherwise, it simply - reads from ->fd. - */ + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ HTSLIB_EXPORT - ssize_t knet_read(knetFile *fp, void *buf, size_t len); + ssize_t knet_read(knetFile *fp, void *buf, size_t len) HTS_DEPRECATED("Please use hread instead"); - /* - This routine only sets ->offset and ->is_ready=0. It does not - communicate with the FTP server. - */ + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. 
+ */ HTSLIB_EXPORT - off_t knet_seek(knetFile *fp, off_t off, int whence); + off_t knet_seek(knetFile *fp, off_t off, int whence) HTS_DEPRECATED("Please use hseek instead"); HTSLIB_EXPORT - int knet_close(knetFile *fp); + int knet_close(knetFile *fp) HTS_DEPRECATED("Please use hclose instead"); #ifdef __cplusplus } diff --git a/htslib_vars.mk b/htslib_vars.mk index 3c2275d4a..85835f872 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -40,7 +40,6 @@ htslib_kfunc_h = $(HTSPREFIX)htslib/kfunc.h $(htslib_hts_defs_h) htslib_khash_h = $(HTSPREFIX)htslib/khash.h $(htslib_kstring_h) $(htslib_kroundup_h) htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h) htslib_klist_h = $(HTSPREFIX)htslib/klist.h -htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h $(htslib_hts_defs_h) htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h diff --git a/knetfile.c b/knetfile.c deleted file mode 100644 index f0a608ea4..000000000 --- a/knetfile.c +++ /dev/null @@ -1,645 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2012-2014, 2017 Genome Research Ltd (GRL). - 2010 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Probably I will not do socket programming in the next few years and - therefore I decide to heavily annotate this file, for Linux and - Windows as well. -ac */ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef _WIN32 -#include -#include -#include -#include -#endif - -#include "htslib/knetfile.h" -#include "htslib/hts_log.h" - -/* In winsock.h, the type of a socket is SOCKET, which is: "typedef - * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed - * integer -1. In knetfile.c, I use "int" for socket type - * throughout. This should be improved to avoid confusion. - * - * In Linux/Mac, recv() and read() do almost the same thing. You can see - * in the header file that netread() is simply an alias of read(). In - * Windows, however, they are different and using recv() is mandatory. - */ - -/* This function tests if the file handler is ready for reading (or - * writing if is_read==0). */ -static int socket_wait(int fd, int is_read) -{ - fd_set fds, *fdr = 0, *fdw = 0; - struct timeval tv; - int ret; - tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out - FD_ZERO(&fds); - FD_SET(fd, &fds); - if (is_read) fdr = &fds; - else fdw = &fds; - ret = select(fd+1, fdr, fdw, 0, &tv); -#ifndef _WIN32 - if (ret == -1) perror("select"); -#else - if (ret == 0) - hts_log_warning("Select timed out"); - else if (ret == SOCKET_ERROR) - hts_log_error("Select returned error %d", WSAGetLastError()); -#endif - return ret; -} - -#ifndef _WIN32 -/* This function does not work with Windows due to the lack of - * getaddrinfo() in winsock. 
It is addapted from an example in "Beej's - * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ -# ifdef __SUNPRO_C -# pragma error_messages(off, E_END_OF_LOOP_CODE_NOT_REACHED) -# endif -static int socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) - - int ai_err, on = 1, fd; - struct linger lng = { 0, 0 }; - struct addrinfo hints, *res = 0; - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - /* In Unix/Mac, getaddrinfo() is the most convenient way to get - * server information. */ - if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) { hts_log_error("Can't resolve %s:%s: %s", host, port, gai_strerror(ai_err)); return -1; } - if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); - /* The following two setsockopt() are used by ftplib - * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they - * necessary. */ - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); - freeaddrinfo(res); - return fd; -} -# ifdef __SUNPRO_C -# pragma error_messages(off, E_END_OF_LOOP_CODE_NOT_REACHED) -# endif -#else -/* MinGW's printf has problem with "%lld" */ -char *int64tostr(char *buf, int64_t x) -{ - int cnt; - int i = 0; - do { - buf[i++] = '0' + x % 10; - x /= 10; - } while (x); - buf[i] = 0; - for (cnt = i, i = 0; i < cnt/2; ++i) { - int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; - } - return buf; -} - -int64_t strtoint64(const char *buf) -{ - int64_t x; - for (x = 0; *buf != '\0'; ++buf) - x = x * 10 + ((int64_t) *buf - 48); - return x; -} -/* In windows, the first thing is to establish the TCP connection. 
*/ -int knet_win32_init() -{ - WSADATA wsaData; - return WSAStartup(MAKEWORD(2, 2), &wsaData); -} -void knet_win32_destroy() -{ - WSACleanup(); -} -/* A slightly modfied version of the following function also works on - * Mac (and presummably Linux). However, this function is not stable on - * my Mac. It sometimes works fine but sometimes does not. Therefore for - * non-Windows OS, I do not use this one. */ -static SOCKET socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) \ - do { \ - hts_log_error("The %s operation returned error %d", func, WSAGetLastError()); \ - return -1; \ - } while (0) - - int on = 1; - SOCKET fd; - struct linger lng = { 0, 0 }; - struct sockaddr_in server; - struct hostent *hp = 0; - // open socket - if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - // get host info - if (isalpha(host[0])) hp = gethostbyname(host); - else { - struct in_addr addr; - addr.s_addr = inet_addr(host); - hp = gethostbyaddr((char*)&addr, 4, AF_INET); - } - if (hp == 0) __err_connect("gethost"); - // connect - server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); - server.sin_family= AF_INET; - server.sin_port = htons(atoi(port)); - if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); - // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) - return fd; -} -#endif - -static off_t my_netread(int fd, void *buf, off_t len) -{ - off_t rest = len, curr, l = 0; - /* recv() and read() may not read the required length of data with - * one call. They have to be called repeatedly. 
*/ - while (rest) { - if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading - curr = netread(fd, (void*)((char*)buf + l), rest); - /* According to the glibc manual, section 13.2, a zero returned - * value indicates end-of-file (EOF), which should mean that - * read() will not return zero if EOF has not been met but data - * are not immediately available. */ - if (curr == 0) break; - l += curr; rest -= curr; - } - return l; -} - -/************************* - * FTP specific routines * - *************************/ - -static int kftp_get_response(knetFile *ftp) -{ -#ifndef _WIN32 - unsigned char c; -#else - char c; -#endif - int n = 0; - char *p; - if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; - while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O - //fputc(c, stderr); - if (n >= ftp->max_response) { - ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; - ftp->response = (char*)realloc(ftp->response, ftp->max_response); - } - ftp->response[n++] = c; - if (c == '\n') { - if (n >= 4 && isdigit((int)((unsigned char) ftp->response[0])) && isdigit((int)((unsigned char) ftp->response[1])) && isdigit((int)((unsigned char) ftp->response[2])) - && ftp->response[3] != '-') break; - n = 0; - continue; - } - } - if (n < 2) return -1; - ftp->response[n-2] = 0; - return strtol(ftp->response, &p, 0); -} - -static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) -{ - if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing - int len = strlen(cmd); - if ( netwrite(ftp->ctrl_fd, cmd, len) != len ) return -1; - return is_get? 
kftp_get_response(ftp) : 0; -} - -static int kftp_pasv_prep(knetFile *ftp) -{ - char *p; - int v[6]; - kftp_send_cmd(ftp, "PASV\r\n", 1); - for (p = ftp->response; *p && *p != '('; ++p); - if (*p != '(') return -1; - ++p; - sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); - memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); - ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; - return 0; -} - - -static int kftp_pasv_connect(knetFile *ftp) -{ - char host[80], port[10]; - if (ftp->pasv_port == 0) { - hts_log_error("Must call kftp_pasv_prep() first"); - return -1; - } - sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); - sprintf(port, "%d", ftp->pasv_port); - ftp->fd = socket_connect(host, port); - if (ftp->fd == -1) return -1; - return 0; -} - -int kftp_connect(knetFile *ftp) -{ - ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); - if (ftp->ctrl_fd == -1) return -1; - kftp_get_response(ftp); - kftp_send_cmd(ftp, "USER anonymous\r\n", 1); - kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); - kftp_send_cmd(ftp, "TYPE I\r\n", 1); - return 0; -} - -int kftp_reconnect(knetFile *ftp) -{ - if (ftp->ctrl_fd != -1) { - netclose(ftp->ctrl_fd); - ftp->ctrl_fd = -1; - } - netclose(ftp->fd); - ftp->fd = -1; - return kftp_connect(ftp); -} - -// initialize ->type, ->host, ->retr and ->size -knetFile *kftp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p; - int l; - if (strstr(fn, "ftp://") != fn) return 0; - for (p = (char*)fn + 6; *p && *p != '/'; ++p); - if (*p != '/') return 0; - l = p - fn - 6; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_FTP; - fp->fd = -1; - /* the Linux/Mac version of socket_connect() also recognizes a port - * like "ftp", but the Windows version does not. 
*/ - fp->port = strdup("21"); - fp->host = (char*)calloc(l + 1, 1); - if (strchr(mode, 'c')) fp->no_reconnect = 1; - strncpy(fp->host, fn + 6, l); - fp->retr = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->retr, "RETR %s\r\n", p); - fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->size_cmd, "SIZE %s\r\n", p); - fp->seek_offset = 0; - return fp; -} -// place ->fd at offset off -int kftp_connect_file(knetFile *fp) -{ - int ret; - long long file_size; - if (fp->fd != -1) { - netclose(fp->fd); - if (fp->no_reconnect) kftp_get_response(fp); - } - kftp_pasv_prep(fp); - kftp_send_cmd(fp, fp->size_cmd, 1); -#ifndef _WIN32 - // If the file does not exist, the response will be "550 Could not get file - // size". Be silent on failure, hts_idx_load can be trying the existence of .csi or .tbi. - if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) return -1; -#else - const char *p = fp->response; - while (*p != ' ') ++p; - while (*p < '0' || *p > '9') ++p; - file_size = strtoint64(p); -#endif - fp->file_size = file_size; - if (fp->offset>=0) { - char tmp[32]; -#ifndef _WIN32 - sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); -#else - strcpy(tmp, "REST "); - int64tostr(tmp + 5, fp->offset); - strcat(tmp, "\r\n"); -#endif - kftp_send_cmd(fp, tmp, 1); - } - kftp_send_cmd(fp, fp->retr, 0); - kftp_pasv_connect(fp); - ret = kftp_get_response(fp); - if (ret != 150) { - hts_log_error("%s", fp->response); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - fp->is_ready = 1; - return 0; -} - - -/************************** - * HTTP specific routines * - **************************/ - -knetFile *khttp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p, *proxy, *q; - int l; - if (strstr(fn, "http://") != fn) return 0; - // set ->http_host - for (p = (char*)fn + 7; *p && *p != '/'; ++p); - l = p - fn - 7; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->http_host = (char*)calloc(l + 1, 1); - strncpy(fp->http_host, fn + 7, l); - 
fp->http_host[l] = 0; - for (q = fp->http_host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - // get http_proxy - proxy = getenv("http_proxy"); - // set ->host, ->port and ->path - if (proxy == 0) { - fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. - fp->port = strdup(*q? q : "80"); - fp->path = strdup(*p? p : "/"); - } else { - fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); - for (q = fp->host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - fp->port = strdup(*q? q : "80"); - fp->path = strdup(fn); - } - fp->type = KNF_TYPE_HTTP; - fp->ctrl_fd = fp->fd = -1; - fp->seek_offset = 0; - return fp; -} - -int khttp_connect_file(knetFile *fp) -{ - int ret, l = 0; - char *buf, *p; - if (fp->fd != -1) netclose(fp->fd); - fp->fd = socket_connect(fp->host, fp->port); - buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. - l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); - if (fp->offset != 0) l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); - l += sprintf(buf + l, "\r\n"); - if ( netwrite(fp->fd, buf, l) != l ) { free(buf); return -1; } - l = 0; - while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency - if (buf[l] == '\n' && l >= 3) - if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; - ++l; - } - buf[l] = 0; - if (l < 14) { // prematured header - free(buf); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - ret = strtol(buf + 8, &p, 0); // HTTP return code - if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file - off_t rest = fp->offset; - while (rest) { - off_t l = rest < 0x10000? 
rest : 0x10000; - rest -= my_netread(fp->fd, buf, l); - } - } else if (ret != 206 && ret != 200) { - // failed to open file - free(buf); - netclose(fp->fd); - switch (ret) { - case 401: errno = EPERM; break; - case 403: errno = EACCES; break; - case 404: errno = ENOENT; break; - case 407: errno = EPERM; break; - case 408: errno = ETIMEDOUT; break; - case 410: errno = ENOENT; break; - case 503: errno = EAGAIN; break; - case 504: errno = ETIMEDOUT; break; - default: errno = (ret >= 400 && ret < 500)? EINVAL : EIO; break; - } - fp->fd = -1; - return -1; - } - free(buf); - fp->is_ready = 1; - return 0; -} - -/******************** - * Generic routines * - ********************/ - -knetFile *knet_open(const char *fn, const char *mode) -{ - knetFile *fp = 0; - if (mode[0] != 'r') { - hts_log_error("Only mode \"r\" is supported"); - errno = ENOTSUP; - return 0; - } - if (strstr(fn, "ftp://") == fn) { - fp = kftp_parse_url(fn, mode); - if (fp == 0) return 0; - if (kftp_connect(fp) == -1) { - knet_close(fp); - return 0; - } - kftp_connect_file(fp); - } else if (strstr(fn, "http://") == fn) { - fp = khttp_parse_url(fn, mode); - if (fp == 0) return 0; - khttp_connect_file(fp); - } else { // local file -#ifdef _WIN32 - /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may - * be undefined on some systems, although it is defined on my - * Mac and the Linux I have tested on. 
*/ - int fd = open(fn, O_RDONLY | O_BINARY); -#else - int fd = open(fn, O_RDONLY); -#endif - if (fd == -1) { - perror("open"); - return 0; - } - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - fp->ctrl_fd = -1; - } - if (fp && fp->fd == -1) { - knet_close(fp); - return 0; - } - return fp; -} - -knetFile *knet_dopen(int fd, const char *mode) -{ - knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - return fp; -} - -ssize_t knet_read(knetFile *fp, void *buf, size_t len) -{ - off_t l = 0; - if (fp->fd == -1) return 0; - if (fp->type == KNF_TYPE_FTP) { - if (fp->is_ready == 0) { - if (!fp->no_reconnect) kftp_reconnect(fp); - kftp_connect_file(fp); - } - } else if (fp->type == KNF_TYPE_HTTP) { - if (fp->is_ready == 0) - khttp_connect_file(fp); - } - if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX - size_t rest = len; - ssize_t curr; - while (rest) { - do { - curr = read(fp->fd, (void*)((char*)buf + l), rest); - } while (curr < 0 && EINTR == errno); - if (curr < 0) return -1; - if (curr == 0) break; - l += curr; rest -= curr; - } - } else l = my_netread(fp->fd, buf, len); - fp->offset += l; - return l; -} - -off_t knet_seek(knetFile *fp, off_t off, int whence) -{ - if (whence == SEEK_SET && off == fp->offset) return 0; - if (fp->type == KNF_TYPE_LOCAL) { - /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. 
*/ - off_t offset = lseek(fp->fd, off, whence); - if (offset == -1) return -1; - fp->offset = offset; - return fp->offset; - } else if (fp->type == KNF_TYPE_FTP) { - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else if (whence == SEEK_END) fp->offset = fp->file_size + off; - else return -1; - fp->is_ready = 0; - return fp->offset; - } else if (fp->type == KNF_TYPE_HTTP) { - if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? - hts_log_error("SEEK_END is not supported for HTTP. Offset is unchanged"); - errno = ESPIPE; - return -1; - } - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else return -1; - fp->is_ready = 0; - return fp->offset; - } - errno = EINVAL; - hts_log_error("%s", strerror(errno)); - return -1; -} - -int knet_close(knetFile *fp) -{ - if (fp == 0) return 0; - if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific - if (fp->fd != -1) { - /* On Linux/Mac, netclose() is an alias of close(), but on - * Windows, it is an alias of closesocket(). 
*/ - if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); - else netclose(fp->fd); - } - free(fp->host); free(fp->port); - free(fp->response); free(fp->retr); // FTP specific - free(fp->path); free(fp->http_host); // HTTP specific - free(fp->size_cmd); - free(fp); - return 0; -} - -#ifdef KNETFILE_MAIN -int main(void) -{ - char *buf; - knetFile *fp; - int type = 4, l; -#ifdef _WIN32 - knet_win32_init(); -#endif - buf = calloc(0x100000, 1); - if (type == 0) { - fp = knet_open("knetfile.c", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 1) { // NCBI FTP, large file - fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); - knet_seek(fp, 2500000000ll, SEEK_SET); - l = knet_read(fp, buf, 255); - } else if (type == 2) { - fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 3) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 4) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); - knet_read(fp, buf, 10000); - knet_seek(fp, 20000, SEEK_SET); - knet_seek(fp, 10000, SEEK_SET); - l = knet_read(fp, buf+10000, 10000000) + 10000; - } - if (type != 4 && type != 1) { - knet_read(fp, buf, 255); - buf[255] = 0; - printf("%s\n", buf); - } else write(fileno(stdout), buf, l); - knet_close(fp); - free(buf); - return 0; -} -#endif diff --git a/test/header_syms.pl b/test/header_syms.pl index fc9cfa303..fe5128a78 100755 --- a/test/header_syms.pl +++ b/test/header_syms.pl @@ -31,7 +31,7 @@ # Roughly equivalent Exuberant-ctags command is: # ctags -f - -n -I HTS_RESULT_USED -I HTS_DEPRECATED+ -I HTS_FORMAT+ \ -# -I KS_ATTR_PRINTF+ -I knet_win32_destroy+ -I knet_win32_init+ +# -I KS_ATTR_PRINTF+ # Unfortunately this is not the default ctags on all platforms, hence this # script. 
@@ -45,8 +45,9 @@ GetOptions('show-processed' => \$show_processed); -# List of functions to strip from the output -my %ignore = map { $_ => 1 } qw(knet_win32_init knet_win32_destroy); +# List of functions to strip from the output. Currently empty, +# but this functionality is retained for potential future use. +my %ignore = map { $_ => 1 } qw( ); foreach my $file (@ARGV) { extract_symbols($file, $show_processed, \%ignore); diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl index 0daf24406..d5a53b1ed 100755 --- a/test/maintainer/check_spaces.pl +++ b/test/maintainer/check_spaces.pl @@ -45,14 +45,12 @@ sub check { my %allow_tabs = map { ("$root/$_", 1) } ( 'kfunc.c', -'knetfile.c', 'kstring.c', 'md5.c', 'htslib/khash.h', 'htslib/kseq.h', 'htslib/ksort.h', 'htslib/kstring.h', -'htslib/knetfile.h', 'htslib/klist.h', 'htslib/kbitset.h', 'os/rand.c', From a7f7abcb3e21646f15d4cb94d07920ddb828cf36 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 23 Sep 2019 16:19:36 +0100 Subject: [PATCH 034/114] Bug fix CRAM 2.1 output. In testing I spotted v2.1 output still uses the rANS codec, which is incorrect. It does this because rans is enabled when opening a file if the version is >= 3.0, but we set the version after opening the file and that didn't explicitly disable rans again. Io_lib's Scramble does these bits in reverse (set default vers and then open file) so worked fine. --- cram/cram_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 1904fa6cb..0a6814c9b 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -4890,8 +4890,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } fd->version = major*256 + minor; - if (CRAM_MAJOR_VERS(fd->version) >= 3) - fd->use_rans = 1; + fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ?
1 : 0; + break; } From f5203fc035aab279d29ed3054664429adefc7b1d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 16 Sep 2019 14:39:58 +0100 Subject: [PATCH 035/114] Use htscodecs as a submodule. The cram/rANS_static.[ch] implementation has been removed. Updates the build system to make the required parts of htscodecs. Co-Authored-By: Rob Davies --- .appveyor.yml | 1 + .cirrus.yml | 5 +- .gitignore | 1 + .gitmodules | 4 + .travis.yml | 1 + INSTALL | 14 + Makefile | 19 +- configure.ac | 35 +- cram/cram_io.c | 7 +- cram/rANS_byte.h | 352 ----------- cram/rANS_static.c | 912 ----------------------------- cram/rANS_static.h | 51 -- htscodecs | 1 + htscodecs.mk.in | 5 + htscodecs_bundled.mk | 3 + htscodecs_external.mk | 6 + htslib.mk | 13 +- test/maintainer/check_copyright.pl | 3 + test/maintainer/check_spaces.pl | 3 + 19 files changed, 107 insertions(+), 1329 deletions(-) create mode 100644 .gitmodules delete mode 100644 cram/rANS_byte.h delete mode 100644 cram/rANS_static.c delete mode 100644 cram/rANS_static.h create mode 160000 htscodecs create mode 100644 htscodecs.mk.in create mode 100644 htscodecs_bundled.mk create mode 100644 htscodecs_external.mk diff --git a/.appveyor.yml b/.appveyor.yml index 6f94944bc..f8944daef 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -33,6 +33,7 @@ build_script: - set HOME=. 
- set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% + - git submodule update --init --recursive - "sh -lc \"autoheader && autoconf && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" #build_script: diff --git a/.cirrus.yml b/.cirrus.yml index 6b5ff0b98..d4256b11a 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -25,6 +25,7 @@ libdeflate_template: &LIBDEFLATE compile_template: &COMPILE compile_script: | + git submodule update --init --recursive if test "x$USE_LIBDEFLATE" = "xyes"; then CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' else @@ -132,7 +133,7 @@ centos_task: install_script: | yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \ bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \ - diffutils + diffutils git << : *COMPILE << : *TEST @@ -158,7 +159,7 @@ macosx_task: USE_LIBDEFLATE: yes package_install_script: - - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz + - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git << : *LIBDEFLATE << : *COMPILE diff --git a/.gitignore b/.gitignore index 76be407b5..0855b6a88 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ hfile_*.so hts-object-files htslib_static.mk +htscodecs.mk cyg*.dll lib*.a diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..cb6f98d42 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "htscodecs"] + path = htscodecs + url = https://github.com/samtools/htscodecs.git + fetchRecurseSubmodules = true diff --git a/.travis.yml b/.travis.yml index 9c604d52c..d85224133 100644 --- a/.travis.yml +++ b/.travis.yml @@ -83,6 +83,7 @@ before_script: popd && \ popd fi + git submodule update --init --recursive script: - | diff --git a/INSTALL b/INSTALL index 7ae91da33..92a358a73 100644 --- a/INSTALL +++ b/INSTALL @@ -58,6 +58,20 @@ test temporary directory with e.g.: make check TEST_OPTS="-t C:/msys64/tmp/_" Whilst 
the code may work on Windows with other environments, these have not be verified. +Update htscodecs submodule +========================== + +Note that this section only applies to git checkouts. If you're building +from a release tar file, you can skip this section. + +Some parts of HTSlib are provided by the external "htscodecs" project. This +is included as a submodule. When building from the git repository, +either clone the project using "git clone -r", or run: + + git submodule update --init --recursive + +to ensure the correct version of the submodule is present. + Building Configure ================== diff --git a/Makefile b/Makefile index eac4b809d..53a39d13a 100644 --- a/Makefile +++ b/Makefile @@ -106,8 +106,15 @@ all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \ ALL_CPPFLAGS = -I. $(CPPFLAGS) +# Usually htscodecs.mk is generated by running configure or config.status, +# but if those aren't used create a default here. +htscodecs.mk: + echo '# Default htscodecs.mk generated by Makefile' > $@ + echo 'include $$(HTSPREFIX)htscodecs_bundled.mk' >> $@ + HTSPREFIX = include htslib_vars.mk +include htscodecs.mk # If not using GNU make, you need to copy the version number from version.sh # into here. 
@@ -178,8 +185,8 @@ LIBHTS_OBJS = \ cram/mFILE.o \ cram/open_trace_file.o \ cram/pooled_alloc.o \ - cram/rANS_static.o \ cram/string_alloc.o \ + $(HTSCODECS_OBJS) \ $(NONCONFIGURE_OBJS) # Without configure we wish to have a rich set of default figures, @@ -205,6 +212,9 @@ sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) +htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h + +htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h # To be effective, config.mk needs to appear after most Makefile variables are # set but before most rules appear, so that it can both use previously-set @@ -364,15 +374,15 @@ cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) 
$(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/pooled_alloc.h $(cram_misc_h) -cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c config.h cram/rANS_static.h cram/rANS_byte.h cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) +htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread @@ -605,6 +615,7 @@ testclean: mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h + -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico -rm -f hts-object-files clean: mostlyclean clean-$(SHLIB_FLAVOUR) @@ -612,7 +623,7 @@ clean: mostlyclean clean-$(SHLIB_FLAVOUR) distclean maintainer-clean: clean -rm -f config.cache config.h config.log config.mk config.status - -rm -f TAGS *.pc.tmp *-uninstalled.pc htslib_static.mk + -rm -f TAGS *.pc.tmp *-uninstalled.pc htslib_static.mk htscodecs.mk -rm -rf autom4te.cache clean-so: diff --git a/configure.ac b/configure.ac index e5e64496b..890d7aa31 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2019 Genome Research Ltd. +# Copyright (C) 2015-2020 Genome Research Ltd. # # Author: John Marshall # @@ -34,7 +34,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2018 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020 Genome Research Ltd. 
This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) @@ -108,6 +108,12 @@ AC_ARG_ENABLE([plugins], [], [enable_plugins=no]) AC_SUBST(enable_plugins) +AC_ARG_WITH([external-htscodecs], + [AS_HELP_STRING([--with-external-htscodecs], + [get htscodecs functions from a shared library])], + [], [with_external_htscodecs=no]) +AC_SUBST(with_external_htscodecs) + AC_ARG_WITH([libdeflate], [AS_HELP_STRING([--with-libdeflate], [use libdeflate for faster crc and deflate algorithms])], @@ -280,6 +286,29 @@ produced elsewhere unreadable) or resolve this error to build HTSlib.]) static_LIBS="$static_LIBS -llzma" fi +AS_IF([test "x$with_external_htscodecs" != "xno"], + [libhtscodecs=ok + AC_CHECK_HEADER([htscodecs/rANS_static4x16.h],[], + [libhtscodecs='missing header'],[;]) + AC_CHECK_LIB([htscodecs],[rans_compress_bound_4x16], + [:],[libhtscodecs='missing library']) + AS_IF([test "$libhtscodecs" = "ok"], + [AC_DEFINE([HAVE_EXTERNAL_LIBHTSCODECS], 1, [Define if using an external libhtscodecs]) + LIBS="-lhtscodecs $LIBS" + private_LIBS="-lhtscodecs $private_LIBS" + static_LIBS="-lhtscodecs $static_LIBS" + selected_htscodecs_mk="htscodecs_external.mk"], + [MSG_ERROR([libhtscodecs development files not found: $libhtscodecs + +You asked to use an external htscodecs library, but do not have the +required header / library files. You either need to supply these and +if necessary set CPPFLAGS and LDFLAGS so the compiler can find them; +or configure using --without-external-htscodecs to build the required +functions from the htscodecs submodule. 
+])])], + [selected_htscodecs_mk="htscodecs_bundled.mk"]) +AC_SUBST([selected_htscodecs_mk]) + AS_IF([test "x$with_libdeflate" != "xno"], [libdeflate=ok AC_CHECK_HEADER([libdeflate.h],[],[libdeflate='missing header'],[;]) @@ -421,5 +450,5 @@ AC_SUBST([private_LIBS]) AC_SUBST([static_LDFLAGS]) AC_SUBST([static_LIBS]) -AC_CONFIG_FILES([config.mk htslib.pc.tmp:htslib.pc.in]) +AC_CONFIG_FILES([config.mk htscodecs.mk htslib.pc.tmp:htslib.pc.in]) AC_OUTPUT diff --git a/cram/cram_io.c b/cram/cram_io.c index 0a6814c9b..d5ad50000 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -73,7 +73,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "os.h" #include "../htslib/hts.h" #include "open_trace_file.h" -#include "rANS_static.h" + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include <htscodecs/rANS_static.h> +#else +#include "../htscodecs/htscodecs/rANS_static.h" +#endif //#define REF_DEBUG diff --git a/cram/rANS_byte.h b/cram/rANS_byte.h deleted file mode 100644 index f8bcae248..000000000 --- a/cram/rANS_byte.h +++ /dev/null @@ -1,352 +0,0 @@ -/* rans_byte.h originally from https://github.com/rygorous/ryg_rans - * - * This is a public-domain implementation of several rANS variants. rANS is an - * entropy coder from the ANS family, as described in Jarek Duda's paper - * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540). - */ - -/*-------------------------------------------------------------------------- */ - -// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014 -// -// Not intended to be "industrial strength"; just meant to illustrate the general -// idea. - -#ifndef RANS_BYTE_HEADER -#define RANS_BYTE_HEADER - -#include <stdint.h> - -#ifdef assert -#define RansAssert assert -#else -#define RansAssert(x) -#endif - -// READ ME FIRST: -// -// This is designed like a typical arithmetic coder API, but there's three -// twists you absolutely should be aware of before you start hacking: -// -// 1.
You need to encode data in *reverse* - last symbol first. rANS works -// like a stack: last in, first out. -// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give -// it a pointer to the *end* of your buffer (exclusive), and it will -// slowly move towards the beginning as more bytes are emitted. -// 3. Unlike basically any other entropy coder implementation you might -// have used, you can interleave data from multiple independent rANS -// encoders into the same bytestream without any extra signaling; -// you can also just write some bytes by yourself in the middle if -// you want to. This is in addition to the usual arithmetic encoder -// property of being able to switch models on the fly. Writing raw -// bytes can be useful when you have some data that you know is -// incompressible, and is cheaper than going through the rANS encode -// function. Using multiple rANS coders on the same byte stream wastes -// a few bytes compared to using just one, but execution of two -// independent encoders can happen in parallel on superscalar and -// Out-of-Order CPUs, so this can be *much* faster in tight decoding -// loops. -// -// This is why all the rANS functions take the write pointer as an -// argument instead of just storing it in some context struct. - -// -------------------------------------------------------------------------- - -// L ('l' in the paper) is the lower bound of our normalization interval. -// Between this and our byte-aligned emission, we use 31 (not 32!) bits. -// This is done intentionally because exact reciprocals for 31-bit uints -// fit in 32-bit uints: this permits some optimizations during encoding. -#define RANS_BYTE_L (1u << 23) // lower bound of our normalization interval - -// State for a rANS encoder. Yep, that's all there is to it. -typedef uint32_t RansState; - -// Initialize a rANS encoder. -static inline void RansEncInit(RansState* r) -{ - *r = RANS_BYTE_L; -} - -// Renormalize the encoder. Internal function. 
-static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits) -{ - uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; // this turns into a shift. - if (x >= x_max) { - uint8_t* ptr = *pptr; - do { - *--ptr = (uint8_t) (x & 0xff); - x >>= 8; - } while (x >= x_max); - *pptr = ptr; - } - return x; -} - -// Encodes a single symbol with range start "start" and frequency "freq". -// All frequencies are assumed to sum to "1 << scale_bits", and the -// resulting bytes get written to ptr (which is updated). -// -// NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from -// beginning to end! Likewise, the output bytestream is written *backwards*: -// ptr starts pointing at the end of the output buffer and keeps decrementing. -static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - // renormalize - RansState x = RansEncRenorm(*r, pptr, freq, scale_bits); - - // x = C(s,x) - *r = ((x / freq) << scale_bits) + (x % freq) + start; -} - -// Flushes the rANS encoder. -static inline void RansEncFlush(RansState* r, uint8_t** pptr) -{ - uint32_t x = *r; - uint8_t* ptr = *pptr; - - ptr -= 4; - ptr[0] = (uint8_t) (x >> 0); - ptr[1] = (uint8_t) (x >> 8); - ptr[2] = (uint8_t) (x >> 16); - ptr[3] = (uint8_t) (x >> 24); - - *pptr = ptr; -} - -// Initializes a rANS decoder. -// Unlike the encoder, the decoder works forwards as you'd expect. -static inline void RansDecInit(RansState* r, uint8_t** pptr) -{ - uint32_t x; - uint8_t* ptr = *pptr; - - x = ((uint32_t) ptr[0]) << 0; - x |= ((uint32_t) ptr[1]) << 8; - x |= ((uint32_t) ptr[2]) << 16; - x |= ((uint32_t) ptr[3]) << 24; - ptr += 4; - - *pptr = ptr; - *r = x; -} - -// Returns the current cumulative frequency (map it to a symbol yourself!) 
-static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits) -{ - return *r & ((1u << scale_bits) - 1); -} - -// Advances in the bit stream by "popping" a single symbol with range start -// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits", -// and the resulting bytes get written to ptr (which is updated). -static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - uint32_t mask = (1u << scale_bits) - 1; - - // s, x = D(x) - uint32_t x = *r; - x = freq * (x >> scale_bits) + (x & mask) - start; - - // renormalize - if (x < RANS_BYTE_L) { - uint8_t* ptr = *pptr; - do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L); - *pptr = ptr; - } - - *r = x; -} - -// -------------------------------------------------------------------------- - -// That's all you need for a full encoder; below here are some utility -// functions with extra convenience or optimizations. - -// Encoder symbol description -// This (admittedly odd) selection of parameters was chosen to make -// RansEncPutSymbol as cheap as possible. -typedef struct { - uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval - uint32_t rcp_freq; // Fixed-point reciprocal frequency - uint32_t bias; // Bias - uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq - uint16_t rcp_shift; // Reciprocal shift -} RansEncSymbol; - -// Decoder symbols are straightforward. -typedef struct { - uint16_t start; // Start of range. - uint16_t freq; // Symbol frequency. -} RansDecSymbol; - -// Initializes an encoder symbol to start "start" and frequency "freq" -static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - RansAssert(scale_bits <= 16); - RansAssert(start <= (1u << scale_bits)); - RansAssert(freq <= (1u << scale_bits) - start); - - // Say M := 1 << scale_bits. 
- // - // The original encoder does: - // x_new = (x/freq)*M + start + (x%freq) - // - // The fast encoder does (schematically): - // q = mul_hi(x, rcp_freq) >> rcp_shift (division) - // r = x - q*freq (remainder) - // x_new = q*M + bias + r (new x) - // plugging in r into x_new yields: - // x_new = bias + x + q*(M - freq) - // =: bias + x + q*cmpl_freq (*) - // - // and we can just precompute cmpl_freq. Now we just need to - // set up our parameters such that the original encoder and - // the fast encoder agree. - - s->x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; - s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq); - if (freq < 2) { - // freq=0 symbols are never valid to encode, so it doesn't matter what - // we set our values to. - // - // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately, - // our fixed-point reciprocal approximation can only multiply by values - // smaller than 1. - // - // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0. - // This gives: - // q = mul_hi(x, rcp_freq) >> rcp_shift - // = mul_hi(x, (1<<32) - 1)) >> 0 - // = floor(x - x/(2^32)) - // = x - 1 if 1 <= x < 2^32 - // and we know that x>0 (x=0 is never in a valid normalization interval). - // - // So we now need to choose the other parameters such that - // x_new = x*M + start - // plug it in: - // x*M + start (desired result) - // = bias + x + q*cmpl_freq (*) - // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq) - // = bias + 1 + (x - 1)*M - // = x*M + (bias + 1 - M) - // - // so we have start = bias + 1 - M, or equivalently - // bias = start + M - 1. 
- s->rcp_freq = ~0u; - s->rcp_shift = 0; - s->bias = start + (1 << scale_bits) - 1; - } else { - // Alverson, "Integer Division using reciprocals" - // shift=ceil(log2(freq)) - uint32_t shift = 0; - while (freq > (1u << shift)) - shift++; - - s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq); - s->rcp_shift = shift - 1; - - // With these values, 'q' is the correct quotient, so we - // have bias=start. - s->bias = start; - } - - s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol -} - -// Initialize a decoder symbol to start "start" and frequency "freq" -static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq) -{ - RansAssert(start <= (1 << 16)); - RansAssert(freq <= (1 << 16) - start); - s->start = (uint16_t) start; - s->freq = (uint16_t) freq; -} - -// Encodes a given symbol. This is faster than straight RansEnc since we can do -// multiplications instead of a divide. -// -// See RansEncSymbolInit for a description of how this works. -static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) -{ - RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 - - // renormalize - uint32_t x = *r; - uint32_t x_max = sym->x_max; - - if (x >= x_max) { - uint8_t* ptr = *pptr; - do { - *--ptr = (uint8_t) (x & 0xff); - x >>= 8; - } while (x >= x_max); - *pptr = ptr; - } - - // x = C(s,x) - // NOTE: written this way so we get a 32-bit "multiply high" when - // available. If you're on a 64-bit platform with cheap multiplies - // (e.g. x64), just bake the +32 into rcp_shift. - //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; - - // The extra >>32 has already been added to RansEncSymbolInit - uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); - *r = x + sym->bias + q * sym->cmpl_freq; -} - -// Equivalent to RansDecAdvance that takes a symbol. 
-static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits) -{ - RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits); -} - -// Advances in the bit stream by "popping" a single symbol with range start -// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits". -// No renormalization or output happens. -static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - uint32_t mask = (1u << scale_bits) - 1; - - // s, x = D(x) - uint32_t x = *r; - *r = freq * (x >> scale_bits) + (x & mask) - start; -} - -// Equivalent to RansDecAdvanceStep that takes a symbol. -static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits) -{ - RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits); -} - -// Renormalize. -static inline void RansDecRenorm(RansState* r, uint8_t** pptr) -{ - // renormalize - uint32_t x = *r; - - if (x < RANS_BYTE_L) { - uint8_t* ptr = *pptr; - x = (x << 8) | *ptr++; - if (x < RANS_BYTE_L) - x = (x << 8) | *ptr++; - *pptr = ptr; - } - - *r = x; -} - -// Renormalize, with extra checks for falling off the end of the input. -static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_end) -{ - uint32_t x = *r; - uint8_t* ptr = *pptr; - if (x >= RANS_BYTE_L || ptr >= ptr_end) return; - x = (x << 8) | *ptr++; - if (x < RANS_BYTE_L && ptr < ptr_end) - x = (x << 8) | *ptr++; - *pptr = ptr; - *r = x; -} - - -#endif // RANS_BYTE_HEADER diff --git a/cram/rANS_static.c b/cram/rANS_static.c deleted file mode 100644 index 584f8b561..000000000 --- a/cram/rANS_static.c +++ /dev/null @@ -1,912 +0,0 @@ -/* - * Copyright (c) 2014-2019 Genome Research Ltd. - * Author(s): James Bonfield - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger - * Institute nor the names of its contributors may be used to endorse - * or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH - * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Author: James Bonfield, Wellcome Trust Sanger Institute. 2014 - */ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rANS_static.h" -#include "rANS_byte.h" - -#define TF_SHIFT 12 -#define TOTFREQ (1<<TF_SHIFT) - -#define ABS(a) ((a)>0?(a):-(a)) -#ifndef BLK_SIZE -# define BLK_SIZE 1024*1024 -#endif - -// Room to allow for expanded BLK_SIZE on worst case compression.
-#define BLK_SIZE2 ((int)(1.05*BLK_SIZE)) - -/*----------------------------------------------------------------------------- - * Memory to memory compression functions. - * - * These are original versions without any manual loop unrolling. They - * are easier to understand, but can be up to 2x slower. - */ - -unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9); - unsigned char *cp, *out_end; - RansEncSymbol syms[256]; - RansState rans0, rans1, rans2, rans3; - uint8_t* ptr; - int F[256] = {0}, i, j, tab_size, rle, x, fsum = 0; - int m = 0, M = 0; - uint64_t tr; - - if (!out_buf) - return NULL; - - ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; - - // Compute statistics - for (i = 0; i < in_size; i++) { - F[in[i]]++; - } - tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size; - normalise_harder: - // Normalise so T[i] == TOTFREQ - for (fsum = m = M = j = 0; j < 256; j++) { - if (!F[j]) - continue; - - if (m < F[j]) - m = F[j], M = j; - - if ((F[j] = (F[j]*tr)>>31) == 0) - F[j] = 1; - fsum += F[j]; - } - - fsum++; - if (fsum < TOTFREQ) { - F[M] += TOTFREQ-fsum; - } else if (fsum-TOTFREQ > F[M]/2) { - // Corner case to avoid excessive frequency reduction - tr = 2104533975; goto normalise_harder; // equiv to *0.98. - } else { - F[M] -= fsum-TOTFREQ; - } - - //printf("F[%d]=%d\n", M, F[M]); - assert(F[M]>0); - - // Encode statistics. 
- cp = out_buf+9; - - for (x = rle = j = 0; j < 256; j++) { - if (F[j]) { - // j - if (rle) { - rle--; - } else { - *cp++ = j; - if (!rle && j && F[j-1]) { - for(rle=j+1; rle<256 && F[rle]; rle++) - ; - rle -= j+1; - *cp++ = rle; - } - //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]); - } - - // F[j] - if (F[j]<128) { - *cp++ = F[j]; - } else { - *cp++ = 128 | (F[j]>>8); - *cp++ = F[j]&0xff; - } - RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); - x += F[j]; - } - } - *cp++ = 0; - - //write(1, out_buf+4, cp-(out_buf+4)); - tab_size = cp-out_buf; - - RansEncInit(&rans0); - RansEncInit(&rans1); - RansEncInit(&rans2); - RansEncInit(&rans3); - - switch (i=(in_size&3)) { - case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); // fall through - case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); // fall through - case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); // fall through - case 0: - break; - } - for (i=(in_size &~3); i>0; i-=4) { - RansEncSymbol *s3 = &syms[in[i-1]]; - RansEncSymbol *s2 = &syms[in[i-2]]; - RansEncSymbol *s1 = &syms[in[i-3]]; - RansEncSymbol *s0 = &syms[in[i-4]]; - - RansEncPutSymbol(&rans3, &ptr, s3); - RansEncPutSymbol(&rans2, &ptr, s2); - RansEncPutSymbol(&rans1, &ptr, s1); - RansEncPutSymbol(&rans0, &ptr, s0); - } - - RansEncFlush(&rans3, &ptr); - RansEncFlush(&rans2, &ptr); - RansEncFlush(&rans1, &ptr); - RansEncFlush(&rans0, &ptr); - - // Finalise block size and return it - *out_size = (out_end - ptr) + tab_size; - - cp = out_buf; - - *cp++ = 0; // order - *cp++ = ((*out_size-9)>> 0) & 0xff; - *cp++ = ((*out_size-9)>> 8) & 0xff; - *cp++ = ((*out_size-9)>>16) & 0xff; - *cp++ = ((*out_size-9)>>24) & 0xff; - - *cp++ = (in_size>> 0) & 0xff; - *cp++ = (in_size>> 8) & 0xff; - *cp++ = (in_size>>16) & 0xff; - *cp++ = (in_size>>24) & 0xff; - - memmove(out_buf + tab_size, ptr, out_end-ptr); - - return out_buf; -} - -typedef struct { - unsigned char R[TOTFREQ]; -} ari_decoder; - -unsigned char 
*rans_uncompress_O0(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Load in the static tables */ - unsigned char *cp = in + 9; - unsigned char *cp_end = in + in_size; - int i, j, x, rle; - unsigned int out_sz, in_sz; - char *out_buf; - ari_decoder D; - RansDecSymbol syms[256]; - - if (in_size < 26) // Need at least this many bytes just to start - return NULL; - - if (*in++ != 0) // Order-0 check - return NULL; - - in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | - (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); - out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | - (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); - if (in_sz != in_size-9) - return NULL; - - if (out_sz >= INT_MAX) - return NULL; // protect against some overflow cases - - // Precompute reverse lookup of frequency. - rle = x = 0; - j = *cp++; - do { - int F, C; - if (cp > cp_end - 16) return NULL; // Not enough input bytes left - if ((F = *cp++) >= 128) { - F &= ~128; - F = ((F & 127) << 8) | *cp++; - } - C = x; - - RansDecSymbolInit(&syms[j], C, F); - - /* Build reverse lookup table */ - if (x + F > TOTFREQ) - return NULL; - memset(&D.R[x], j, F); - - x += F; - - if (!rle && j+1 == *cp) { - j = *cp++; - rle = *cp++; - } else if (rle) { - rle--; - j++; - if (j > 255) - return NULL; - } else { - j = *cp++; - } - } while(j); - - if (x < TOTFREQ-1 || x > TOTFREQ) - return NULL; - if (x < TOTFREQ) // historically we fill 4095, not 4096 - D.R[x] = D.R[x-1]; - - if (cp > cp_end - 16) return NULL; // Not enough input bytes left - - RansState rans0, rans1, rans2, rans3; - uint8_t *ptr = cp; - RansDecInit(&rans0, &ptr); - RansDecInit(&rans1, &ptr); - RansDecInit(&rans2, &ptr); - RansDecInit(&rans3, &ptr); - - out_buf = malloc(out_sz); - if (!out_buf) - return NULL; - - int out_end = (out_sz&~3); - - RansState R[4]; - R[0] = rans0; - R[1] = rans1; - R[2] = rans2; - R[3] = rans3; - uint32_t mask = (1u << TF_SHIFT)-1; - - for (i=0; i < out_end; i+=4) { - 
uint32_t m[4] = {R[0] & mask, - R[1] & mask, - R[2] & mask, - R[3] & mask}; - uint8_t c[4] = {D.R[m[0]], - D.R[m[1]], - D.R[m[2]], - D.R[m[3]]}; - out_buf[i+0] = c[0]; - out_buf[i+1] = c[1]; - out_buf[i+2] = c[2]; - out_buf[i+3] = c[3]; - - // In theory all TOTFREQ elements of D.R are filled out, but it's - // possible this may not be true (invalid input). We could - // check with x == TOTFREQ after filling out D.R matrix, but - // for historical reasons this sums to TOTFREQ-1 leaving one - // byte in D.R uninitialised. Or we could check here that - // syms[c[0..3]].freq > 0 and initialising syms, but that is - // slow. - // - // We take the former approach and accept a potential for garbage in - // -> garbage out in the rare 1 in TOTFREQ case as the overhead of - // continuous validation of freq > 0 is steep on this tight loop. - - // RansDecAdvanceSymbolStep(&R[0], &syms[c[0]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[1], &syms[c[1]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[2], &syms[c[2]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[3], &syms[c[3]], TF_SHIFT); - R[0] = syms[c[0]].freq * (R[0]>>TF_SHIFT); - R[0] += m[0] - syms[c[0]].start; - R[1] = syms[c[1]].freq * (R[1]>>TF_SHIFT); - R[1] += m[1] - syms[c[1]].start; - R[2] = syms[c[2]].freq * (R[2]>>TF_SHIFT); - R[2] += m[2] - syms[c[2]].start; - R[3] = syms[c[3]].freq * (R[3]>>TF_SHIFT); - R[3] += m[3] - syms[c[3]].start; - - if (ptr < cp_end - 8) { // Each renorm reads no more than 2 bytes - RansDecRenorm(&R[0], &ptr); - RansDecRenorm(&R[1], &ptr); - RansDecRenorm(&R[2], &ptr); - RansDecRenorm(&R[3], &ptr); - } else { - RansDecRenormSafe(&R[0], &ptr, cp_end); - RansDecRenormSafe(&R[1], &ptr, cp_end); - RansDecRenormSafe(&R[2], &ptr, cp_end); - RansDecRenormSafe(&R[3], &ptr, cp_end); - } - } - - switch(out_sz&3) { - case 3: - out_buf[out_end+2] = D.R[RansDecGet(&R[2], TF_SHIFT)]; - // fall through - case 2: - out_buf[out_end+1] = D.R[RansDecGet(&R[1], TF_SHIFT)]; - // fall through - case 1: - 
out_buf[out_end] = D.R[RansDecGet(&R[0], TF_SHIFT)]; - // fall through - default: - break; - } - - *out_size = out_sz; - - return (unsigned char *)out_buf; -} - -unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - unsigned char *out_buf = NULL, *out_end, *cp; - unsigned int last_i, tab_size, rle_i, rle_j; - RansEncSymbol (*syms)[256] = NULL; /* syms[256][256] */ - int (*F)[256] = NULL; /* F[256][256] */ - int *T = NULL; /* T[256] */ - int i, j; - unsigned char c; - - if (in_size < 4) - return rans_compress_O0(in, in_size, out_size); - - syms = malloc(256 * sizeof(*syms)); - if (!syms) goto cleanup; - F = calloc(256, sizeof(*F)); - if (!F) goto cleanup; - T = calloc(256, sizeof(*T)); - if (!T) goto cleanup; - out_buf = malloc(1.05*in_size + 257*257*3 + 9); - if (!out_buf) goto cleanup; - - out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; - cp = out_buf+9; - - //for (last = 0, i=in_size-1; i>=0; i--) { - // F[last][c = in[i]]++; - // T[last]++; - // last = c; - //} - - for (last_i=i=0; i>2)]]++; - F[0][in[2*(in_size>>2)]]++; - F[0][in[3*(in_size>>2)]]++; - T[0]+=3; - - // Normalise so T[i] == TOTFREQ - for (rle_i = i = 0; i < 256; i++) { - int t2, m, M; - unsigned int x; - - if (T[i] == 0) - continue; - - //uint64_t p = (TOTFREQ * TOTFREQ) / t; - double p = ((double)TOTFREQ)/T[i]; - normalise_harder: - for (t2 = m = M = j = 0; j < 256; j++) { - if (!F[i][j]) - continue; - - if (m < F[i][j]) - m = F[i][j], M = j; - - //if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0) - if ((F[i][j] *= p) == 0) - F[i][j] = 1; - t2 += F[i][j]; - } - - t2++; - if (t2 < TOTFREQ) { - F[i][M] += TOTFREQ-t2; - } else if (t2-TOTFREQ >= F[i][M]/2) { - // Corner case to avoid excessive frequency reduction - p = .98; goto normalise_harder; - } else { - F[i][M] -= t2-TOTFREQ; - } - - // Store frequency table - // i - if (rle_i) { - rle_i--; - } else { - *cp++ = i; - // FIXME: could use order-0 statistics to observe which alphabet - // 
symbols are present and base RLE on that ordering instead. - if (i && T[i-1]) { - for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++) - ; - rle_i -= i+1; - *cp++ = rle_i; - } - } - - int *F_i_ = F[i]; - x = 0; - rle_j = 0; - for (j = 0; j < 256; j++) { - if (F_i_[j]) { - //fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x); - - // j - if (rle_j) { - rle_j--; - } else { - *cp++ = j; - if (!rle_j && j && F_i_[j-1]) { - for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++) - ; - rle_j -= j+1; - *cp++ = rle_j; - } - } - - // F_i_[j] - if (F_i_[j]<128) { - *cp++ = F_i_[j]; - } else { - *cp++ = 128 | (F_i_[j]>>8); - *cp++ = F_i_[j]&0xff; - } - - RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT); - x += F_i_[j]; - } - } - *cp++ = 0; - } - *cp++ = 0; - - //write(1, out_buf+4, cp-(out_buf+4)); - tab_size = cp - out_buf; - assert(tab_size < 257*257*3); - - RansState rans0, rans1, rans2, rans3; - RansEncInit(&rans0); - RansEncInit(&rans1); - RansEncInit(&rans2); - RansEncInit(&rans3); - - uint8_t* ptr = out_end; - - int isz4 = in_size>>2; - int i0 = 1*isz4-2; - int i1 = 2*isz4-2; - int i2 = 3*isz4-2; - int i3 = 4*isz4-2; - - unsigned char l0 = in[i0+1]; - unsigned char l1 = in[i1+1]; - unsigned char l2 = in[i2+1]; - unsigned char l3 = in[i3+1]; - - // Deal with the remainder - l3 = in[in_size-1]; - for (i3 = in_size-2; i3 > 4*isz4-2; i3--) { - unsigned char c3 = in[i3]; - RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]); - l3 = c3; - } - - for (; i0 >= 0; i0--, i1--, i2--, i3--) { - unsigned char c0, c1, c2, c3; - RansEncSymbol *s3 = &syms[c3 = in[i3]][l3]; - RansEncSymbol *s2 = &syms[c2 = in[i2]][l2]; - RansEncSymbol *s1 = &syms[c1 = in[i1]][l1]; - RansEncSymbol *s0 = &syms[c0 = in[i0]][l0]; - - RansEncPutSymbol(&rans3, &ptr, s3); - RansEncPutSymbol(&rans2, &ptr, s2); - RansEncPutSymbol(&rans1, &ptr, s1); - RansEncPutSymbol(&rans0, &ptr, s0); - - l0 = c0; - l1 = c1; - l2 = c2; - l3 = c3; - } - - RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]); - RansEncPutSymbol(&rans2, &ptr, 
&syms[0][l2]); - RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]); - RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]); - - RansEncFlush(&rans3, &ptr); - RansEncFlush(&rans2, &ptr); - RansEncFlush(&rans1, &ptr); - RansEncFlush(&rans0, &ptr); - - *out_size = (out_end - ptr) + tab_size; - - cp = out_buf; - *cp++ = 1; // order - - *cp++ = ((*out_size-9)>> 0) & 0xff; - *cp++ = ((*out_size-9)>> 8) & 0xff; - *cp++ = ((*out_size-9)>>16) & 0xff; - *cp++ = ((*out_size-9)>>24) & 0xff; - - *cp++ = (in_size>> 0) & 0xff; - *cp++ = (in_size>> 8) & 0xff; - *cp++ = (in_size>>16) & 0xff; - *cp++ = (in_size>>24) & 0xff; - - memmove(out_buf + tab_size, ptr, out_end-ptr); - - cleanup: - free(syms); - free(F); - free(T); - - return out_buf; -} - -unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Load in the static tables */ - unsigned char *cp = in + 9; - unsigned char *ptr_end = in + in_size; - int i, j = -999, x, rle_i, rle_j; - unsigned int out_sz, in_sz; - char *out_buf = NULL; - ari_decoder *D = NULL; /* D[256] */ - RansDecSymbol (*syms)[256] = NULL; /* syms[256][256] */ - - if (in_size < 27) // Need at least this many bytes to start - return NULL; - - if (*in++ != 1) // Order-1 check - return NULL; - - in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | - (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); - out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | - (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); - if (in_sz != in_size-9) - return NULL; - - if (out_sz >= INT_MAX) - return NULL; // protect against some overflow cases - - // calloc may add 2% overhead to CRAM decode, but on linux with glibc it's - // often the same thing due to using mmap. - D = calloc(256, sizeof(*D)); - if (!D) goto cleanup; - syms = malloc(256 * sizeof(*syms)); - if (!syms) goto cleanup; - /* These memsets prevent illegal memory access in syms due to - broken compressed data. 
As D is calloc'd, all illegal transitions - will end up in either row or column 0 of syms. */ - memset(&syms[0], 0, sizeof(syms[0])); - for (i = 1; i < 256; i++) memset(&syms[i][0], 0, sizeof(syms[0][0])); - - //fprintf(stderr, "out_sz=%d\n", out_sz); - - //i = *cp++; - rle_i = 0; - i = *cp++; - do { - rle_j = x = 0; - j = *cp++; - do { - int F, C; - if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left - if ((F = *cp++) >= 128) { - F &= ~128; - F = ((F & 127) << 8) | *cp++; - } - C = x; - - //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, F, C); - - if (!F) - F = TOTFREQ; - - RansDecSymbolInit(&syms[i][j], C, F); - - /* Build reverse lookup table */ - if (x + F > TOTFREQ) - goto cleanup; - memset(&D[i].R[x], j, F); - - x += F; - - if (!rle_j && j+1 == *cp) { - j = *cp++; - rle_j = *cp++; - } else if (rle_j) { - rle_j--; - j++; - if (j > 255) - goto cleanup; - } else { - j = *cp++; - } - } while(j); - - if (x < TOTFREQ-1 || x > TOTFREQ) - goto cleanup; - if (x < TOTFREQ) // historically we fill 4095, not 4096 - D[i].R[x] = D[i].R[x-1]; - - if (!rle_i && i+1 == *cp) { - i = *cp++; - rle_i = *cp++; - } else if (rle_i) { - rle_i--; - i++; - if (i > 255) - goto cleanup; - } else { - i = *cp++; - } - } while (i); - - // Precompute reverse lookup of frequency. 
- - RansState rans0, rans1, rans2, rans3; - uint8_t *ptr = cp; - if (ptr > ptr_end - 16) goto cleanup; // Not enough input bytes left - RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto cleanup; - - int isz4 = out_sz>>2; - int l0 = 0; - int l1 = 0; - int l2 = 0; - int l3 = 0; - int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4}; - - RansState R[4]; - R[0] = rans0; - R[1] = rans1; - R[2] = rans2; - R[3] = rans3; - - /* Allocate output buffer */ - out_buf = malloc(out_sz); - if (!out_buf) goto cleanup; - - for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) { - uint32_t m[4] = {R[0] & ((1u << TF_SHIFT)-1), - R[1] & ((1u << TF_SHIFT)-1), - R[2] & ((1u << TF_SHIFT)-1), - R[3] & ((1u << TF_SHIFT)-1)}; - - uint8_t c[4] = {D[l0].R[m[0]], - D[l1].R[m[1]], - D[l2].R[m[2]], - D[l3].R[m[3]]}; - - out_buf[i4[0]] = c[0]; - out_buf[i4[1]] = c[1]; - out_buf[i4[2]] = c[2]; - out_buf[i4[3]] = c[3]; - - //RansDecAdvanceSymbolStep(&R[0], &syms[l0][c[0]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[1], &syms[l1][c[1]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[2], &syms[l2][c[2]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[3], &syms[l3][c[3]], TF_SHIFT); - - R[0] = syms[l0][c[0]].freq * (R[0]>>TF_SHIFT); - R[0] += m[0] - syms[l0][c[0]].start; - R[1] = syms[l1][c[1]].freq * (R[1]>>TF_SHIFT); - R[1] += m[1] - syms[l1][c[1]].start; - R[2] = syms[l2][c[2]].freq * (R[2]>>TF_SHIFT); - R[2] += m[2] - syms[l2][c[2]].start; - R[3] = syms[l3][c[3]].freq * (R[3]>>TF_SHIFT); - R[3] += m[3] - syms[l3][c[3]].start; - - if (ptr < ptr_end - 8) { // Each renorm reads no more than 2 bytes - RansDecRenorm(&R[0], &ptr); - RansDecRenorm(&R[1], &ptr); - RansDecRenorm(&R[2], &ptr); - RansDecRenorm(&R[3], &ptr); - } else { - RansDecRenormSafe(&R[0], &ptr, ptr_end); - RansDecRenormSafe(&R[1], &ptr, 
ptr_end); - RansDecRenormSafe(&R[2], &ptr, ptr_end); - RansDecRenormSafe(&R[3], &ptr, ptr_end); - } - - l0 = c[0]; - l1 = c[1]; - l2 = c[2]; - l3 = c[3]; - } - - // Remainder - for (; i4[3] < out_sz; i4[3]++) { - unsigned char c3 = D[l3].R[RansDecGet(&R[3], TF_SHIFT)]; - out_buf[i4[3]] = c3; - - uint32_t m = R[3] & ((1u << TF_SHIFT)-1); - R[3] = syms[l3][c3].freq * (R[3]>>TF_SHIFT) + m - syms[l3][c3].start; - RansDecRenormSafe(&R[3], &ptr, ptr_end); - l3 = c3; - } - - *out_size = out_sz; - - cleanup: - if (D) - free(D); - free(syms); - - return (unsigned char *)out_buf; -} - -/*----------------------------------------------------------------------------- - * Simple interface to the order-0 vs order-1 encoders and decoders. - */ -unsigned char *rans_compress(unsigned char *in, unsigned int in_size, - unsigned int *out_size, int order) { - return order - ? rans_compress_O1(in, in_size, out_size) - : rans_compress_O0(in, in_size, out_size); -} - -unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Both rans_uncompress functions need to be able to read at least 9 - bytes. */ - if (in_size < 9) - return NULL; - return in[0] - ? rans_uncompress_O1(in, in_size, out_size) - : rans_uncompress_O0(in, in_size, out_size); -} - - -#ifdef TEST_MAIN -/*----------------------------------------------------------------------------- - * Main. - * - * This is a simple command line tool for testing order-0 and order-1 - * compression using the rANS codec. Simply compile with - * - * gcc -DTEST_MAIN -O3 -I. 
cram/rANS_static.c -o cram/rANS_static - * - * Usage: cram/rANS_static -o0 < file > file.o0 - * cram/rANS_static -d < file.o0 > file2 - * - * cram/rANS_static -o1 < file > file.o1 - * cram/rANS_static -d < file.o1 > file2 - */ -int main(int argc, char **argv) { - int opt, order = 0; - unsigned char in_buf[BLK_SIZE2+257*257*3]; - int decode = 0; - FILE *infp = stdin, *outfp = stdout; - struct timeval tv1, tv2; - size_t bytes = 0; - - extern char *optarg; - extern int optind; - - while ((opt = getopt(argc, argv, "o:d")) != -1) { - switch (opt) { - case 'o': - order = atoi(optarg); - break; - - case 'd': - decode = 1; - break; - } - } - - order = order ? 1 : 0; // Only support O(0) and O(1) - - if (optind < argc) { - if (!(infp = fopen(argv[optind], "rb"))) { - perror(argv[optind]); - return 1; - } - optind++; - } - - if (optind < argc) { - if (!(outfp = fopen(argv[optind], "wb"))) { - perror(argv[optind]); - fclose(infp); - return 1; - } - optind++; - } - - gettimeofday(&tv1, NULL); - - if (decode) { - // Only used in some test implementations of RC_GetFreq() - //RC_init(); - //RC_init2(); - - for (;;) { - uint32_t in_size, out_size; - unsigned char *out; - - if (9 != fread(in_buf, 1, 9, infp)) - break; - in_size = *(int *)&in_buf[1]; - if (in_size != fread(in_buf+9, 1, in_size, infp)) { - fprintf(stderr, "Truncated input\n"); - exit(1); - } - out = rans_uncompress(in_buf, in_size+9, &out_size); - if (!out) - abort(); - - fwrite(out, 1, out_size, outfp); - free(out); - - bytes += out_size; - } - } else { - for (;;) { - uint32_t in_size, out_size; - unsigned char *out; - - in_size = fread(in_buf, 1, BLK_SIZE, infp); - if (in_size <= 0) - break; - - out = rans_compress(in_buf, in_size, &out_size, order); - - fwrite(out, 1, out_size, outfp); - free(out); - - bytes += in_size; - } - } - - gettimeofday(&tv2, NULL); - - fprintf(stderr, "Took %ld microseconds, %5.1f MB/s\n", - (long)(tv2.tv_sec - tv1.tv_sec)*1000000 + - tv2.tv_usec - tv1.tv_usec, - (double)bytes / 
((long)(tv2.tv_sec - tv1.tv_sec)*1000000 + - tv2.tv_usec - tv1.tv_usec)); - - if (infp != stdin) fclose(infp); - if (outfp != stdout) fclose(outfp); - - return 0; -} -#endif diff --git a/cram/rANS_static.h b/cram/rANS_static.h deleted file mode 100644 index 5c3cf2cbf..000000000 --- a/cram/rANS_static.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2014 Genome Research Ltd. - * Author(s): James Bonfield - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger - * Institute nor the names of its contributors may be used to endorse - * or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH - * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - -#ifndef RANS_STATIC_H -#define RANS_STATIC_H - -#ifdef __cplusplus -extern "C" { -#endif - -unsigned char *rans_compress(unsigned char *in, unsigned int in_size, - unsigned int *out_size, int order); -unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, - unsigned int *out_size); - -#ifdef __cplusplus -} -#endif - -#endif /* RANS_STATIC_H */ diff --git a/htscodecs b/htscodecs new file mode 160000 index 000000000..4e06c5d79 --- /dev/null +++ b/htscodecs @@ -0,0 +1 @@ +Subproject commit 4e06c5d795b2a603bebf141cb88082901b41399c diff --git a/htscodecs.mk.in b/htscodecs.mk.in new file mode 100644 index 000000000..f3bba4f31 --- /dev/null +++ b/htscodecs.mk.in @@ -0,0 +1,5 @@ +# This is @configure_input@ + +# This file selects Makefile rules for htscodecs + +include $(HTSPREFIX)@selected_htscodecs_mk@ diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk new file mode 100644 index 000000000..fac3fd956 --- /dev/null +++ b/htscodecs_bundled.mk @@ -0,0 +1,3 @@ +HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c + +HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) diff --git a/htscodecs_external.mk b/htscodecs_external.mk new file mode 100644 index 000000000..8d5640842 --- /dev/null +++ b/htscodecs_external.mk @@ -0,0 +1,6 @@ +HTSCODECS_SOURCES = +HTSCODECS_OBJS = + +htscodecs_rANS_static_h = + +htscodecs_rANS_byte_h = diff --git a/htslib.mk b/htslib.mk index 254c25797..4d0fa0205 100644 --- a/htslib.mk +++ b/htslib.mk @@ -35,6 +35,13 @@ HTSPREFIX = $(HTSDIR)/ include $(HTSDIR)/htslib_vars.mk +# This file provides the HTSCODECS_SOURCES variable. It may not be present +# in a freshly checked-out htslib, so is only included if available. The +# absence is unlikely to cause a problem as there will be plenty of other +# missing files that will trigger a build in htslib, and when that happens +# htslib's makefile will create it. +-include $(HTSDIR)/htscodecs.mk + # Rules for rebuilding an in-development htslib's static and shared libraries. 
# If your program foo links with libhts, adding the appropriate prerequisite # will cause the library to be rebuilt as necessary: @@ -138,13 +145,11 @@ HTSLIB_ALL = \ $(HTSDIR)/cram/os.h \ $(HTSDIR)/cram/pooled_alloc.c \ $(HTSDIR)/cram/pooled_alloc.h \ - $(HTSDIR)/cram/rANS_byte.h \ - $(HTSDIR)/cram/rANS_static.c \ - $(HTSDIR)/cram/rANS_static.h \ $(HTSDIR)/cram/string_alloc.c \ $(HTSDIR)/cram/string_alloc.h \ $(HTSDIR)/os/lzma_stub.h \ - $(HTSDIR)/os/rand.c + $(HTSDIR)/os/rand.c \ + $(HTSCODECS_SOURCES) $(HTSDIR)/config.h: +cd $(HTSDIR) && $(MAKE) config.h diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl index 161a7214c..43fb5be4e 100755 --- a/test/maintainer/check_copyright.pl +++ b/test/maintainer/check_copyright.pl @@ -43,6 +43,9 @@ sub check { # Only check C, perl and shell files return unless (/(?:\.[ch]|\.pl|\.sh)$/); + # Exclude htscodecs submodule + return if (/\/htscodecs\//); + # Exclusions: my %exclude = map { ("$root/$_", 1) } ( 'config.h', # Auto-generated diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl index d5a53b1ed..81b4ededc 100755 --- a/test/maintainer/check_spaces.pl +++ b/test/maintainer/check_spaces.pl @@ -43,6 +43,9 @@ sub check { # Only check C, perl and shell files return unless (/(?:\.[ch]|\.pl|\.sh)$/); + # Exclude htscodecs submodule + return if (/\/htscodecs\//); + my %allow_tabs = map { ("$root/$_", 1) } ( 'kfunc.c', 'kstring.c', From 4a38d3d5dcb26e09ab33dd5195995b58120b3958 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 16 Sep 2019 17:56:23 +0100 Subject: [PATCH 036/114] Added CRAM 3.1 encode and decode support. Note: this is experimental, subject to the 3.1 spec being agreed upon by the GA4GH File Formats committee. Example usage (high compression): ./test/test_view -C -o version=3.1 -o use_fqz -o use_bzip2 -o use_arith \ -o seqs_per_slice=100000 /tmp/30.cram -p /tmp/31-max.cram Also added support for compression profiles. 
Primarily these are CRAM only, but for consistency they're also used as synonyms for compression levels in bgzf. In the process of adding CRAM 3.1 several improvements and refactoring took place to the cram encoder (mainly by dint of copying over the cram_compress_block function from io_lib): - Added a "Gzip -1" deflate strategy as sometimes it's smaller than gzip or comparable size (but considerably faster). This is purely for metrics and optimisation. - Refactored the cram_compress_block function so it can deal with many more compression method / strategies. It now has a number of lookup tables which need to be kept synchronised (see comments). - Added a new cram_compress_block2 function which takes a slice. This is used for the fqzcomp code to obtain the read lengths. - Cram_compress_block now has a consistency metric too. This is to learn when data isn't changing rapidly so don't be too quick to reevaluate. --- Makefile | 17 +- cram/cram_encode.c | 121 ++++++-- cram/cram_io.c | 701 +++++++++++++++++++++++++++--------------- cram/cram_io.h | 3 + cram/cram_structs.h | 45 +-- hts.c | 52 +++- htscodecs_bundled.mk | 7 +- htscodecs_external.mk | 9 + htslib/cram.h | 45 ++- htslib/hts.h | 13 + 10 files changed, 691 insertions(+), 322 deletions(-) diff --git a/Makefile b/Makefile index 53a39d13a..d6525a516 100644 --- a/Makefile +++ b/Makefile @@ -212,9 +212,18 @@ sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) +htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h +htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h +htscodecs_pack_h = htscodecs/htscodecs/pack.h htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h +htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h +htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h +htscodecs_varint_h = htscodecs/htscodecs/varint.h 
htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h +htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) +htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h # To be effective, config.mk needs to appear after most Makefile variables are # set but before most rules appear, so that it can both use previously-set @@ -374,7 +383,7 @@ cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) @@ -382,7 +391,13 @@ cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h 
cram/po cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) +htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) +htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) + bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 66f27eebf..7d2f5a9a5 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -825,86 +825,139 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { /* Compress the CORE Block too, with minimal zlib level */ if (level > 5 && s->block[0]->uncomp_size > 500) - cram_compress_block(fd, s->block[0], NULL, 1<block[0], NULL, 1<use_bz2) method |= 1<use_rans) - method |= (1<use_rans) { + method_ranspr = (1< 1) + method_ranspr 
|= + (1< 5) + method_ranspr |= (1<version >= (3<<8)+1); + if (fd->use_rans) { + methodF |= v31_or_above ? method_ranspr : method_rans; + method |= v31_or_above ? method_ranspr : method_rans; + } + + int method_arith = 0; + if (fd->use_arith) { + method_arith = (1< 1) + method_arith |= + (1<use_arith && v31_or_above) { + methodF |= method_arith; + method |= method_arith; + } if (fd->use_lzma) method |= (1<= 6) + if (level >= 5) { + method |= 1<use_fqz) { + qmethod |= 1<level > 4) { + qmethod |= 1<level > 6) { + qmethod |= (1<block[DS_IN], fd->m[DS_IN], //IN (seq) - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq) + method, level)) return -1; if (fd->level == 0) { /* Do nothing */ } else if (fd->level == 1) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - methodF, 1)) + if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethodF, 1)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, 1)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, 1)) return -1; } } else if (fd->level < 3) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethod, 1)) return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA], + method, 1)) return -1; if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB], + method, 1)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, level)) return -1; } } else { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, level)) + if 
(cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethod, level)) return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA], + method, level)) return -1; if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB], + method, level)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, level)) return -1; } } // NAME: best is generally xz, bzip2, zlib then rans1 - if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN], - method & ~(1<version >= (3<<8)+1 && fd->use_tok) + method_rn |= fd->use_arith ? (1<block[DS_RN], fd->m[DS_RN], + method_rn, level)) return -1; // NS shows strong local correlation as rearrangements are localised if (s->block[DS_NS] != s->block[0]) - if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS], + method, level)) return -1; @@ -920,8 +973,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { if (s->aux_block[i]->method != RAW) continue; - if (cram_compress_block(fd, s->aux_block[i], s->aux_block[i]->m, - method, level)) + if (cram_compress_block2(fd, s, s->aux_block[i], s->aux_block[i]->m, + method, level)) return -1; } } @@ -938,7 +991,7 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { if (s->block[i]->method != RAW) continue; - if (cram_compress_block(fd, s->block[i], fd->m[i], + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], methodF, level)) return -1; } diff --git a/cram/cram_io.c b/cram/cram_io.c index d5ad50000..f771d2496 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -76,8 +76,16 @@ OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HAVE_EXTERNAL_LIBHTSCODECS) #include +#include +#include +#include +#include #else #include "../htscodecs/htscodecs/rANS_static.h" +#include "../htscodecs/htscodecs/rANS_static4x16.h" +#include "../htscodecs/htscodecs/arith_dynamic.h" +#include "../htscodecs/htscodecs/tokenise_name3.h" +#include "../htscodecs/htscodecs/fqzcomp_qual.h" #endif //#define REF_DEBUG @@ -103,6 +111,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define TRIAL_SPAN 50 #define NTRIALS 3 +#define CRAM_DEFAULT_LEVEL 5 /* ---------------------------------------------------------------------- * ITF8 encoding and decoding. @@ -1182,6 +1191,62 @@ int cram_uncompress_block(cram_block *b) { break; } + case FQZ: { + uncomp_size = b->uncomp_size; + uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0); + if (!uncomp) + return -1; + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = uncomp_size; + b->method = RAW; + break; + } + + case RANS_PR0: { + unsigned int usize = b->uncomp_size, usize2; + uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); + if (!uncomp || usize != usize2) + return -1; + b->orig_method = RANS_PR0 + (b->data[0]&1) + + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = usize2; + b->method = RAW; + b->uncomp_size = usize2; // Just incase it differs + //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); + break; + } + + case ARITH_PR0: { + unsigned int usize = b->uncomp_size, usize2; + uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); + if (!uncomp || usize != usize2) + return -1; + b->orig_method = ARITH_PR0 + (b->data[0]&1) + + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = usize2; + b->method = RAW; + b->uncomp_size = usize2; // Just incase it differs + 
//fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); + break; + } + + case NAME_TOK3: { + uint32_t out_len; + uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); + b->orig_method = NAME_TOK3; + b->method = RAW; + free(b->data); + b->data = cp; + b->alloc = out_len; + b->uncomp_size = out_len; + break; + } + default: return -1; } @@ -1189,12 +1254,14 @@ int cram_uncompress_block(cram_block *b) { return 0; } -static char *cram_compress_by_method(char *in, size_t in_size, +static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, int content_id, size_t *out_size, enum cram_block_method method, int level, int strat) { switch (method) { case GZIP: + case GZIP_RLE: + case GZIP_1: // Read names bizarrely benefit from zlib over libdeflate for // mid-range compression levels. Focusing purely of ratio or // speed, libdeflate still wins. It also seems to win for @@ -1230,6 +1297,32 @@ static char *cram_compress_by_method(char *in, size_t in_size, #endif } + case FQZ: + case FQZ_b: + case FQZ_c: + case FQZ_d: { + // Extract the necessary portion of the slice into an fqz_slice struct. + // These previously were the same thing, but this permits us to detach + // the codec from the rest of this CRAM implementation. + fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice)); + if (!f) + return NULL; + f->num_records = s->hdr->num_records; + f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice)); + f->flags = f->len + s->hdr->num_records; + int i; + for (i = 0; i < s->hdr->num_records; i++) { + f->flags[i] = s->crecs[i].flags; + f->len[i] = (i+1 < s->hdr->num_records + ? 
s->crecs[i+1].qual - s->crecs[i].qual + : s->block[DS_QS]->uncomp_size - s->crecs[i].qual); + } + char *comp = fqz_compress(strat & 0xff /* cram vers */, f, + in, in_size, out_size, strat >> 8, NULL); + free(f); + return comp; + } + case LZMA: #ifdef HAVE_LIBLZMA return lzma_mem_deflate(in, in_size, out_size, level); @@ -1237,23 +1330,67 @@ static char *cram_compress_by_method(char *in, size_t in_size, return NULL; #endif - case RANS0: { + case RANS0: + case RANS1: { unsigned int out_size_i; unsigned char *cp; - cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0); + cp = rans_compress((unsigned char *)in, in_size, &out_size_i, + method == RANS0 ? 0 : 1); *out_size = out_size_i; return (char *)cp; } - case RANS1: { + case RANS_PR0: + case RANS_PR1: + case RANS_PR64: + case RANS_PR9: + case RANS_PR128: + case RANS_PR129: + case RANS_PR192: + case RANS_PR193: { unsigned int out_size_i; unsigned char *cp; - cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1); + // see enum cram_block. We map RANS_* methods to order bit-fields + static int methmap[] = { 1, 64,9, 128,129, 192,193 }; + + cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, + method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]); + *out_size = out_size_i; + return (char *)cp; + } + + case ARITH_PR0: + case ARITH_PR1: + case ARITH_PR64: + case ARITH_PR9: + case ARITH_PR128: + case ARITH_PR129: + case ARITH_PR192: + case ARITH_PR193: { + unsigned int out_size_i; + unsigned char *cp; + + // see enum cram_block. We map ARITH_* methods to order bit-fields + static int methmap[] = { 1, 64,9, 128,129, 192,193 }; + + cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i, + method == ARITH_PR0 ? 
0 : methmap[method - ARITH_PR1]); *out_size = out_size_i; return (char *)cp; } + case NAME_TOK3: + case NAME_TOKA: { + int out_len; + int lev = level; + if (method == NAME_TOK3 && lev > 3) + lev = 3; + uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL); + *out_size = out_len; + return (char *)cp; + } + case RAW: break; @@ -1275,13 +1412,34 @@ static char *cram_compress_by_method(char *in, size_t in_size, * * Method and level -1 implies defaults, as specified in cram_fd. */ -int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, - int method, int level) { +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level) { char *comp = NULL; size_t comp_size = 0; int strat; + // Internally we have parameterised methods that externally map + // to the same CRAM method value. + // See enum_cram_block_method. + int methmap[] = { + // Externally defined values + RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, + + // Reserved for possible expansion + 0, 0, + + // Internally parameterised versions matching back to above + // external values + GZIP, GZIP, + FQZ, FQZ, FQZ, + RANS, + RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, + TOK3, + ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, + }; + if (b->method != RAW) { // Maybe already compressed if s->block[0] was compressed and // we have e.g. 
s->block[DS_BA] set to s->block[0] due to only @@ -1314,14 +1472,10 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, if (metrics) { pthread_mutex_lock(&fd->metrics_lock); if (metrics->trial > 0 || --metrics->next_trial <= 0) { - size_t sz_best = INT_MAX; - size_t sz_gz_rle = 0; - size_t sz_gz_def = 0; - size_t sz_rans0 = 0; - size_t sz_rans1 = 0; - size_t sz_bzip2 = 0; - size_t sz_lzma = 0; - int method_best = 0; + int m; + size_t sz_best = b->uncomp_size; + size_t sz[CRAM_MAX_METHOD] = {0}; + int method_best = 0; // RAW char *c_best = NULL, *c = NULL; if (metrics->revised_method) @@ -1332,243 +1486,201 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, if (metrics->next_trial <= 0) { metrics->next_trial = TRIAL_SPAN; metrics->trial = NTRIALS; - metrics->sz_gz_rle /= 2; - metrics->sz_gz_def /= 2; - metrics->sz_rans0 /= 2; - metrics->sz_rans1 /= 2; - metrics->sz_bzip2 /= 2; - metrics->sz_lzma /= 2; - } - - pthread_mutex_unlock(&fd->metrics_lock); - - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_gz_rle, GZIP, 1, Z_RLE); - if (c && sz_best > sz_gz_rle) { - sz_best = sz_gz_rle; - method_best = GZIP_RLE; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_gz_rle = b->uncomp_size*2+1000; - } - - //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle); - } - - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_gz_def, GZIP, level, - Z_FILTERED); - if (c && sz_best > sz_gz_def) { - sz_best = sz_gz_def; - method_best = GZIP; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_gz_def = b->uncomp_size*2+1000; - } - - //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def); + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] /= 2; } - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_rans0, RANS0, 0, 0); - if (c && sz_best > sz_rans0) { - sz_best = sz_rans0; - 
method_best = RANS0; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_rans0 = b->uncomp_size*2+1000; - } + // Compress this block using the best method + if (metrics->stats && metrics->stats->nvals > 16) { + // No point trying bit-pack if 17+ symbols. + if (method & (1<metrics_lock); - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_rans1, RANS1, 0, 0); - if (c && sz_best > sz_rans1) { - sz_best = sz_rans1; - method_best = RANS1; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); + for (m = 0; m < CRAM_MAX_METHOD; m++) { + if (method & (1<version); break; + case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; + case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; + case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + case NAME_TOK3:strat = 0; break; + case NAME_TOKA:strat = 1; break; + default: strat = 0; + } + + c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, + b->content_id, &sz[m], m, lvl, strat); + + if (c && sz_best > sz[m]) { + sz_best = sz[m]; + method_best = m; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw + } } else { - sz_rans1 = b->uncomp_size*2+1000; + sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw } } + //fprintf(stderr, "sz_best = %d\n", sz_best); - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_bzip2, BZIP2, level, 0); - if (c && sz_best > sz_bzip2) { - sz_best = sz_bzip2; - method_best = BZIP2; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_bzip2 = b->uncomp_size*2+1000; - } - } + if (c_best) { + free(b->data); + b->data = (unsigned char *)c_best; + //printf("method_best = %s\n", cram_block_method2str(method_best)); - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_lzma, LZMA, level, 0); - if (c && sz_best > sz_lzma) { - sz_best = sz_lzma; - method_best = 
LZMA; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_lzma = b->uncomp_size*2+1000; - } + b->method = method_best; // adjusted to methmap[method_best] later + b->comp_size = sz_best; } - //fprintf(stderr, "sz_best = %d\n", sz_best); - - free(b->data); - b->data = (unsigned char *)c_best; - //printf("method_best = %s\n", cram_block_method2str(method_best)); - b->method = method_best == GZIP_RLE ? GZIP : method_best; - b->comp_size = sz_best; - + // Accumulate stats for all methods tried pthread_mutex_lock(&fd->metrics_lock); - metrics->sz_gz_rle += sz_gz_rle; - metrics->sz_gz_def += sz_gz_def; - metrics->sz_rans0 += sz_rans0; - metrics->sz_rans1 += sz_rans1; - metrics->sz_bzip2 += sz_bzip2; - metrics->sz_lzma += sz_lzma; + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] += sz[m]+50; // don't be overly sure on small blocks + + // When enough trials performed, find the best on average if (--metrics->trial == 0) { int best_method = RAW; int best_sz = INT_MAX; - // Scale methods by cost - if (fd->level <= 3) { - metrics->sz_rans1 *= 1.02; - metrics->sz_gz_def *= 1.04; - metrics->sz_bzip2 *= 1.08; - metrics->sz_lzma *= 1.10; + // Relative costs of methods. 
See enum_cram_block_method and methmap + double meth_cost[32] = { + // Externally defined methods + 1, // 0 raw + 1.04, // 1 gzip (Z_FILTERED) + 1.08, // 2 bzip2 + 1.04, // 3 lzma + 1.00, // 4 rans (O0) + 1.00, // 5 ranspr (O0) + 1.03, // 6 arithpr (O0) + 1.05, // 7 fqz + 1.05, // 8 tok3 (rans) + 9, 9, // 9,10 reserved + + // Paramterised versions of above + 1.01, // gzip rle + 1.02, // gzip -1 + + 1.05, 1.05, 1.05, // FQZ_b,c,d + + 1.01, // rans O1 + + 1.01, // rans_pr1 + 1.00, // rans_pr64; if smaller, usually fast + 1.03, // rans_pr65/9 + 1.00, // rans_pr128 + 1.01, // rans_pr129 + 1.00, // rans_pr192 + 1.01, // rans_pr193 + + 1.07, // tok3 arith + + 1.04, // arith_pr1 + 1.04, // arith_pr64 + 1.04, // arith_pr65 + 1.03, // arith_pr128 + 1.04, // arith_pr129 + 1.04, // arith_pr192 + 1.04, // arith_pr193 + }; + + // Scale methods by cost based on compression level + if (fd->level <= 1) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)*4; + } else if (fd->level <= 3) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1); } else if (fd->level <= 6) { - metrics->sz_rans1 *= 1.01; - metrics->sz_gz_def *= 1.02; - metrics->sz_bzip2 *= 1.03; - metrics->sz_lzma *= 1.05; + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)/2; + } else if (fd->level <= 7) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)/3; + } // else cost is ignored + + for (m = 0; m < CRAM_MAX_METHOD; m++) { + if ((!metrics->sz[m]) || (!(method & (1< metrics->sz[m]) + best_sz = metrics->sz[m], best_method = m; } - if (method & (1< metrics->sz_gz_rle) - best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE; - - if (method & (1< metrics->sz_gz_def) - best_sz = metrics->sz_gz_def, best_method = GZIP; - - if (method & (1< metrics->sz_rans0) - best_sz = metrics->sz_rans0, best_method = RANS0; - - if (method & (1< metrics->sz_rans1) - best_sz = metrics->sz_rans1, best_method = RANS1; - - if (method 
& (1< metrics->sz_bzip2) - best_sz = metrics->sz_bzip2, best_method = BZIP2; - - if (method & (1< metrics->sz_lzma) - best_sz = metrics->sz_lzma, best_method = LZMA; - - if (best_method == GZIP_RLE) { - metrics->method = GZIP; - metrics->strat = Z_RLE; + if (best_method != metrics->method) { + metrics->trial = (NTRIALS+1)/2; // be sure + //metrics->next_trial /= 1.5; + metrics->consistency = 0; } else { - metrics->method = best_method; - metrics->strat = Z_FILTERED; + metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0); + metrics->consistency++; } + metrics->method = best_method; + switch (best_method) { + case GZIP: strat = Z_FILTERED; break; + case GZIP_1: strat = Z_DEFAULT_STRATEGY; break; + case GZIP_RLE: strat = Z_RLE; break; + case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; + case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; + case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; + case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + default: strat = 0; + } + metrics->strat = strat; + // If we see at least MAXFAIL trials in a row for a specific // compression method with more than MAXDELTA aggregate // size then we drop this from the list of methods used // for this block type. 
#define MAXDELTA 0.20 #define MAXFAILS 4 - if (best_method == GZIP_RLE) { - metrics->gz_rle_cnt = 0; - metrics->gz_rle_extra = 0; - } else if (best_sz < metrics->sz_gz_rle) { - double r = (double)metrics->sz_gz_rle / best_sz - 1; - if (++metrics->gz_rle_cnt >= MAXFAILS && - (metrics->gz_rle_extra += r) >= MAXDELTA) - method &= ~(1<gz_def_cnt = 0; - metrics->gz_def_extra = 0; - } else if (best_sz < metrics->sz_gz_def) { - double r = (double)metrics->sz_gz_def / best_sz - 1; - if (++metrics->gz_def_cnt >= MAXFAILS && - (metrics->gz_def_extra += r) >= MAXDELTA) - method &= ~(1<cnt[m] = 0; + metrics->extra[m] = 0; + } else if (best_sz < metrics->sz[m]) { + double r = (double)metrics->sz[m] / best_sz - 1; + int mul = 1+(fd->level>=7); + if (++metrics->cnt[m] >= MAXFAILS*mul && + (metrics->extra[m] += r) >= MAXDELTA*mul) + method &= ~(1<sz[m] > best_sz) + method &= ~(1<rans0_cnt = 0; - metrics->rans0_extra = 0; - } else if (best_sz < metrics->sz_rans0) { - double r = (double)metrics->sz_rans0 / best_sz - 1; - if (++metrics->rans0_cnt >= MAXFAILS && - (metrics->rans0_extra += r) >= MAXDELTA) - method &= ~(1<rans1_cnt = 0; - metrics->rans1_extra = 0; - } else if (best_sz < metrics->sz_rans1) { - double r = (double)metrics->sz_rans1 / best_sz - 1; - if (++metrics->rans1_cnt >= MAXFAILS && - (metrics->rans1_extra += r) >= MAXDELTA) - method &= ~(1<bzip2_cnt = 0; - metrics->bzip2_extra = 0; - } else if (best_sz < metrics->sz_bzip2) { - double r = (double)metrics->sz_bzip2 / best_sz - 1; - if (++metrics->bzip2_cnt >= MAXFAILS && - (metrics->bzip2_extra += r) >= MAXDELTA) - method &= ~(1<lzma_cnt = 0; - metrics->lzma_extra = 0; - } else if (best_sz < metrics->sz_lzma) { - double r = (double)metrics->sz_lzma / best_sz - 1; - if (++metrics->lzma_cnt >= MAXFAILS && - (metrics->lzma_extra += r) >= MAXDELTA) - method &= ~(1<revised_method) - // fprintf(stderr, "%d: method from %x to %x\n", + //if (fd->verbose > 1 && method != metrics->revised_method) + // fprintf(stderr, "%d: 
revising method from %x to %x\n", // b->content_id, metrics->revised_method, method); metrics->revised_method = method; } @@ -1578,40 +1690,55 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, method = metrics->method; pthread_mutex_unlock(&fd->metrics_lock); - comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, b->content_id, &comp_size, method, - level, strat); + method == GZIP_1 ? 1 : level, + strat); if (!comp) return -1; - free(b->data); - b->data = (unsigned char *)comp; - b->comp_size = comp_size; - b->method = method; + + if (comp_size < b->uncomp_size) { + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = method; + } else { + free(comp); + } } } else { // no cached metrics, so just do zlib? - comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, b->content_id, &comp_size, GZIP, level, Z_FILTERED); if (!comp) { - hts_log_error("Compression failed"); + hts_log_error("Compression failed!"); return -1; } - free(b->data); - b->data = (unsigned char *)comp; - b->comp_size = comp_size; - b->method = GZIP; + + if (comp_size < b->uncomp_size) { + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = GZIP; + } else { + free(comp); + } + strat = Z_FILTERED; } hts_log_info("Compressed block ID %d from %d to %d by method %s", b->content_id, b->uncomp_size, b->comp_size, cram_block_method2str(b->method)); - if (b->method == RANS1) - b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing + b->method = methmap[b->method]; return 0; } +int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, + int method, int level) { + return cram_compress_block2(fd, NULL, b, metrics, method, level); +} cram_metrics *cram_new_metrics(void) { cram_metrics *m = calloc(1, sizeof(*m)); @@ -1628,13 
+1755,36 @@ cram_metrics *cram_new_metrics(void) { char *cram_block_method2str(enum cram_block_method m) { switch(m) { - case RAW: return "RAW"; - case GZIP: return "GZIP"; - case BZIP2: return "BZIP2"; - case LZMA: return "LZMA"; - case RANS0: return "RANS0"; - case RANS1: return "RANS1"; - case GZIP_RLE: return "GZIP_RLE"; + case RAW: return "RAW"; + case GZIP: return "GZIP"; + case BZIP2: return "BZIP2"; + case LZMA: return "LZMA"; + case RANS0: return "RANS0"; + case RANS1: return "RANS1"; + case GZIP_RLE: return "GZIP_RLE"; + case GZIP_1: return "GZIP_1"; + case FQZ: return "FQZ"; + case FQZ_b: return "FQZ_b"; + case FQZ_c: return "FQZ_c"; + case FQZ_d: return "FQZ_d"; + case RANS_PR0: return "RANS_PR0"; + case RANS_PR1: return "RANS_PR1"; + case RANS_PR64: return "RANS_PR64"; + case RANS_PR9: return "RANS_PR9"; + case RANS_PR128: return "RANS_PR128"; + case RANS_PR129: return "RANS_PR129"; + case RANS_PR192: return "RANS_PR192"; + case RANS_PR193: return "RANS_PR193"; + case NAME_TOK3: return "TOK3_R"; + case NAME_TOKA: return "TOK3_A"; + case ARITH_PR0: return "ARITH_PR0"; + case ARITH_PR1: return "ARITH_PR1"; + case ARITH_PR64: return "ARITH_PR64"; + case ARITH_PR9: return "ARITH_PR9"; + case ARITH_PR128: return "ARITH_PR128"; + case ARITH_PR129: return "ARITH_PR129"; + case ARITH_PR192: return "ARITH_PR192"; + case ARITH_PR193: return "ARITH_PR193"; case BM_ERROR: break; } return "?"; @@ -3496,12 +3646,7 @@ void reset_metrics(cram_fd *fd) { m->next_trial = TRIAL_SPAN; m->revised_method = 0; - m->sz_gz_rle = 0; - m->sz_gz_def = 0; - m->sz_rans0 = 0; - m->sz_rans1 = 0; - m->sz_bzip2 = 0; - m->sz_lzma = 0; + memset(m->sz, 0, sizeof(m->sz)); } } @@ -4435,7 +4580,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { if (!fd) return NULL; - fd->level = 5; + fd->level = CRAM_DEFAULT_LEVEL; for (i = 0; mode[i]; i++) { if (mode[i] >= '0' && mode[i] <= '9') { fd->level = mode[i] - '0'; @@ -4510,6 +4655,7 @@ cram_fd *cram_dopen(hFILE *fp, 
const char *filename, const char *mode) { fd->lossy_read_names = 0; fd->use_bz2 = 0; fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); + fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1); fd->use_lzma = 0; fd->multi_seq = -1; fd->multi_seq_user = -1; @@ -4793,6 +4939,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { case CRAM_OPT_SEQS_PER_SLICE: fd->seqs_per_slice = va_arg(args, int); + if (fd->bases_per_slice == BASES_PER_SLICE) + fd->bases_per_slice = fd->seqs_per_slice * 500; break; case CRAM_OPT_BASES_PER_SLICE: @@ -4833,6 +4981,18 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->use_rans = va_arg(args, int); break; + case CRAM_OPT_USE_TOK: + fd->use_tok = va_arg(args, int); + break; + + case CRAM_OPT_USE_FQZ: + fd->use_fqz = va_arg(args, int); + break; + + case CRAM_OPT_USE_ARITH: + fd->use_arith = va_arg(args, int); + break; + case CRAM_OPT_USE_LZMA: fd->use_lzma = va_arg(args, int); break; @@ -4888,8 +5048,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } if (!((major == 1 && minor == 0) || (major == 2 && (minor == 0 || minor == 1)) || - (major == 3 && minor == 0))) { - hts_log_error("Unknown version string; use 1.0, 2.0, 2.1 or 3.0"); + (major == 3 && (minor == 0 || minor == 1)))) { + hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0 or 3.1"); errno = EINVAL; return -1; } @@ -4897,6 +5057,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; + fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) >= 3 && + CRAM_MINOR_VERS(fd->version) >= 1)) ? 
1 : 0; break; } @@ -4960,6 +5122,41 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->level = va_arg(args, int); break; + case HTS_OPT_PROFILE: { + enum hts_profile_option prof = va_arg(args, int); + switch (prof) { + case HTS_PROFILE_FAST: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1; + fd->use_tok = 0; + fd->seqs_per_slice = 10000; + break; + + case HTS_PROFILE_NORMAL: + break; + + case HTS_PROFILE_SMALL: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6; + fd->use_bz2 = 1; + fd->use_fqz = 1; + fd->seqs_per_slice = 25000; + break; + + case HTS_PROFILE_ARCHIVE: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7; + fd->use_bz2 = 1; + fd->use_fqz = 1; + fd->use_arith = 1; + if (fd->level > 7) + fd->use_lzma = 1; + fd->seqs_per_slice = 100000; + break; + } + + if (fd->bases_per_slice == BASES_PER_SLICE) + fd->bases_per_slice = fd->seqs_per_slice * 500; + break; + } + default: hts_log_error("Unknown CRAM option code %d", opt); errno = EINVAL; diff --git a/cram/cram_io.h b/cram/cram_io.h index 194305398..3954a49af 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -467,6 +467,9 @@ int cram_uncompress_block(cram_block *b); */ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level); +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level); cram_metrics *cram_new_metrics(void); char *cram_block_method2str(enum cram_block_method m); diff --git a/cram/cram_structs.h b/cram/cram_structs.h index ce7ad665a..1c51b09f7 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -194,15 +194,7 @@ struct cram_slice; /* Now in htslib/cram.h enum cram_block_method { - BM_ERROR = -1, - RAW = 0, - GZIP = 1, - BZIP2 = 2, - LZMA = 3, - RANS = 4, // Generic; either order - RANS0 = 4, - RANS1 = 10, // Not externalised; stored as RANS (generic) - GZIP_RLE = 11, // NB: not externalised in CRAM + ... 
}; */ @@ -218,39 +210,29 @@ enum cram_content_type { }; */ +/* Maximum simultaneous codecs allowed, 1 per bit */ +#define CRAM_MAX_METHOD 32 + /* Compression metrics */ struct cram_metrics { // number of trials and time to next trial int trial; int next_trial; + int consistency; // aggregate sizes during trials - int sz_gz_rle; - int sz_gz_def; - int sz_rans0; - int sz_rans1; - int sz_bzip2; - int sz_lzma; + int sz[CRAM_MAX_METHOD]; // resultant method from trials - int method; + int method, revised_method; int strat; // Revisions of method, to allow culling of continually failing ones. - int gz_rle_cnt; - int gz_def_cnt; - int rans0_cnt; - int rans1_cnt; - int bzip2_cnt; - int lzma_cnt; - int revised_method; - - double gz_rle_extra; - double gz_def_extra; - double rans0_extra; - double rans1_extra; - double bzip2_extra; - double lzma_extra; + int cnt[CRAM_MAX_METHOD]; + + double extra[CRAM_MAX_METHOD]; + + cram_stats *stats; }; // Hash aux key (XX:i) to cram_metrics @@ -729,6 +711,9 @@ struct cram_fd { int use_bz2; int use_rans; int use_lzma; + int use_fqz; + int use_tok; + int use_arith; int shared_ref; unsigned int required_fields; int store_md; diff --git a/hts.c b/hts.c index 9e50290cf..8e4552f87 100644 --- a/hts.c +++ b/hts.c @@ -761,6 +761,34 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "USE_LZMA") == 0) o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val); + else if (strcmp(o->arg, "use_tok") == 0 || + strcmp(o->arg, "USE_TOK") == 0) + o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_fqz") == 0 || + strcmp(o->arg, "USE_FQZ") == 0) + o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_arith") == 0 || + strcmp(o->arg, "USE_ARITH") == 0) + o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val); + + else if (strcmp(o->arg, "fast") == 0 || + strcmp(o->arg, "FAST") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST; + + else if (strcmp(o->arg, "normal") == 0 || + 
strcmp(o->arg, "NORMAL") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL; + + else if (strcmp(o->arg, "small") == 0 || + strcmp(o->arg, "SMALL") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL; + + else if (strcmp(o->arg, "archive") == 0 || + strcmp(o->arg, "ARCHIVE") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE; + else if (strcmp(o->arg, "reference") == 0 || strcmp(o->arg, "REFERENCE") == 0) o->opt = CRAM_OPT_REFERENCE, o->val.s = val; @@ -1340,7 +1368,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_end(args); if (fp->is_bgzf) fp->fp.bgzf->compress_level = level; - return 0; + break; } case HTS_OPT_FILTER: { @@ -1350,6 +1378,28 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { return hts_set_filter_expression(fp, expr); } + case HTS_OPT_PROFILE: { + va_start(args, opt); + enum hts_profile_option prof = va_arg(args, int); + va_end(args); + if (fp->is_bgzf) { + switch (prof) { +#ifdef HAVE_LIBDEFLATE + case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 2; break; + case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; + case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 10; break; + case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break; +#else + case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 1; break; + case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; + case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 8; break; + case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 9; break; +#endif + } + } // else CRAM manages this in its own way + break; + } + default: break; } diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index fac3fd956..d947f985f 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -1,3 +1,8 @@ -HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c +HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ + $(HTSPREFIX)htscodecs/htscodecs/fqzcomp_qual.c \ + 
$(HTSPREFIX)htscodecs/htscodecs/pack.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ + $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) diff --git a/htscodecs_external.mk b/htscodecs_external.mk index 8d5640842..f8d4d7d4a 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -1,6 +1,15 @@ HTSCODECS_SOURCES = HTSCODECS_OBJS = +htscodecs_arith_dynamic_h = +htscodecs_fqzcomp_qual_h = +htscodecs_pack_h = htscodecs_rANS_static_h = +htscodecs_rANS_static4x16_h = +htscodecs_tokenise_name3_h = +htscodecs_varint_h = htscodecs_rANS_byte_h = +htscodecs_c_range_coder_h = +htscodecs_c_simple_model_h = +htscodecs_pooled_alloc_h = diff --git a/htslib/cram.h b/htslib/cram.h index bbbabe82e..890896388 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -53,10 +53,46 @@ enum cram_block_method { GZIP = 1, BZIP2 = 2, LZMA = 3, - RANS = 4, // Generic; either order - RANS0 = 4, - RANS1 = 10, // Not externalised; stored as RANS (generic) + RANS = 4, RANS0 = RANS, + RANSPR = 5, RANS_PR0 = RANSPR, + ARITH = 6, ARITH_PR0 = ARITH, + FQZ = 7, + TOK3 = 8, NAME_TOK3 = TOK3, + + // Methods not externalised, but used in metrics. + // Externally they become one of the above methods. 
GZIP_RLE = 11, // NB: not externalised in CRAM + GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM + + FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods + + //RANS0, // Order 0 + RANS1, + + //RANS_PR0, // Order 0 + RANS_PR1, // Order 1 + RANS_PR64, // O0 + RLE + RANS_PR9, // O1 + X4 + RANS_PR128, // O0 + Pack + RANS_PR129, // O1 + Pack + RANS_PR192, // O0 + RLE + pack + RANS_PR193, // O1 + RLE + pack + + //NAME_TOK3, // tok+rans + NAME_TOKA, // tok+arith + + //ARITH_PR0, // Order 0 + ARITH_PR1, // Order 1 + ARITH_PR64, // O0 + RLE + ARITH_PR9, // O1 + X4 + ARITH_PR128, // O0 + Pack + ARITH_PR129, // O1 + Pack + ARITH_PR192, // O0 + RLE + pack + ARITH_PR193, // O1 + RLE + pack + + // NB: must end on no more than 31 unless we change to a + // 64-bit method type. + }; enum cram_content_type { @@ -306,6 +342,9 @@ int cram_uncompress_block(cram_block *b); HTSLIB_EXPORT int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level); +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level); /**@}*/ /**@{ ---------------------------------------------------------------------- diff --git a/htslib/hts.h b/htslib/hts.h index f4e06efdd..3c6a3dcb6 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -316,6 +316,9 @@ enum hts_fmt_option { CRAM_OPT_STORE_MD, CRAM_OPT_STORE_NM, CRAM_OPT_RANGE_NOSEEK, // CRAM_OPT_RANGE minus the seek + CRAM_OPT_USE_TOK, + CRAM_OPT_USE_FQZ, + CRAM_OPT_USE_ARITH, // General purpose HTS_OPT_COMPRESSION_LEVEL = 100, @@ -324,6 +327,16 @@ enum hts_fmt_option { HTS_OPT_CACHE_SIZE, HTS_OPT_BLOCK_SIZE, HTS_OPT_FILTER, + HTS_OPT_PROFILE, +}; + +// Profile options for encoding; primarily used at present in CRAM +// but also usable in BAM as a synonym for deflate compression levels. 
+enum hts_profile_option { + HTS_PROFILE_FAST, + HTS_PROFILE_NORMAL, + HTS_PROFILE_SMALL, + HTS_PROFILE_ARCHIVE, }; // For backwards compatibility From a22e8ca60df122e23412dd255d3ffb4fc9f6a350 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 18 Sep 2019 16:34:34 +0100 Subject: [PATCH 037/114] Minor tweaks to codec learning params. --- cram/cram_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index f771d2496..06affc592 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -108,7 +108,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PATH_MAX FILENAME_MAX #endif -#define TRIAL_SPAN 50 +#define TRIAL_SPAN 70 #define NTRIALS 3 #define CRAM_DEFAULT_LEVEL 5 @@ -1573,18 +1573,18 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, // Externally defined methods 1, // 0 raw 1.04, // 1 gzip (Z_FILTERED) - 1.08, // 2 bzip2 - 1.04, // 3 lzma + 1.07, // 2 bzip2 + 1.08, // 3 lzma 1.00, // 4 rans (O0) 1.00, // 5 ranspr (O0) - 1.03, // 6 arithpr (O0) + 1.04, // 6 arithpr (O0) 1.05, // 7 fqz 1.05, // 8 tok3 (rans) 9, 9, // 9,10 reserved // Paramterised versions of above 1.01, // gzip rle - 1.02, // gzip -1 + 1.01, // gzip -1 1.05, 1.05, 1.05, // FQZ_b,c,d @@ -1745,7 +1745,7 @@ cram_metrics *cram_new_metrics(void) { if (!m) return NULL; m->trial = NTRIALS-1; - m->next_trial = TRIAL_SPAN; + m->next_trial = TRIAL_SPAN/2; // learn quicker at start m->method = RAW; m->strat = 0; m->revised_method = 0; From 9be58e5a68e01eddd679f41a95836d3a6ecae0b7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 18 Sep 2019 11:38:47 +0100 Subject: [PATCH 038/114] Ensure threads in the pool have a big enough stack Some rANS codecs require over 2 Mbytes of temporary space for the encode/decoder state. On some platforms, notably MacOS the default thread stack size is not big enough to hold this. 
Previously we fixed this by using malloc instead, but that has other undesirable side effects such as repeated mmap/munmap and wasting time zeroing pages over and over. Our traditional fix here has been to use pthread_once to get a single malloc and then reuse this block, to avoid the mmap issues in glibc. However this is complicated; ensuring the thread stack is sufficiently big is much easier. --- thread_pool.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/thread_pool.c b/thread_pool.c index 2bc427d51..f56b781b3 100644 --- a/thread_pool.c +++ b/thread_pool.c @@ -42,6 +42,10 @@ DEALINGS IN THE SOFTWARE. */ #include "thread_pool_internal.h" #include "htslib/hts_log.h" +// Minimum stack size for threads. Required for some rANS codecs +// that use over 2Mbytes of stack for encoder / decoder state +#define HTS_MIN_THREAD_STACK (3 * 1024 * 1024) + static void hts_tpool_process_detach_locked(hts_tpool *p, hts_tpool_process *q); @@ -716,6 +720,9 @@ static void wake_next_worker(hts_tpool_process *q, int locked) { */ hts_tpool *hts_tpool_init(int n) { int t_idx = 0; + size_t stack_size = 0; + pthread_attr_t pattr; + int pattr_init_done = 0; hts_tpool *p = malloc(sizeof(*p)); if (!p) return NULL; @@ -748,18 +755,32 @@ hts_tpool *hts_tpool_init(int n) { pthread_mutex_lock(&p->pool_m); + // Ensure new threads have a reasonably large stack. On some platforms, + // for example MacOS which defaults to 512Kb, this is not big enough + // for some of the rANS codecs.
+ + if (pthread_attr_init(&pattr) < 0) + goto cleanup; + pattr_init_done = 1; + if (pthread_attr_getstacksize(&pattr, &stack_size) < 0) + goto cleanup; + if (stack_size < HTS_MIN_THREAD_STACK) { + if (pthread_attr_setstacksize(&pattr, HTS_MIN_THREAD_STACK) < 0) + goto cleanup; + } + for (t_idx = 0; t_idx < n; t_idx++) { hts_tpool_worker *w = &p->t[t_idx]; p->t_stack[t_idx] = 0; w->p = p; w->idx = t_idx; pthread_cond_init(&w->pending_c, NULL); - if (0 != pthread_create(&w->tid, NULL, tpool_worker, w)) { + if (0 != pthread_create(&w->tid, &pattr, tpool_worker, w)) goto cleanup; - } } pthread_mutex_unlock(&p->pool_m); + pthread_attr_destroy(&pattr); return p; @@ -778,6 +799,8 @@ hts_tpool *hts_tpool_init(int n) { pthread_cond_destroy(&p->t[j].pending_c); } pthread_mutex_destroy(&p->pool_m); + if (pattr_init_done) + pthread_attr_destroy(&pattr); free(p->t_stack); free(p->t); free(p); From 4182364d6e79cd640a492968c54fa6f91d8bef29 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 23 Oct 2020 18:04:24 +0100 Subject: [PATCH 039/114] Add warning about unofficial status of CRAM 3.1. This cannot be avoided! It's spam, but it will be removed once the spec becomes official. If there are no changes, that does mean this htslib will be compliant, but spammy, however we can then make a new release. --- cram/cram_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cram/cram_io.c b/cram/cram_io.c index 06affc592..fa8b3e346 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5053,6 +5053,15 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { errno = EINVAL; return -1; } + + if (major > 3 || (major == 3 && minor > 0)) { + hts_log_warning( + "CRAM version %s is still in draft and is subject to\n" + "change. Please consider this a technology demonstration " + "and do not use for\n" + "long term archival of data.", s); + } + fd->version = major*256 + minor; fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 
1 : 0; From 1e79b61467de646c63f614a599fcdba7a76bf03b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 19 Sep 2019 09:57:35 +0100 Subject: [PATCH 040/114] Add some very minimal 3.1 cram tests Not sure we have any files large enough to really put it through its paces, but this is a reasonable start. --- test/test.pl | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/test/test.pl b/test/test.pl index 5db7b2320..823a414c5 100755 --- a/test/test.pl +++ b/test/test.pl @@ -582,6 +582,34 @@ sub test_view testv $opts, "./test_view $tv_args $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + ## Experimental CRAM 3.1 support. + # SAM -> CRAM31u -> SAM + foreach my $profile (qw/fast normal small archive/) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=3.1 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } + + # BAM -> CRAM31 -> BAM -> SAM + $cram = "$bam.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 $bam > $cram"; + testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; + testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; + + # CRAM31 -> CRAM30 + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram > $cram.cram"; + + # CRAM30 -> CRAM31 + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 $cram.cram > $cram"; + + # CRAM31 -> CRAM31 + multi-slice + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; + testv $opts, "./test_view $tv_args $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + # Java pre-made CRAM -> SAM my $jcram = "${base}_java.cram"; if (-e $jcram) { From 3e05b18bc8f9e74de3cfbb305358f3cc57b06d1e Mon Sep 17 
00:00:00 2001 From: James Bonfield Date: Mon, 3 Feb 2020 12:42:07 +0000 Subject: [PATCH 041/114] Updated to v0.5 htscodecs --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 4e06c5d79..99ed6bcca 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 4e06c5d795b2a603bebf141cb88082901b41399c +Subproject commit 99ed6bcca2192a3c210fac04939c7e51a0b15a3c From 0d1971cbebb269d90a563464e4f56987290e20ab Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 7 Feb 2020 15:28:37 +0000 Subject: [PATCH 042/114] First tranch of CRAM V4.0 support. We can read/write CRAM 3.1. We can read CRAM 4.0, but writing 4.0 is corrupted. --- Makefile | 8 +- cram/cram_codecs.c | 1751 +++++++++++++++++++++++++++++++++++------ cram/cram_codecs.h | 60 +- cram/cram_decode.c | 516 +++++++----- cram/cram_encode.c | 735 ++++++++--------- cram/cram_external.c | 10 +- cram/cram_io.c | 847 ++++++++++++++++---- cram/cram_io.h | 305 +------ cram/cram_structs.h | 50 +- hts.c | 2 +- htscodecs_bundled.mk | 1 + htscodecs_external.mk | 1 + htslib/cram.h | 12 +- test/test.pl | 28 + 14 files changed, 3013 insertions(+), 1313 deletions(-) diff --git a/Makefile b/Makefile index d6525a516..4168c0463 100644 --- a/Makefile +++ b/Makefile @@ -217,6 +217,7 @@ htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h htscodecs_pack_h = htscodecs/htscodecs/pack.h htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h +htscodecs_rle_h = htscodecs/htscodecs/rle.h htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h htscodecs_varint_h = htscodecs/htscodecs/varint.h @@ -378,12 +379,12 @@ probaln.o probaln.pico: probaln.c config.h $(htslib_hts_h) realn.o realn.pico: realn.c config.h $(htslib_hts_h) $(htslib_sam_h) textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(htslib_sam_h) $(hts_internal_h) -cram/cram_codecs.o 
cram/cram_codecs.pico: cram/cram_codecs.c config.h $(cram_h) +cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) @@ -394,8 +395,9 @@ thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) htscodecs/htscodecs/arith_dynamic.o 
htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) -htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) +htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h) htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 80131b633..a598f3964 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -42,6 +42,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include + +#include "../htslib/hts_endian.h" + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#include +#include +#else +#include "../htscodecs/htscodecs/varint.h" +#include "../htscodecs/htscodecs/pack.h" +#include "../htscodecs/htscodecs/rle.h" +#endif #include "cram.h" @@ -149,8 +162,8 @@ static void store_bytes_MSB(cram_block *block, char *bytes, int len) { #endif /* Local optimised copy for inlining */ -static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { - unsigned int val = 0; +static inline int64_t get_bits_MSB(cram_block *block, int nbits) { + uint64_t val = 0; int i; #if 0 @@ -239,7 +252,7 @@ static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { * characters with exactly the correct frequency distribution we check * for it elsewhere.) */ -static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { +static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) { //fprintf(stderr, " store_bits: %02x %d\n", val, nbits); /* @@ -248,15 +261,15 @@ static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { */ unsigned int mask; - if (block->byte+4 >= block->alloc) { + if (block->byte+8 >= block->alloc) { if (block->byte) { block->alloc *= 2; - block->data = realloc(block->data, block->alloc + 4); + block->data = realloc(block->data, block->alloc + 8); if (!block->data) return -1; } else { block->alloc = 1024; - block->data = realloc(block->data, block->alloc + 4); + block->data = realloc(block->data, block->alloc + 8); if (!block->data) return -1; block->data[0] = 0; // initialise first byte of buffer @@ -314,7 +327,6 @@ static char *cram_extract_block(cram_block *b, int size) { */ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int l; char *cp; cram_block *b; @@ -325,16 +337,36 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cp = (char *)b->data + b->idx; // E_INT and 
E_LONG are guaranteed single item queries - l = safe_itf8_get(cp, (char *)b->data + b->uncomp_size, (int32_t *)out); - b->idx += l; + int err = 0; + *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_external_decode_sint(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32s(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; *out_size = 1; - return l > 0 ? 0 : -1; + return err ? -1 : 0; } int cram_external_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int64_t l; char *cp; cram_block *b; @@ -345,11 +377,32 @@ int cram_external_decode_long(cram_slice *slice, cram_codec *c, cp = (char *)b->data + b->idx; // E_INT and E_LONG are guaranteed single item queries - l = safe_ltf8_get(cp, (char *)b->data + b->uncomp_size, (int64_t *)out); - b->idx += l; + int err = 0; + *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; *out_size = 1; - return l > 0 ? 0 : -1; + return err ? 
-1 : 0; +} + +int cram_external_decode_slong(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64s(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; } int cram_external_decode_char(cram_slice *slice, cram_codec *c, @@ -400,9 +453,26 @@ void cram_external_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_external_decode_init(char *data, int size, + +int cram_external_decode_size(cram_slice *slice, cram_codec *c) { + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return -1; + + return b->uncomp_size; +} + +cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) { + return cram_get_block_by_id(slice, c->u.external.content_id); +} + +cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; char *cp = data; @@ -415,15 +485,21 @@ cram_codec *cram_external_decode_init(char *data, int size, c->codec = E_EXTERNAL; if (option == E_INT) c->decode = cram_external_decode_int; + else if (option == E_SINT) + c->decode = cram_external_decode_sint; else if (option == E_LONG) c->decode = cram_external_decode_long; + else if (option == E_SLONG) + c->decode = cram_external_decode_slong; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = cram_external_decode_char; else c->decode = cram_external_decode_block; c->free = cram_external_decode_free; + c->size = cram_external_decode_size; + c->get_block = cram_external_get_block; - cp += 
safe_itf8_get(cp, data + size, &c->u.external.content_id); + c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL); if (cp - data != size) goto malformed; @@ -441,16 +517,25 @@ cram_codec *cram_external_decode_init(char *data, int size, int cram_external_encode_int(cram_slice *slice, cram_codec *c, char *in, int in_size) { uint32_t *i32 = (uint32_t *)in; + return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1; +} - return itf8_put_blk(c->out, *i32) >= 0 ? 0 : -1; +int cram_external_encode_sint(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int32_t *i32 = (int32_t *)in; + return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1; } int cram_external_encode_long(cram_slice *slice, cram_codec *c, char *in, int in_size) { uint64_t *i64 = (uint64_t *)in; + return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1; +} - ltf8_put_blk(c->out, *i64); - return 0; +int cram_external_encode_slong(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *i64 = (int64_t *)in; + return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 
0 : -1; } int cram_external_encode_char(cram_slice *slice, cram_codec *c, @@ -470,7 +555,7 @@ void cram_external_encode_free(cram_codec *c) { int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { - char tmp[99], *tp = tmp; + char tmp[99], *tp = tmp, *tpend = tmp+99; int len = 0, r = 0, n; if (prefix) { @@ -479,9 +564,9 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->u.e_external.content_id); - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, tp-tmp)); r |= n; + tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id); + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; @@ -495,7 +580,7 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, cram_codec *cram_external_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; c = malloc(sizeof(*c)); @@ -505,13 +590,18 @@ cram_codec *cram_external_encode_init(cram_stats *st, c->free = cram_external_encode_free; if (option == E_INT) c->encode = cram_external_encode_int; + else if (option == E_SINT) + c->encode = cram_external_encode_sint; else if (option == E_LONG) c->encode = cram_external_encode_long; + else if (option == E_SLONG) + c->encode = cram_external_encode_slong; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->encode = cram_external_encode_char; else abort(); c->store = cram_external_encode_store; + c->flush = NULL; c->u.e_external.content_id = (size_t)dat; @@ -526,107 +616,1194 @@ int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char int64_t *out_i = (int64_t *)out; int i, n = *out_size; - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; + if (c->u.beta.nbits) { + if 
(cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + +int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n = *out_size; + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + +int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int i, n = *out_size; + + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + if (out) + for (i = 0; i < n; i++) + out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + else + for (i = 0; i < n; i++) + get_bits_MSB(in, c->u.beta.nbits); + } else { + if (out) + for (i = 0; i < n; i++) + out[i] = -c->u.beta.offset; + } + + return 0; +} + +void cram_beta_decode_free(cram_codec *c) { + if (c) + free(c); +} + +cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_BETA; + if (option == E_INT || option == E_SINT) + c->decode = cram_beta_decode_int; + else if (option == E_LONG || option == E_SLONG) + c->decode = cram_beta_decode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_beta_decode_char; + else { + hts_log_error("BYTE_ARRAYs not supported by this codec"); + free(c); + return NULL; + } + c->free = cram_beta_decode_free; + + c->u.beta.nbits = -1; + c->u.beta.offset = vv->varint_get32(&cp, data + 
size, NULL); + if (cp < data + size) // Ensure test below works + c->u.beta.nbits = vv->varint_get32(&cp, data + size, NULL); + + if (cp - data != size + || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) { + hts_log_error("Malformed beta header stream"); + free(c); + return NULL; + } + + return c; +} + +int cram_beta_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + // codec length + len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset) + + c->vv->varint_size(c->u.e_beta.nbits))); + r |= n; + len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits)); r |= n; + + if (r > 0) return len; + + block_err: + return -1; +} + +int cram_beta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +int cram_beta_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int *syms = (int *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +int cram_beta_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + unsigned char *syms = (unsigned char *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +void cram_beta_encode_free(cram_codec *c) { + if (c) free(c); +} + +cram_codec *cram_beta_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + int min_val, max_val, 
len = 0; + int64_t range; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + c->codec = E_BETA; + c->free = cram_beta_encode_free; + if (option == E_INT || option == E_SINT) + c->encode = cram_beta_encode_int; + else if (option == E_LONG || option == E_SLONG) + c->encode = cram_beta_encode_long; + else + c->encode = cram_beta_encode_char; + c->store = cram_beta_encode_store; + c->flush = NULL; + + if (dat) { + min_val = ((int *)dat)[0]; + max_val = ((int *)dat)[1]; + } else { + min_val = INT_MAX; + max_val = INT_MIN; + int i; + for (i = 0; i < MAX_STAT_VAL; i++) { + if (!st->freqs[i]) + continue; + if (min_val > i) + min_val = i; + max_val = i; + } + if (st->h) { + khint_t k; + + for (k = kh_begin(st->h); k != kh_end(st->h); k++) { + if (!kh_exist(st->h, k)) + continue; + + i = kh_key(st->h, k); + if (min_val > i) + min_val = i; + if (max_val < i) + max_val = i; + } + } + } + + assert(max_val >= min_val); + c->u.e_beta.offset = -min_val; + range = (int64_t) max_val - min_val; + while (range) { + len++; + range >>= 1; + } + c->u.e_beta.nbits = len; + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XPACK: Packing multiple values into a single byte. A fast transform that + * reduces time taken by entropy encoder and may also improve compression. + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. 
+ */ +int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n = *out_size; + + if (c->u.xpack.nbits) { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)]; + } else { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[0]; + } + + return 0; +} + +int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n = *out_size; + + if (c->u.xpack.nbits) { + if (cram_not_enough_bits(in, c->u.xpack.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)]; + } else { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[0]; + } + + return 0; +} + +static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) { + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (b) + return 0; + + // get sub-codec data. + cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec); + if (!sub_b) + return -1; + + // Allocate local block to expand into + b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0); + if (!b) + return -1; + int n = sub_b->uncomp_size * 8/c->u.xpack.nbits; + BLOCK_GROW(b, n); + b->uncomp_size = n; + + uint8_t p[256]; + int z; + for (z = 0; z < 256; z++) + p[z] = c->u.xpack.rmap[z]; + hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size, + 8 / c->u.xpack.nbits, p); + + return 0; + + block_err: + return -1; +} + +int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // FIXME: we need to ban data-series interleaving in the spec for this to work. + + // Remember this may be called when threaded and multi-slice per container. + // Hence one cram_codec instance, multiple slices, multiple blocks. + // We therefore have to cache appropriate block info in slice and not codec. 
+ // b = cram_get_block_by_id(slice, c->external.content_id); + if (c->u.xpack.nval > 1) { + cram_xpack_decode_expand_char(slice, c); + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (!b) + return -1; + + if (out) + memcpy(out, b->data + b->byte, *out_size); + b->byte += *out_size; + } else { + memset(out, c->u.xpack.rmap[0], *out_size); + } + + return 0; +} + +void cram_xpack_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xpack.sub_codec) + c->u.xpack.sub_codec->free(c->u.xpack.sub_codec); + + //free(slice->block_by_id[512 + c->codec_id]); + //slice->block_by_id[512 + c->codec_id] = 0; + + free(c); +} + +int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) { + cram_xpack_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} + +cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) { + cram_xpack_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} + +cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + char *endp = data+size; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XPACK; + if (option == E_LONG) + c->decode = cram_xpack_decode_long; + else if (option == E_INT) + c->decode = cram_xpack_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_xpack_decode_char; + else { + fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); + return NULL; + } + c->free = cram_xpack_decode_free; + c->size = cram_xpack_decode_size; + c->get_block = cram_xpack_get_block; + + c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL); + c->u.xpack.nval = vv->varint_get32(&cp, endp, NULL); + int i; + for (i = 0; i < c->u.xpack.nval; i++) { + uint32_t v = vv->varint_get32(&cp, endp, NULL); + if (v >= 256) return NULL; + c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K 
+ } + + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); + if (c->u.xpack.sub_codec == NULL) + goto malformed; + cp += sub_size; + + if (cp - data != size + || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) { + malformed: + fprintf(stderr, "Malformed xpack header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_xpack_encode_flush(cram_codec *c) { + // Pack the buffered up data + int meta_len; + uint64_t out_len; + uint8_t out_meta[1024]; + uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out), + out_meta, &meta_len, &out_len); + + // We now need to pass this through the next layer of transform + if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming + c->u.e_xpack.sub_codec, + (char *)out, out_len)) + return -1; + + int r = 0; + if (c->u.e_xpack.sub_codec->flush) + r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec); + + free(out); + return r; +} + +int cram_xpack_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + // Store sub-codec + cram_codec *tc = c->u.e_xpack.sub_codec; + cram_block *tb = cram_new_block(0, 0); + if (!tb) + return -1; + int len2 = tc->store(tc, tb, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + + // codec length + int len1 = 0, i; + for (i = 0; i < c->u.e_xpack.nval; i++) + len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n; + len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits) + + c->vv->varint_size(c->u.e_xpack.nval) + + len1 + len2)); r |= n; + + // The map and sub-codec + len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n; + len += 
(n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval)); r |= n; + for (i = 0; i < c->u.e_xpack.nval; i++) + len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n; + + BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb)); + + cram_free_block(tb); + + return r > 0 ? len + len2 : -1; + + block_err: + return -1; +} + +// Same as cram_beta_encode_long +int cram_xpack_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits); + + return r; +} + +int cram_xpack_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int *syms = (int *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits); + + return r; +} + +int cram_xpack_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + BLOCK_APPEND(c->out, in, in_size); + return 0; + + block_err: + return -1; +} + +void cram_xpack_encode_free(cram_codec *c) { + if (!c) return; + + if (c->u.e_xpack.sub_codec) + c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec); + + cram_free_block(c->out); + + free(c); +} + +cram_codec *cram_xpack_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XPACK; + c->free = cram_xpack_encode_free; + if (option == E_LONG) + c->encode = cram_xpack_encode_long; + else if (option == E_INT) + c->encode = cram_xpack_encode_int; + else + c->encode = cram_xpack_encode_char; + c->store = cram_xpack_encode_store; + c->flush = cram_xpack_encode_flush; + + cram_xpack_encoder *e = (cram_xpack_encoder *)dat; + c->u.e_xpack.nbits = e->nbits; + c->u.e_xpack.nval = e->nval; + c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL, + E_BYTE_ARRAY, e->sub_codec_dat, + version, vv); 
+ + // Initialise fwd and rev maps + memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3 + int i, n; + for (i = n = 0; i < 256; i++) + if (e->map[i] != -1) + c->u.e_xpack.rmap[n++] = i; // 0,1,2,3 to P,A,C,K + if (n != e->nval) { + fprintf(stderr, "Incorrectly specified number of map items in PACK\n"); + return NULL; + } + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XDELTA: subtract successive values, zig-zag to turn +/- to + only, + * and then var-int encode the result. + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. + */ + +static uint8_t zigzag8 (int8_t x) { return (x << 1) ^ (x >> 7); } +static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); } +static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); } + +//static int8_t unzigzag8 (uint8_t x) { return (x >> 1) ^ -(x & 1); } +static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); } +static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); } + +int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + return -1; +} + +int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // Slow value-by-value method for now + uint32_t *out32 = (uint32_t *)out; + int i; + for (i = 0; i < *out_size; i++) { + uint32_t v; + int one = 1; + if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in, + (char *)&v, &one) < 0) + return -1; + uint32_t d = unzigzag32(v); + c->u.xdelta.last = out32[i] = d + c->u.xdelta.last; + } + + return 0; +} + +static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) { + return -1; +} + +int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, 
cram_block *in, char *out, int *out_size) { + return -1; +} + +static inline int16_t le_int2(int16_t i) { + int16_t s; + i16_to_le(i, (uint8_t *)&s); + return s; +} + +int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in, + char *out_, int *out_size) { + cram_block *out = (cram_block *)out_; + cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec); + int i = 0; + + const int w = c->u.xdelta.word_size; + uint32_t npad = (w - *out_size%w)%w; + uint32_t out_sz = *out_size + npad; + c->u.xdelta.last = 0; // reset for each new array + + for (i = 0; i < out_sz; i += w) { + uint16_t v; + // Need better interface + char *cp = (char *)b->data + b->byte; + char *cp_end = (char *)b->data + b->uncomp_size; + int err = 0; + v = c->vv->varint_get32(&cp, cp_end, &err); + if (err) + return -1; + b->byte = cp - (char *)b->data; + + switch(w) { + case 2: { + int16_t d = unzigzag16(v), z; + c->u.xdelta.last = d + c->u.xdelta.last; + z = le_int2(c->u.xdelta.last); + BLOCK_APPEND(out, &z, 2-npad); + npad = 0; + break; + } + default: + fprintf(stderr, "Unsupported word size by XDELTA\n"); + return -1; + } + } + + return 0; + + block_err: + return -1; +} + +void cram_xdelta_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xdelta.sub_codec) + c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec); + + free(c); +} + +int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) { + cram_xdelta_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} + +cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) { + cram_xdelta_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} + +cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + char *endp = data+size; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + 
c->codec = E_XDELTA; + if (option == E_LONG) + c->decode = cram_xdelta_decode_long; + else if (option == E_INT) + c->decode = cram_xdelta_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_xdelta_decode_char; + else if (option == E_BYTE_ARRAY_BLOCK) { + option = E_BYTE_ARRAY; + c->decode = cram_xdelta_decode_block; + } else + return NULL; + c->free = cram_xdelta_decode_free; + c->size = cram_xdelta_decode_size; + c->get_block = cram_xdelta_get_block; + + c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL); + c->u.xdelta.last = 0; + + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); + if (c->u.xdelta.sub_codec == NULL) + goto malformed; + cp += sub_size; + + if (cp - data != size) { + malformed: + fprintf(stderr, "Malformed xdelta header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_xdelta_encode_flush(cram_codec *c) { + int r = -1; + cram_block *b = cram_new_block(0, 0); + if (!b) + return -1; + + switch (c->u.e_xdelta.word_size) { + case 2: { + // Delta + zigzag transform. + // Subtracting two 8-bit values has a 9-bit result (-255 to 255). + // However think of it as turning a wheel clockwise or anti-clockwise. + // If it has 256 gradations then a -ve rotation followed by a +ve + // rotation of the same amount reverses it regardless. + // + // Similarly the zig-zag transformation doesn't invent any extra bits, + // so the entire thing can be done in-situ. This may permit faster + // SIMD loops if we break apart the steps. 
+ + // uint16_t last = 0, d; + // for (i = 0; i < n; i++) { + // d = io[i] - last; + // last = io[i]; + // io[i] = zigzag16(vd); + // } + + // --- vs --- + + // for (i = n-1; i >= 1; i--) + // io[i] -= io[i-1]; + // for (i = 0; i < n; i++) + // io[i] = zigzag16(io[i]); + + // varint: need array variant for speed here. + // With zig-zag + int i, n = BLOCK_SIZE(c->out)/2;; + uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0; + + if (n*2 < BLOCK_SIZE(c->out)) { + // half word + last = *(uint8_t *)dat; + c->vv->varint_put32_blk(b, zigzag16(last)); + dat = (uint16_t *)(((uint8_t *)dat)+1); + } + + for (i = 0; i < n; i++) { + uint16_t d = dat[i] - last; // possibly unaligned + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag16(d)); + } + + break; + } + + case 4: { + int i, n = BLOCK_SIZE(c->out)/4;; + uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0; + + for (i = 0; i < n; i++) { + uint32_t d = dat[i] - last; + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag32(d)); + } + + break; + } + + case 1: { + int i, n = BLOCK_SIZE(c->out);; + uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0; + + for (i = 0; i < n; i++) { + uint32_t d = dat[i] - last; + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag8(d)); + } + + break; + } + + default: + goto err; + } + + if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec, + (char *)b->data, b->byte)) + goto err; + + r = 0; + + err: + cram_free_block(b); + return r; + +} + +int cram_xdelta_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + // Store sub-codec + cram_codec *tc = c->u.e_xdelta.sub_codec; + cram_block *tb = cram_new_block(0, 0); + if (!tb) + return -1; + int len2 = tc->store(tc, tb, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + + // codec length + len += (n = c->vv->varint_put32_blk(b, 
c->vv->varint_size(c->u.e_xdelta.word_size) + + len2)); r |= n; + + // This and sub-codec + len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n; + BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb)); + + cram_free_block(tb); + + return r > 0 ? len + len2 : -1; + + block_err: + return -1; +} + +// Same as cram_beta_encode_long +int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return -1; +} + +int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return -1; +} + +int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + char *dat = malloc(in_size*5), *cp = dat, *cp_end = dat + in_size*5; + if (!dat) + return -1; + + c->u.e_xdelta.last = 0; // reset for each new array + switch(c->u.e_xdelta.word_size) { + case 2: { + int i, part; + + part = in_size%2; + if (part) { + uint16_t z = in[0]; + c->u.e_xdelta.last = le_int2(z); + cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last)); + } + + uint16_t *in16 = (uint16_t *)(in+part); + for (i = 0; i < in_size/2; i++) { + uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last; + c->u.e_xdelta.last = le_int2(in16[i]); + cp += c->vv->varint_put32(cp, cp_end, zigzag16(d)); + } + + break; + } + } + if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec, + (char *)dat, cp-dat)) { + free(dat); + return -1; + } + + free(dat); + return 0; +} + +void cram_xdelta_encode_free(cram_codec *c) { + if (!c) return; + + if (c->u.e_xdelta.sub_codec) + c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec); + + cram_free_block(c->out); + + free(c); +} + +cram_codec *cram_xdelta_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XDELTA; + c->free = cram_xdelta_encode_free; + if (option == E_LONG) + c->encode = cram_xdelta_encode_long; + else if (option == E_INT) 
+ c->encode = cram_xdelta_encode_int; + else + c->encode = cram_xdelta_encode_char; + c->store = cram_xdelta_encode_store; + c->flush = cram_xdelta_encode_flush; + + cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat; + c->u.e_xdelta.word_size = e->word_size; + c->u.e_xdelta.last = 0; + c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL, + E_BYTE_ARRAY, + e->sub_codec_dat, + version, vv); + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XRLE + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. + */ +int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // TODO if and when needed + return -1; +} + +int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // TODO if and when needed + return -1; +} + +// Expands an XRLE transform and caches result in slice->block_by_id[] +static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) { + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (b) + return 0; - for (i = 0; i < n; i++) - out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - } else { - for (i = 0; i < n; i++) - out_i[i] = -c->u.beta.offset; + b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0); + if (!b) + return -1; + cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec); + if (!lit_b) + return -1; + unsigned char *lit_dat = lit_b->data; + unsigned int lit_sz = lit_b->uncomp_size; + unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec); + + cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec); + if (!len_b) + return -1; + unsigned char *len_dat = len_b->data; + + uint8_t 
rle_syms[256]; + int rle_nsyms = 0; + int i; + for (i = 0; i < 256; i++) { + if (c->u.xrle.rep_score[i] > 0) + rle_syms[rle_nsyms++] = i; } + uint64_t out_sz; + int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz); + if (!(b->data = malloc(out_sz))) + return -1; + rle_decode(lit_dat, lit_sz, + len_dat+nb, len_sz-nb, + rle_syms, rle_nsyms, + b->data, &out_sz); + b->uncomp_size = out_sz; + return 0; } -int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int32_t *out_i = (int32_t *)out; - int i, n = *out_size; +int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) { + cram_xrle_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; +cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) { + cram_xrle_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} - for (i = 0; i < n; i++) - out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - } else { - for (i = 0; i < n; i++) - out_i[i] = -c->u.beta.offset; - } +int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int n = *out_size; + + cram_xrle_decode_expand_char(slice, c); + cram_block *b = slice->block_by_id[512 + c->codec_id]; + memcpy(out, b->data + b->idx, n); + b->idx += n; return 0; -} -int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int i, n = *out_size; + // Old code when not cached + while (n > 0) { + if (c->u.xrle.cur_len == 0) { + unsigned char lit; + int one = 1; + if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in, + (char *)&lit, &one) < 0) + return -1; + c->u.xrle.cur_lit = lit; + if (c->u.xrle.rep_score[lit] > 0) { + if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in, + (char *)&c->u.xrle.cur_len, &one) < 0) + return -1; + } // else 
cur_len still zero + //else fprintf(stderr, "%d\n", lit); - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; + c->u.xrle.cur_len++; + } - if (out) - for (i = 0; i < n; i++) - out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - else - for (i = 0; i < n; i++) - get_bits_MSB(in, c->u.beta.nbits); - } else { - if (out) - for (i = 0; i < n; i++) - out[i] = -c->u.beta.offset; + if (n >= c->u.xrle.cur_len) { + memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len); + out += c->u.xrle.cur_len; + n -= c->u.xrle.cur_len; + c->u.xrle.cur_len = 0; + } else { + memset(out, c->u.xrle.cur_lit, n); + out += n; + c->u.xrle.cur_len -= n; + n = 0; + } } return 0; } -void cram_beta_decode_free(cram_codec *c) { - if (c) - free(c); +void cram_xrle_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xrle.len_codec) + c->u.xrle.len_codec->free(c->u.xrle.len_codec); + + if (c->u.xrle.lit_codec) + c->u.xrle.lit_codec->free(c->u.xrle.lit_codec); + + free(c); } -cram_codec *cram_beta_decode_init(char *data, int size, +cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; + char *endp = data+size; + int err = 0; if (!(c = malloc(sizeof(*c)))) return NULL; - c->codec = E_BETA; - if (option == E_INT) - c->decode = cram_beta_decode_int; - else if (option == E_LONG) - c->decode = cram_beta_decode_long; + c->codec = E_XRLE; + if (option == E_LONG) + c->decode = cram_xrle_decode_long; + else if (option == E_INT) + c->decode = cram_xrle_decode_int; else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->decode = cram_beta_decode_char; + c->decode = cram_xrle_decode_char; else { - hts_log_error("BYTE_ARRAYs not supported by this codec"); - free(c); + fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); return NULL; } - c->free = cram_beta_decode_free; - - c->u.beta.nbits = -1; - cp += 
safe_itf8_get(cp, data + size, &c->u.beta.offset); - if (cp < data + size) // Ensure test below works - cp += safe_itf8_get(cp, data + size, &c->u.beta.nbits); + c->free = cram_xrle_decode_free; + c->size = cram_xrle_decode_size; + c->get_block = cram_xrle_get_block; + c->u.xrle.cur_len = 0; + c->u.xrle.cur_lit = -1; - if (cp - data != size - || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) { - hts_log_error("Malformed beta header stream"); - free(c); - return NULL; + // RLE map + int i, j, nrle = vv->varint_get32(&cp, endp, &err); + memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score)); + for (i = 0; i < nrle && i < 256; i++) { + j = vv->varint_get32(&cp, endp, &err); + if (j >= 0 && j < 256) + c->u.xrle.rep_score[j] = 1; } + // Length and literal sub encodings + c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err); + int sub_size = vv->varint_get32(&cp, endp, &err); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding, + cp, sub_size, E_INT, version, vv); + if (c->u.xrle.len_codec == NULL) + goto malformed; + cp += sub_size; + + c->u.xrle.lit_encoding = vv->varint_get32(&cp, endp, &err); + sub_size = vv->varint_get32(&cp, endp, &err); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding, + cp, sub_size, option, version, vv); + if (c->u.xrle.lit_codec == NULL) + goto malformed; + cp += sub_size; + + if (err) + goto malformed; + return c; + + malformed: + fprintf(stderr, "Malformed xrle header stream\n"); + free(c); + return NULL; } -int cram_beta_encode_store(cram_codec *c, cram_block *b, - char *prefix, int version) { +int cram_xrle_encode_flush(cram_codec *c) { + uint8_t *out_lit, *out_len; + uint64_t out_lit_size, out_len_size; + uint8_t rle_syms[256]; + int rle_nsyms = 0, i; + + for (i = 0; i < 256; i++) + if (c->u.e_xrle.rep_score[i] > 0) + rle_syms[rle_nsyms++] = i; + + if 
(!c->u.e_xrle.to_flush) { + c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out); + c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out); + } + + out_len = malloc(c->u.e_xrle.to_flush_size+8); + if (!out_len) + return -1; + + int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size); + + out_lit = rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size, + out_len+nb, &out_len_size, + rle_syms, &rle_nsyms, + NULL, &out_lit_size); + out_len_size += nb; + + + // TODO: can maybe "gift" the sub codec the data block, to remove + // one level of memcpy. + if (c->u.e_xrle.len_codec->encode(NULL, + c->u.e_xrle.len_codec, + (char *)out_len, out_len_size)) + return -1; + + if (c->u.e_xrle.lit_codec->encode(NULL, + c->u.e_xrle.lit_codec, + (char *)out_lit, out_lit_size)) + return -1; + + free(out_len); + free(out_lit); + + return 0; +} + +int cram_xrle_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { int len = 0, r = 0, n; + cram_codec *tc; + cram_block *b_rle, *b_len, *b_lit; if (prefix) { size_t l = strlen(prefix); @@ -634,118 +1811,134 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b, len += l; } - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, itf8_size(c->u.e_beta.offset) - + itf8_size(c->u.e_beta.nbits))); // codec length - r |= n; - len += (n = itf8_put_blk(b, c->u.e_beta.offset)); r |= n; - len += (n = itf8_put_blk(b, c->u.e_beta.nbits)); r |= n; + // List of symbols to RLE + b_rle = cram_new_block(0, 0); + if (!b_rle) + return -1; + int i, nrle = 0, len1 = 0; + for (i = 0; i < 256; i++) { + if (c->u.e_xrle.rep_score[i] > 0) { + nrle++; + len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n; + } + } - if (r > 0) return len; + // Store length and literal sub-codecs to get encoded length + tc = c->u.e_xrle.len_codec; + b_len = cram_new_block(0, 0); + if (!b_len) + return -1; + int len2 = tc->store(tc, b_len, NULL, version); + + tc = c->u.e_xrle.lit_codec; + b_lit = cram_new_block(0, 0); + if 
(!b_lit) + return -1; + int len3 = tc->store(tc, b_lit, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3 + + c->vv->varint_size(nrle))); r |= n; + len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n; + BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle)); + BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); + BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit)); + + cram_free_block(b_rle); + cram_free_block(b_len); + cram_free_block(b_lit); + + if (r > 0) + return len + len1 + len2 + len3; block_err: return -1; } -int cram_beta_encode_long(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - int64_t *syms = (int64_t *)in; - int i, r = 0; - - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); +int cram_xrle_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + // TODO if and when needed + return -1; +} - return r; +int cram_xrle_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + // TODO if and when needed + return -1; } -int cram_beta_encode_int(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - int *syms = (int *)in; - int i, r = 0; +int cram_xrle_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + if (c->u.e_xrle.to_flush) { + if (!c->out && !(c->out = cram_new_block(0, 0))) + return -1; + BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size); + c->u.e_xrle.to_flush = NULL; + c->u.e_xrle.to_flush_size = 0; + } - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); + if (c->out && BLOCK_SIZE(c->out) > 0) { + // Gathering data + BLOCK_APPEND(c->out, in, in_size); + return 0; + } - return r; + // else cache copy of the data we're about to send to flush instead. 
+ c->u.e_xrle.to_flush = in; + c->u.e_xrle.to_flush_size = in_size; + return 0; + + block_err: + return -1; } -int cram_beta_encode_char(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - unsigned char *syms = (unsigned char *)in; - int i, r = 0; +void cram_xrle_encode_free(cram_codec *c) { + if (!c) return; - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); + if (c->u.e_xrle.len_codec) + c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec); + if (c->u.e_xrle.lit_codec) + c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec); - return r; -} + cram_free_block(c->out); -void cram_beta_encode_free(cram_codec *c) { - if (c) free(c); + free(c); } -cram_codec *cram_beta_encode_init(cram_stats *st, - enum cram_external_type option, - void *dat, - int version) { +cram_codec *cram_xrle_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { cram_codec *c; - int min_val, max_val, len = 0; - int64_t range; - c = malloc(sizeof(*c)); - if (!c) + if (!(c = malloc(sizeof(*c)))) return NULL; - c->codec = E_BETA; - c->free = cram_beta_encode_free; - if (option == E_INT) - c->encode = cram_beta_encode_int; - else if (option == E_LONG) - c->encode = cram_beta_encode_long; - else - c->encode = cram_beta_encode_char; - c->store = cram_beta_encode_store; - if (dat) { - min_val = ((int *)dat)[0]; - max_val = ((int *)dat)[1]; - } else { - min_val = INT_MAX; - max_val = INT_MIN; - int i; - for (i = 0; i < MAX_STAT_VAL; i++) { - if (!st->freqs[i]) - continue; - if (min_val > i) - min_val = i; - max_val = i; - } - if (st->h) { - khint_t k; + c->codec = E_XRLE; + c->free = cram_xrle_encode_free; + if (option == E_LONG) + c->encode = cram_xrle_encode_long; + else if (option == E_INT) + c->encode = cram_xrle_encode_int; + else + c->encode = cram_xrle_encode_char; + c->store = cram_xrle_encode_store; + c->flush = cram_xrle_encode_flush; - for (k = kh_begin(st->h); k != 
kh_end(st->h); k++) { - if (!kh_exist(st->h, k)) - continue; + cram_xrle_encoder *e = (cram_xrle_encoder *)dat; - i = kh_key(st->h, k); - if (min_val > i) - min_val = i; - if (max_val < i) - max_val = i; - } - } - } + c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL, + E_BYTE, e->len_dat, + version, vv); + c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL, + E_BYTE, e->lit_dat, + version, vv); + c->u.e_xrle.cur_lit = -1; + c->u.e_xrle.cur_len = -1; + c->u.e_xrle.to_flush = NULL; + c->u.e_xrle.to_flush_size = 0; - assert(max_val >= min_val); - c->u.e_beta.offset = -min_val; - range = (int64_t) max_val - min_val; - while (range) { - len++; - range >>= 1; - } - c->u.e_beta.nbits = len; + memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score)); return c; } @@ -803,9 +1996,10 @@ void cram_subexp_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_subexp_decode_init(char *data, int size, +cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; @@ -822,8 +2016,8 @@ cram_codec *cram_subexp_decode_init(char *data, int size, c->free = cram_subexp_decode_free; c->u.subexp.k = -1; - cp += safe_itf8_get(cp, data + size, &c->u.subexp.offset); - cp += safe_itf8_get(cp, data + size, &c->u.subexp.k); + c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL); + c->u.subexp.k = vv->varint_get32(&cp, data + size, NULL); if (cp - data != size || c->u.subexp.k < 0) { hts_log_error("Malformed subexp header stream"); @@ -867,9 +2061,10 @@ void cram_gamma_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_gamma_decode_init(char *data, int size, +cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; char *cp = data; @@ -888,7 
+2083,7 @@ cram_codec *cram_gamma_decode_init(char *data, int size, c->decode = cram_gamma_decode; c->free = cram_gamma_decode_free; - cp += safe_itf8_get(cp, data + size, &c->u.gamma.offset); + c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL); if (cp - data != size) goto malformed; @@ -1082,9 +2277,10 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, /* * Initialises a huffman decoder from an encoding data stream. */ -cram_codec *cram_huffman_decode_init(char *data, int size, +cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { int32_t ncodes = 0, i, j; char *cp = data, *data_end = &data[size]; cram_codec *h; @@ -1092,14 +2288,14 @@ cram_codec *cram_huffman_decode_init(char *data, int size, int32_t val, last_len, max_len = 0; uint32_t max_val; // needs one more bit than val const int max_code_bits = sizeof(val) * 8 - 1; - int l; + int err = 0; if (option == E_BYTE_ARRAY_BLOCK) { hts_log_error("BYTE_ARRAYs not supported by this codec"); return NULL; } - cp += safe_itf8_get(cp, data_end, &ncodes); + ncodes = vv->varint_get32(&cp, data_end, &err); if (ncodes < 0) { hts_log_error("Invalid number of symbols in huffman stream"); return NULL; @@ -1129,21 +2325,26 @@ cram_codec *cram_huffman_decode_init(char *data, int size, /* Read symbols and bit-lengths */ if (option == E_LONG) { - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - l = safe_ltf8_get(cp, data_end, &codes[i].symbol); - } + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get64(&cp, data_end, &err); + } else if (option == E_SLONG) { + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get64s(&cp, data_end, &err); + } else if (option == E_INT || option == E_BYTE) { + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get32(&cp, data_end, &err); + } else if (option == E_SINT) { + for (i = 0; i < ncodes; i++) + 
codes[i].symbol = vv->varint_get32s(&cp, data_end, &err); } else { - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - int32_t i32; - l = safe_itf8_get(cp, data_end, &i32); - codes[i].symbol = i32; - } + free(h); + return NULL; } - if (l < 1) + if (err) goto malformed; - cp += safe_itf8_get(cp, data_end, &i); + i = vv->varint_get32(&cp, data_end, &err); if (i != ncodes) goto malformed; @@ -1154,9 +2355,9 @@ cram_codec *cram_huffman_decode_init(char *data, int size, return h; } - for (i = 0, l = 1; i < ncodes; i++, cp += l) { - l = safe_itf8_get(cp, data_end, &codes[i].len); - if (l < 1) + for (i = 0; i < ncodes; i++) { + codes[i].len = vv->varint_get32(&cp, data_end, &err); + if (err) break; if (codes[i].len < 0) { hts_log_error("Huffman code length (%d) is negative", codes[i].len); @@ -1165,7 +2366,7 @@ cram_codec *cram_huffman_decode_init(char *data, int size, if (max_len < codes[i].len) max_len = codes[i].len; } - if (l < 1 || cp - data != size || max_len >= ncodes) + if (err || cp - data != size || max_len >= ncodes) goto malformed; /* 31 is max. 
bits available in val */ @@ -1229,12 +2430,12 @@ cram_codec *cram_huffman_decode_init(char *data, int size, h->decode = cram_huffman_decode_char0; else h->decode = cram_huffman_decode_char; - } else if (option == E_LONG) { + } else if (option == E_LONG || option == E_SLONG) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_long0; else h->decode = cram_huffman_decode_long; - } else if (option == E_INT) { + } else if (option == E_INT || option == E_SINT || option == E_BYTE) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_int0; else @@ -1389,7 +2590,7 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory */ char *tmp = malloc(6*c->u.e_huffman.nvals+16); - char *tp = tmp; + char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16; if (!tmp) return -1; @@ -1400,24 +2601,33 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->u.e_huffman.nvals); + tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals); if (c->u.e_huffman.option == E_LONG) { for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += ltf8_put(tp, codes[i].symbol); + tp += c->vv->varint_put64(tp, tpend, codes[i].symbol); } - } else { + } else if (c->u.e_huffman.option == E_SLONG) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol); + } + } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += c->vv->varint_put32(tp, tpend, codes[i].symbol); + } + } else if (c->u.e_huffman.option == E_SINT) { for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].symbol); + tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol); } + } else { + return -1; } - tp += itf8_put(tp, c->u.e_huffman.nvals); - for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].len); - } + tp += 
c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals); + for (i = 0; i < c->u.e_huffman.nvals; i++) + tp += c->vv->varint_put32(tp, tpend, codes[i].len); - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, tp-tmp)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; @@ -1433,7 +2643,7 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, cram_codec *cram_huffman_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens = NULL, code, len; int *new_vals, *new_freqs; int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; @@ -1594,18 +2804,21 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, c->encode = cram_huffman_encode_char0; else c->encode = cram_huffman_encode_char; - } else if (option == E_INT) { + } else if (option == E_INT || option == E_SINT) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_int0; else c->encode = cram_huffman_encode_int; - } else if (option == E_LONG) { + } else if (option == E_LONG || option == E_SLONG) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_long0; else c->encode = cram_huffman_encode_long; + } else { + return NULL; } c->store = cram_huffman_encode_store; + c->flush = NULL; return c; @@ -1658,14 +2871,13 @@ void cram_byte_array_len_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_byte_array_len_decode_init(char *data, int size, +cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; char *endp = data + size; - int32_t encoding = 0; - int32_t sub_size = -1; if (!(c = malloc(sizeof(*c)))) return NULL; @@ -1676,23 +2888,22 @@ 
cram_codec *cram_byte_array_len_decode_init(char *data, int size, c->u.byte_array_len.len_codec = NULL; c->u.byte_array_len.val_codec = NULL; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &sub_size); + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->u.byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size, - E_INT, version); + c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + E_INT, version, vv); if (c->u.byte_array_len.len_codec == NULL) goto no_codec; cp += sub_size; - sub_size = -1; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &sub_size); + encoding = vv->varint_get32(&cp, endp, NULL); + sub_size = vv->varint_get32(&cp, endp, NULL); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->u.byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size, - option, version); + c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); if (c->u.byte_array_len.val_codec == NULL) goto no_codec; cp += sub_size; @@ -1760,8 +2971,8 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, len3 = tc->store(tc, b_val, NULL, version); if (len3 < 0) goto block_err; - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, len2+len3)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n; BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val)); @@ -1780,7 +2991,7 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, cram_codec *cram_byte_array_len_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; cram_byte_array_len_encoder *e = 
(cram_byte_array_len_encoder *)dat; @@ -1791,15 +3002,16 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, c->free = cram_byte_array_len_encode_free; c->encode = cram_byte_array_len_encode; c->store = cram_byte_array_len_encode_store; + c->flush = NULL; c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding, st, E_INT, e->len_dat, - version); + version, vv); c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding, NULL, E_BYTE_ARRAY, e->val_dat, - version); + version, vv); if (!c->u.e_byte_array_len.len_codec || !c->u.e_byte_array_len.val_codec) { @@ -1896,11 +3108,13 @@ void cram_byte_array_stop_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_byte_array_stop_decode_init(char *data, int size, +cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; unsigned char *cp = (unsigned char *)data; + int err = 0; if (size < (CRAM_MAJOR_VERS(version) == 1 ? 
5 : 2)) goto malformed; @@ -1929,11 +3143,10 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, + ((unsigned int) cp[3]<<24); cp += 4; } else { - cp += safe_itf8_get((char *) cp, data + size, - &c->u.byte_array_stop.content_id); + c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err); } - if ((char *)cp - data != size) + if ((char *)cp - data != size || err) goto malformed; return c; @@ -1971,19 +3184,20 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, len += l; } - cp += itf8_put(cp, c->codec); + cp += c->vv->varint_put32(cp, buf+20, c->codec); if (CRAM_MAJOR_VERS(version) == 1) { - cp += itf8_put(cp, 5); + cp += c->vv->varint_put32(cp, buf+20, 5); *cp++ = c->u.e_byte_array_stop.stop; *cp++ = (c->u.e_byte_array_stop.content_id >> 0) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 8) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff; } else { - cp += itf8_put(cp, 1 + itf8_size(c->u.e_byte_array_stop.content_id)); + cp += c->vv->varint_put32(cp, buf+20, 1 + + c->vv->varint_size(c->u.e_byte_array_stop.content_id)); *cp++ = c->u.e_byte_array_stop.stop; - cp += itf8_put(cp, c->u.e_byte_array_stop.content_id); + cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id); } BLOCK_APPEND(b, buf, cp-buf); @@ -1998,7 +3212,7 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; c = malloc(sizeof(*c)); @@ -2008,6 +3222,7 @@ cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, c->free = cram_byte_array_stop_encode_free; c->encode = cram_byte_array_stop_encode; c->store = cram_byte_array_stop_encode_store; + c->flush = NULL; c->u.e_byte_array_stop.stop = ((int *)dat)[0]; c->u.e_byte_array_stop.content_id = ((int *)dat)[1]; @@ 
-2036,10 +3251,11 @@ const char *cram_encoding2str(enum cram_encoding t) { } } -static cram_codec *(*decode_init[])(char *data, +static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) = { + int version, varint_vec *vv) = { NULL, cram_external_decode_init, NULL, @@ -2052,12 +3268,18 @@ static cram_codec *(*decode_init[])(char *data, cram_gamma_decode_init, }; -cram_codec *cram_decoder_init(enum cram_encoding codec, +cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, + enum cram_encoding codec, char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) { - return decode_init[codec](data, size, option, version); + cram_codec *r = decode_init[codec](hdr, data, size, option, version, vv); + if (r) { + r->vv = vv; + r->codec_id = hdr->ncodecs++; + } + return r; } else { hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec)); return NULL; @@ -2067,7 +3289,7 @@ cram_codec *cram_decoder_init(enum cram_encoding codec, static cram_codec *(*encode_init[])(cram_stats *stx, enum cram_external_type option, void *opt, - int version) = { + int version, varint_vec *vv) = { NULL, cram_external_encode_init, NULL, @@ -2084,14 +3306,19 @@ cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { if (st && !st->nvals) return NULL; if (encode_init[codec]) { cram_codec *r; - if ((r = encode_init[codec](st, option, dat, version))) + if ((r = encode_init[codec](st, option, dat, version, vv))) r->out = NULL; + if (!r) { + hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec)); + return NULL; + } + r->vv = vv; return r; } else { hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec)); @@ -2167,6 +3394,8 @@ int 
cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->encode = cram_external_encode_long; else if (c->decode == cram_external_decode_char) c->encode = cram_external_encode_char; + else if (c->decode == cram_external_decode_block) + c->encode = cram_external_encode_char; else return -1; break; @@ -2223,6 +3452,26 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return -1; break; + case E_XPACK: { + // shares struct with decode + cram_codec t = *c; + t.free = cram_xpack_encode_free; + t.store = cram_xpack_encode_store; + if (t.decode == cram_xpack_decode_long) + t.encode = cram_xpack_encode_long; + else if (t.decode == cram_xpack_decode_int) + t.encode = cram_xpack_encode_int; + else if (t.decode == cram_xpack_decode_char) + t.encode = cram_xpack_encode_char; + else + return -1; + t.u.e_xpack.sub_codec = t.u.xpack.sub_codec; + if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1) + return -1; + *c = t; + break; + } + case E_BYTE_ARRAY_LEN: { cram_codec *t = malloc(sizeof(*t)); if (!t) return -1; diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 31a170031..850a2a92e 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2015, 2018 Genome Research Ltd. +Copyright (c) 2012-2015, 2018, 2020 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -73,6 +73,46 @@ typedef struct { int32_t nbits; } cram_beta_decoder; +// A PACK transform, packing multiple values into a single byte +typedef struct { + int32_t nbits; + enum cram_encoding sub_encoding; + void *sub_codec_dat; + struct cram_codec *sub_codec; + int nval; // number of items in maps + uint32_t rmap[256]; // 0,1,2,3 -> P,A,C,K + int map[256]; // P,A,C,K -> 0,1,2,3 // NB: max input is uint8_tb? Or use hash? +} cram_xpack_decoder; +typedef cram_xpack_decoder cram_xpack_encoder; + +// Transforms symbols X,Y,Z to bytes 0,1,2. 
+typedef struct { + enum cram_encoding len_encoding; + enum cram_encoding lit_encoding; + void *len_dat; + void *lit_dat; + struct cram_codec *len_codec; + struct cram_codec *lit_codec; + int cur_len; + int cur_lit; + int rep_score[256]; + char *to_flush; + size_t to_flush_size; +} cram_xrle_decoder; +typedef cram_xrle_decoder cram_xrle_encoder; + +// DELTA + zigzag + varint encoding +typedef struct { + // FIXME: define endian here too. Require little endian? + int64_t last; + uint8_t word_size; // 1, 2, 4, 8 + //uint8_t sign; // true if input data is already signed + enum cram_encoding sub_encoding; + void *sub_codec_dat; + struct cram_codec *sub_codec; +} cram_xdelta_decoder; +typedef cram_xdelta_decoder cram_xdelta_encoder; + typedef struct { int32_t offset; } cram_gamma_decoder; @@ -112,6 +152,8 @@ typedef struct { typedef struct cram_codec { enum cram_encoding codec; cram_block *out; + varint_vec *vv; + int codec_id; void (*free)(struct cram_codec *codec); int (*decode)(cram_slice *slice, struct cram_codec *codec, cram_block *in, char *out, int *out_size); @@ -119,6 +161,9 @@ typedef struct cram_codec { char *in, int in_size); int (*store)(struct cram_codec *codec, cram_block *b, char *prefix, int version); + int (*size)(cram_slice *slice, struct cram_codec *codec); + int (*flush)(struct cram_codec *codec); + cram_block *(*get_block)(cram_slice *slice, struct cram_codec *codec); union { cram_huffman_decoder huffman; @@ -128,23 +173,30 @@ typedef struct cram_codec { cram_subexp_decoder subexp; cram_byte_array_len_decoder byte_array_len; cram_byte_array_stop_decoder byte_array_stop; + cram_xpack_decoder xpack; + cram_xrle_decoder xrle; + cram_xdelta_decoder xdelta; cram_huffman_encoder e_huffman; cram_external_decoder e_external; cram_byte_array_stop_decoder e_byte_array_stop; cram_byte_array_len_encoder e_byte_array_len; cram_beta_decoder e_beta; + cram_xpack_decoder e_xpack; + cram_xrle_decoder e_xrle; + cram_xdelta_decoder e_xdelta; } u; } cram_codec; const 
char *cram_encoding2str(enum cram_encoding t); -cram_codec *cram_decoder_init(enum cram_encoding codec, char *data, int size, +cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, + enum cram_encoding codec, char *data, int size, enum cram_external_type option, - int version); + int version, varint_vec *vv); cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st, enum cram_external_type option, void *dat, - int version); + int version, varint_vec *vv); //int cram_decode(void *codes, char *in, int in_size, char *out, int *out_size); //void cram_decoder_free(void *codes); diff --git a/cram/cram_decode.c b/cram/cram_decode.c index ad09fb757..06e297331 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -66,12 +66,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Returns number of bytes decoded on success * -1 on failure */ -int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { +int cram_decode_TD(cram_fd *fd, char *cp, const char *endp, + cram_block_compression_hdr *h) { char *op = cp; unsigned char *dat; cram_block *b; int32_t blk_size = 0; - int nTL, i, sz; + int nTL, i, sz, err = 0; if (!(b = cram_new_block(0, 0))) return -1; @@ -85,14 +86,14 @@ int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { } /* Decode */ - cp += safe_itf8_get(cp, endp, &blk_size); + blk_size = fd->vv.varint_get32(&cp, endp, &err); if (!blk_size) { h->nTL = 0; cram_free_block(b); return cp - op; } - if (blk_size < 0 || endp - cp < blk_size) { + if (err || blk_size < 0 || endp - cp < blk_size) { cram_free_block(b); return -1; } @@ -143,7 +144,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, cram_block *b) { char *cp, *endp, *cp_copy; cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); - int i; + int i, err = 0; int32_t map_size = 0, map_count = 0; if (!hdr) @@ -160,27 +161,16 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd 
*fd, endp = cp + b->uncomp_size; if (CRAM_MAJOR_VERS(fd->version) == 1) { - int32_t i32; - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id); -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. - */ -#ifdef LARGE_POS - cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_start); - cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_span); -#else - cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32; - cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32; -#endif - cp += safe_itf8_get(cp, endp, &hdr->num_records); - cp += safe_itf8_get(cp, endp, &hdr->num_landmarks); + hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err); + hdr->ref_seq_span = fd->vv.varint_get64(&cp, endp, &err); + } else { + hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err); + hdr->ref_seq_span = fd->vv.varint_get32(&cp, endp, &err); + } + hdr->num_records = fd->vv.varint_get32(&cp, endp, &err); + hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err); if (hdr->num_landmarks < 0 || hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) || endp - cp < hdr->num_landmarks) { @@ -191,9 +181,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, free(hdr); return NULL; } - for (i = 0; i < hdr->num_landmarks; i++) { - cp += safe_itf8_get(cp, endp, &hdr->landmark[i]); - } + for (i = 0; i < hdr->num_landmarks; i++) + hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);; } hdr->preservation_map = kh_init(map); @@ -211,11 +200,12 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, /* Initialise defaults for preservation map */ 
hdr->read_names_included = 0; hdr->AP_delta = 1; + hdr->qs_seq_orient = 1; memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20); /* Preservation map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); for (i = 0; i < map_count; i++) { pmap_t hd; khint_t k; @@ -269,6 +259,18 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, hdr->no_ref = !hd.i; break; + case CRAM_KEY('Q','O'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "QO", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->qs_seq_orient = hd.i; + break; + case CRAM_KEY('S','M'): if (endp - cp < 5) { cram_free_compression_header(hdr); @@ -311,7 +313,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, break; case CRAM_KEY('T','D'): { - int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary + int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary if (sz < 0) { cram_free_compression_header(hdr); return NULL; @@ -342,8 +344,9 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } /* Record encoding map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); + int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 
1 : 0; for (i = 0; i < map_count; i++) { char *key = cp; int32_t encoding = E_NULL; @@ -359,8 +362,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } cp += 2; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &size); + encoding = fd->vv.varint_get32(&cp, endp, &err); + size = fd->vv.varint_get32(&cp, endp, &err); offset = cp - (char *)b->data; @@ -394,31 +397,20 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { ds_id = DS_AP; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? E_SLONG : E_INT; } else if (key[0] == 'R' && key[1] == 'G') { - ds_id = DS_RG; type = E_INT; + ds_id = DS_RG; + type = is_v4 ? E_SINT : E_INT; } else if (key[0] == 'M' && key[1] == 'F') { ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { ds_id = DS_NP; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? E_LONG : E_INT; } else if (key[0] == 'T' && key[1] == 'S') { ds_id = DS_TS; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? 
E_SLONG : E_INT; } else if (key[0] == 'N' && key[1] == 'F') { ds_id = DS_NF; type = E_INT; } else if (key[0] == 'T' && key[1] == 'C') { @@ -471,8 +463,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, key); hdr->codecs[ds_id]->free(hdr->codecs[ds_id]); } - hdr->codecs[ds_id] = cram_decoder_init(encoding, cp, size, - type, fd->version); + hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size, + type, fd->version, &fd->vv); if (!hdr->codecs[ds_id]) { cram_free_compression_header(hdr); return NULL; @@ -502,13 +494,13 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } /* Tag encoding map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); for (i = 0; i < map_count; i++) { int32_t encoding = E_NULL; int32_t size = 0; cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc - uint8_t *key; + uint8_t key[3]; if (!m || endp - cp < 6) { free(m); @@ -516,19 +508,19 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, return NULL; } - key = (uint8_t *) cp + 1; - m->key = (key[0]<<16)|(key[1]<<8)|key[2]; - - cp += 4; // Strictly ITF8, but this suffices - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &size); + m->key = fd->vv.varint_get32(&cp, endp, &err); + key[0] = m->key>>16; + key[1] = m->key>>8; + key[2] = m->key; + encoding = fd->vv.varint_get32(&cp, endp, &err); + size = fd->vv.varint_get32(&cp, endp, &err); m->encoding = encoding; m->size = size; m->offset = cp - (char *)b->data; if (size < 0 || endp - cp < size || - !(m->codec = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY_BLOCK, fd->version))) { + !(m->codec = cram_decoder_init(hdr, encoding, cp, size, + E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) { cram_free_compression_header(hdr); free(m); return NULL; @@ -539,7 +531,7 @@ 
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])]; hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m; } - if (cp - cp_copy != map_size) { + if (err || cp - cp_copy != map_size) { cram_free_compression_header(hdr); return NULL; } @@ -955,7 +947,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { cram_block_slice_hdr *hdr; unsigned char *cp; unsigned char *cp_end; - int i; + int i, err = 0; if (b->method != RAW) { /* Spec. says slice header should be RAW, but we can future-proof @@ -976,17 +968,14 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { hdr->content_type = b->content_type; if (b->content_type == MAPPED_SLICE) { - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id); -#ifdef LARGE_POS - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); -#else - int32_t i32; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->ref_seq_start = i32; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->ref_seq_span = i32; -#endif + hdr->ref_seq_id = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_span = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); + } else { + hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_span = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + } if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) { free(hdr); hts_log_error("Negative values not permitted for header " @@ -994,19 +983,15 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { return NULL; } } - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_records); + hdr->num_records = 
fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); hdr->record_counter = 0; if (CRAM_MAJOR_VERS(fd->version) == 2) { - int32_t i32 = 0; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->record_counter = i32; + hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->record_counter); + hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); } - - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_blocks); - - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_content_ids); + hdr->num_blocks = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); if (hdr->num_content_ids < 1 || hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) { /* Slice must have at least one data block, @@ -1020,20 +1005,18 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { return NULL; } - for (i = 0; i < hdr->num_content_ids; i++) { - int l = safe_itf8_get((char *)cp, (char *)cp_end, - &hdr->block_content_ids[i]); - if (l <= 0) { - free(hdr->block_content_ids); - free(hdr); - return NULL; - } - cp += l; + for (i = 0; i < hdr->num_content_ids; i++) + hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp, + (char *)cp_end, + &err); + if (err) { + free(hdr->block_content_ids); + free(hdr); + return NULL; } - if (b->content_type == MAPPED_SLICE) { - cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->ref_base_id); - } + if (b->content_type == MAPPED_SLICE) + hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); if (CRAM_MAJOR_VERS(fd->version) != 1) { if (cp_end - cp < 16) { @@ -1046,7 +1029,11 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { memset(hdr->md5, 0, 16); } - return hdr; + if (!err) + return hdr; + + free(hdr); + return NULL; } @@ 
-1113,8 +1100,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, uint32_t nm = 0; int32_t md_dist = 0; int orig_aux = 0; - int decode_md = s->decode_md && s->ref && !has_MD && cr->ref_id >= 0; - int decode_nm = s->decode_md && s->ref && !has_NM && cr->ref_id >= 0; + // CRAM < 4.0 decode_md is off/on + // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise) + int do_md = CRAM_MAJOR_VERS(fd->version) >= 4 + ? (s->decode_md > 0) + : (s->decode_md != 0); + int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0); + int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0); uint32_t ds = s->data_series; sam_hrecs_t *bfd = sh->hrecs; @@ -1127,7 +1119,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (decode_md) { orig_aux = BLOCK_SIZE(s->aux_blk); - BLOCK_APPEND(s->aux_blk, "MDZ", 3); + if (has_MD == 0) + BLOCK_APPEND(s->aux_blk, "MDZ", 3); } if (ds & CRAM_FN) { @@ -1807,32 +1800,67 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (decode_md) { BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z: - cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux; + size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux; + if (has_MD < 0) { + // has_MD < 0; already have MDZ allocated in aux at -has_MD, + // but wrote MD to end of aux (at orig_aux). + // We need some memmoves to shuffle it around. 
+ char tmp_MD_[1024], *tmp_MD = tmp_MD_; + unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux; + if (sz > 1024) { + tmp_MD = malloc(sz); + if (!tmp_MD) + return -1; + } + memcpy(tmp_MD, orig_aux_p, sz); + memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz, + &BLOCK_DATA(s->aux_blk)[-has_MD], + orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]); + memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz); + if (tmp_MD != tmp_MD_) + free(tmp_MD); + + if (-has_NM > -has_MD) + // we inserted before NM, so move it up a bit + has_NM -= sz; + } + // else has_MD == 0 and we've already appended MD to the end. + + cr->aux_size += sz; } if (decode_nm) { - char buf[7]; - size_t buf_size; - buf[0] = 'N'; buf[1] = 'M'; - if (nm <= UINT8_MAX) { - buf_size = 4; - buf[2] = 'C'; - buf[3] = (nm>> 0) & 0xff; - } else if (nm <= UINT16_MAX) { - buf_size = 5; - buf[2] = 'S'; - buf[3] = (nm>> 0) & 0xff; - buf[4] = (nm>> 8) & 0xff; + if (has_NM == 0) { + char buf[7]; + size_t buf_size; + buf[0] = 'N'; buf[1] = 'M'; + if (nm <= UINT8_MAX) { + buf_size = 4; + buf[2] = 'C'; + buf[3] = (nm>> 0) & 0xff; + } else if (nm <= UINT16_MAX) { + buf_size = 5; + buf[2] = 'S'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + } else { + buf_size = 7; + buf[2] = 'I'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + buf[5] = (nm>>16) & 0xff; + buf[6] = (nm>>24) & 0xff; + } + BLOCK_APPEND(s->aux_blk, buf, buf_size); + cr->aux_size += buf_size; } else { - buf_size = 7; - buf[2] = 'I'; - buf[3] = (nm>> 0) & 0xff; - buf[4] = (nm>> 8) & 0xff; - buf[5] = (nm>>16) & 0xff; - buf[6] = (nm>>24) & 0xff; + // Preallocated space for NM at -has_NM into aux block + unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM; + buf[0] = (nm>> 0) & 0xff; + buf[1] = (nm>> 8) & 0xff; + buf[2] = (nm>>16) & 0xff; + buf[3] = (nm>>24) & 0xff; } - BLOCK_APPEND(s->aux_blk, buf, buf_size); - cr->aux_size += buf_size; } return r; @@ -1914,7 +1942,11 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, return 
-1; } -static int cram_decode_aux(cram_container *c, cram_slice *s, +// has_MD and has_NM are filled out with 0 for none present, +// 1 for present and verbatim, and -pos for present as placeholder +// (MD*, NM*) to be generated and filled out at offset +pos. +static int cram_decode_aux(cram_fd *fd, + cram_container *c, cram_slice *s, cram_block *blk, cram_record *cr, int *has_MD, int *has_NM) { int i, r = 0, out_sz = 1; @@ -1946,29 +1978,64 @@ static int cram_decode_aux(cram_container *c, cram_slice *s, for (i = 0; i < cr->ntags; i++) { int32_t id, out_sz = 1; - unsigned char tag_data[3]; + unsigned char tag_data[7]; cram_map *m; if (TN[0] == 'M' && TN[1] == 'D' && has_MD) - *has_MD = 1; + *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1); if (TN[0] == 'N' && TN[1] == 'M' && has_NM) - *has_NM = 1; + *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);; //printf("Tag %d/%d\n", i+1, cr->ntags); - tag_data[0] = *TN++; - tag_data[1] = *TN++; - tag_data[2] = *TN++; + tag_data[0] = TN[0]; + tag_data[1] = TN[1]; + tag_data[2] = TN[2]; id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2]; - m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); - if (!m) - return -1; - BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') { + // Place holder, fill out contents later. + int tag_data_size; + if (TN[0] == 'N' && TN[1] == 'M') { + // Use a fixed size, so we can allocate room for it now. + memcpy(&tag_data[2], "I\0\0\0\0", 5); + tag_data_size = 7; + } else if (TN[0] == 'R' && TN[1] == 'G') { + // RG is variable size, but known already. Insert now + TN += 3; + // Equiv to fd->header->hrecs->rg[cr->rg], but this is the + // new header API equivalent. 
+ const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg); + if (!rg) + continue; - if (!m->codec) return -1; - r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); - if (r) break; - cr->aux_size += out_sz + 3; + size_t rg_len = strlen(rg); + tag_data[2] = 'Z'; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + BLOCK_APPEND(s->aux_blk, rg, rg_len); + BLOCK_APPEND_CHAR(s->aux_blk, '\0'); + cr->aux_size += 3 + rg_len + 1; + cr->rg = -1; // prevents auto-add later + continue; + } else { + // Unknown size. We'll insert MD into stream later. + tag_data[2] = 'Z'; + tag_data_size = 3; + } + BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size); + cr->aux_size += tag_data_size; + TN += 3; + } else { + TN += 3; + m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); + if (!m) + return -1; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + + if (!m->codec) return -1; + r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); + if (r) break; + cr->aux_size += out_sz + 3; + } } return r; @@ -2011,7 +2078,7 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { * Or do we just admit defeat and output 0 for tlen? It's the * safe option... */ - if (cr->tlen == INT_MIN) { + if (cr->tlen == INT64_MIN) { int id1 = rec, id2 = rec; int64_t aleft = cr->apos, aright = cr->aend; int64_t tlen; @@ -2120,9 +2187,16 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { cr->mate_ref_id = -1; } - if (cr->tlen == INT_MIN) - cr->tlen = 0; // Just in case + if (cr->tlen == INT64_MIN) + cr->tlen = 0; // Just incase } + + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + if (cr->explicit_tlen != INT64_MIN) + cr->tlen = cr->explicit_tlen; + } + return 0; } @@ -2137,6 +2211,31 @@ static char *md5_print(unsigned char *md5, char *out) { return out; } +/* + * Utility function to decode tlen (ISIZE), as it's called + * in multiple places. 
+ * + * Returns codec return value (0 on success). + */ +static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s, + cram_block *blk, int64_t *tlen) { + int out_sz = 1, r = 0; + + if (!c->comp_hdr->codecs[DS_TS]) return -1; + if (CRAM_MAJOR_VERS(fd->version) < 4) { + int32_t i32; + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)&i32, &out_sz); + *tlen = i32; + } else { + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)tlen, &out_sz); + } + return r; +} + /* * Decode an entire slice from container blocks. Fills out s->crecs[] array. * Returns 0 on success @@ -2454,18 +2553,18 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_AP) { if (!c->comp_hdr->codecs[DS_AP]) goto block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_AP] - ->decode(s, c->comp_hdr->codecs[DS_AP], blk, - (char *)&cr->apos, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_AP] - ->decode(s, c->comp_hdr->codecs[DS_AP], blk, - (char *)&i32, &out_sz); - cr->apos = i32; -#endif - if (r) goto block_err; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&cr->apos, &out_sz); + } else { + int32_t i32; + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&i32, &out_sz); + cr->apos = i32; + } + if (r) goto block_err;; if (c->comp_hdr->AP_delta) cr->apos += s->last_apos; s->last_apos= cr->apos; @@ -2505,6 +2604,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->mate_pos = 0; cr->mate_line = -1; cr->mate_ref_id = -1; + cr->explicit_tlen = INT64_MIN; if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) { if (ds & CRAM_MF) { if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -2560,39 +2660,30 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, // } if (ds & CRAM_NP) { - if (!c->comp_hdr->codecs[DS_NP]) goto 
block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_NP] - ->decode(s, c->comp_hdr->codecs[DS_NP], blk, - (char *)&cr->mate_pos, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_NP] - ->decode(s, c->comp_hdr->codecs[DS_NP], blk, - (char *)&i32, &out_sz); - cr->mate_pos = i32; -#endif + if (!c->comp_hdr->codecs[DS_NP]) goto block_err;; + if (CRAM_MAJOR_VERS(fd->version) < 4) { + int32_t i32; + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&i32, &out_sz); + cr->mate_pos = i32; + } else { + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&cr->mate_pos, &out_sz); + } if (r) goto block_err; } if (ds & CRAM_TS) { if (!c->comp_hdr->codecs[DS_TS]) goto block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_TS] - ->decode(s, c->comp_hdr->codecs[DS_TS], blk, - (char *)&cr->tlen, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_TS] - ->decode(s, c->comp_hdr->codecs[DS_TS], blk, - (char *)&i32, &out_sz); - cr->tlen = i32; -#endif + r = cram_decode_tlen(fd, c, s, blk, &cr->tlen); if (r) goto block_err; } else { - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; } } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) { + // else not detached if (ds & CRAM_NF) { if (!c->comp_hdr->codecs[DS_NF]) goto block_err; r |= c->comp_hdr->codecs[DS_NF] @@ -2606,15 +2697,32 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, //dstring_nappend(name_ds, name, cr->name_len); cr->mate_ref_id = -1; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; cr->mate_pos = 0; } else { cr->mate_flags = 0; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; + } + if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { + if (ds & CRAM_TS) { + r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); + if (r) return r; + } else { + cr->mate_flags = 0; + cr->tlen = INT64_MIN; + } + } + } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { + if (ds & CRAM_TS) { + r = 
cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); + if (r) return r; + } else { + cr->mate_flags = 0; + cr->tlen = INT64_MIN; } } else { cr->mate_flags = 0; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; } /* else if (!name[0]) { @@ -2634,7 +2742,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (CRAM_MAJOR_VERS(fd->version) == 1) r |= cram_decode_aux_1_0(c, s, blk, cr); else - r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM); + r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM); if (r) goto block_err; /* Fake up dynamic string growth and appending */ @@ -2705,6 +2813,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, memset(qual, 255, cr->len); } } + + if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) { + int i, j; + for (i = 0, j = cr->len-1; i < j; i++, j--) { + unsigned char c; + c = qual[i]; + qual[i] = qual[j]; + qual[j] = c; + } + } } pthread_mutex_lock(&fd->ref_lock); @@ -2853,18 +2971,28 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, name_len = cr->name_len; } else { name = name_a; - name_len = strlen(fd->prefix); - memcpy(name, fd->prefix, name_len); - name += name_len; - *name++ = ':'; - if (cr->mate_line >= 0 && cr->mate_line < rec) - name = (char *)append_uint64((unsigned char *)name, - s->hdr->record_counter + - cr->mate_line + 1); - else - name = (char *)append_uint64((unsigned char *)name, - s->hdr->record_counter + - rec + 1); + if (cr->mate_line >= 0 && cr->mate_line < s->max_rec && + s->crecs[cr->mate_line].name_len > 0) { + // Copy our mate if non-zero. 
+ memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name, + s->crecs[cr->mate_line].name_len); + name = name_a + s->crecs[cr->mate_line].name_len; + } else { + // Otherwise generate a name based on prefix + name_len = strlen(fd->prefix); + memcpy(name, fd->prefix, name_len); + name += name_len; + *name++ = ':'; + if (cr->mate_line >= 0 && cr->mate_line < rec) { + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + cr->mate_line + 1); + } else { + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + rec + 1); + } + } name_len = name - name_a; name = name_a; } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 7d2f5a9a5..cdd73b8a7 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -95,27 +95,11 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, * the total size (stored as a variable length string). */ -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. 
- */ - // Duplicated from container itself, and removed in 1.1 if (CRAM_MAJOR_VERS(fd->version) == 1) { r |= itf8_put_blk(cb, h->ref_seq_id); -#ifdef LARGE_POS - r |= ltf8_put_blk(cb, h->ref_seq_start); - r |= ltf8_put_blk(cb, h->ref_seq_span); -#else r |= itf8_put_blk(cb, h->ref_seq_start); r |= itf8_put_blk(cb, h->ref_seq_span); -#endif r |= itf8_put_blk(cb, h->num_records); r |= itf8_put_blk(cb, h->num_landmarks); for (i = 0; i < h->num_landmarks; i++) { @@ -128,7 +112,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, /* Create in-memory preservation map */ /* FIXME: should create this when we create the container */ - { + if (h->num_records > 0) { khint_t k; int r; @@ -166,6 +150,12 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, if (-1 == r) return NULL; kh_val(h->preservation_map, k).i = h->AP_delta; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + k = kh_put(map, h->preservation_map, "QO", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = h->qs_seq_orient; + } + if (fd->no_ref || fd->embed_ref) { // Reference Required == No k = kh_put(map, h->preservation_map, "RR", &r); @@ -196,26 +186,12 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, switch(CRAM_KEY(key[0], key[1])) { case CRAM_KEY('M','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('U','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('P','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('A','P'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('R','N'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('R','R'): + case CRAM_KEY('Q','O'): BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); break; @@ -251,7 +227,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, } case CRAM_KEY('T','D'): { - r |= itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); + r |= 
(fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0); BLOCK_APPEND(map, BLOCK_DATA(h->TD_blk), BLOCK_SIZE(h->TD_blk)); @@ -266,8 +242,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, mc++; } } - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* rec encoding map */ @@ -467,33 +443,11 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, return NULL; mc++; } - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* tag encoding map */ -#if 0 - mp = map; mc = 0; - if (h->tag_encoding_map) { - HashItem *hi; - HashIter *iter = HashTableIterCreate(); - if (!iter) - return NULL; - - while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) { - cram_map *m = hi->data.p; - int sz; - - mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]); - if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version))) - return NULL; - mp += sz; - mc++; - } - - HashTableIterDestroy(iter); - } -#else mc = 0; BLOCK_SIZE(map) = 0; if (c->tags_used) { @@ -507,16 +461,16 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, key = kh_key(c->tags_used, k); cram_codec *cd = kh_val(c->tags_used, k)->codec; - r |= itf8_put_blk(map, key); + r |= (fd->vv.varint_put32_blk(map, key) <= 0); if (-1 == cd->store(cd, map, NULL, fd->version)) return NULL; mc++; } } -#endif - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, 
mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb)); @@ -548,38 +502,38 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { if (!b) return NULL; - cp = buf = malloc(16+5*(8+s->hdr->num_blocks)); + cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks)); if (NULL == buf) { cram_free_block(b); return NULL; } - cp += itf8_put(cp, s->hdr->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put(cp, s->hdr->ref_seq_start); - cp += ltf8_put(cp, s->hdr->ref_seq_span); -#else - cp += itf8_put(cp, s->hdr->ref_seq_start); - cp += itf8_put(cp, s->hdr->ref_seq_span); -#endif - cp += itf8_put(cp, s->hdr->num_records); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span); + } + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) - cp += itf8_put(cp, s->hdr->record_counter); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter); else if (CRAM_MAJOR_VERS(fd->version) >= 3) - cp += ltf8_put(cp, s->hdr->record_counter); - cp += itf8_put(cp, s->hdr->num_blocks); - cp += itf8_put(cp, s->hdr->num_content_ids); + cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids); for (j = 0; j < s->hdr->num_content_ids; j++) { - cp += itf8_put(cp, s->hdr->block_content_ids[j]); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]); } if (s->hdr->content_type == MAPPED_SLICE) - cp += itf8_put(cp, s->hdr->ref_base_id); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id); if (CRAM_MAJOR_VERS(fd->version) != 1) { 
memcpy(cp, s->hdr->md5, 16); cp += 16; } - assert(cp-buf <= 16+5*(8+s->hdr->num_blocks)); + assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks)); b->data = (unsigned char *)buf; b->comp_size = b->uncomp_size = cp-buf; @@ -602,6 +556,7 @@ static int cram_encode_slice_read(cram_fd *fd, int64_t *last_pos) { int r = 0; int32_t i32; + int64_t i64; unsigned char uc; //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); @@ -620,24 +575,22 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); if (c->pos_sorted) { -#ifdef LARGE_POS - int64_t i64; - i64 = cr->apos - *last_pos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); -#else - i32 = cr->apos - *last_pos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); -#endif + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + i64 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); + } else { + i32 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + } *last_pos = cr->apos; } else { -#ifdef LARGE_POS - int64_t i64; - i64 = cr->apos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); -#else - i32 = cr->apos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); -#endif + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + i64 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); + } else { + i32 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + } } r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); @@ -649,23 +602,30 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], (char *)&cr->mate_ref_id, 1); -#ifdef LARGE_POS - r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], - (char *)&cr->mate_pos, 1); - - r |= 
h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], - (char *)&cr->tlen, 1); -#else - i32 = cr->mate_pos; - r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], - (char *)&i32, 1); - i32 = cr->tlen; - r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], - (char *)&i32, 1); -#endif - } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { - r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], - (char *)&cr->mate_line, 1); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&cr->mate_pos, 1); + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&cr->tlen, 1); + } else { + i32 = cr->mate_pos; + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&i32, 1); + i32 = cr->tlen; + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&i32, 1); + } + } else { + if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { + r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], + (char *)&cr->mate_line, 1); + } + if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) { + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&cr->tlen, 1); + } + } } /* Aux tags */ @@ -716,7 +676,7 @@ static int cram_encode_slice_read(cram_fd *fd, // BLOCK_DATA(s->soft_blk) + f->S.seq_idx, // f->S.len); - //if (IS_CRAM_3_VERS(fd)) { + //if (CRAM_MAJOR_VERS(fd->version) >= 3) { // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx, // f->S.len); @@ -726,7 +686,7 @@ static int cram_encode_slice_read(cram_fd *fd, //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], // seq, f->S.len); - //if (IS_CRAM_3_VERS(fd)) { + //if (CRAM_MAJOR_VERS(fd->version) >= 3) { // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx, // f->I.len); @@ -822,6 +782,7 @@ static int cram_encode_slice_read(cram_fd *fd, static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { int 
level = fd->level, i; int method = 1<version >= (3<<8)+1); /* Compress the CORE Block too, with minimal zlib level */ if (level > 5 && s->block[0]->uncomp_size > 500) @@ -843,7 +804,6 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { method_ranspr |= (1<version >= (3<<8)+1); if (fd->use_rans) { methodF |= v31_or_above ? method_ranspr : method_rans; method |= v31_or_above ? method_ranspr : method_rans; @@ -856,7 +816,7 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { method_arith |= (1<use_arith && v31_or_above) { methodF |= method_arith; @@ -893,6 +853,11 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { } } + pthread_mutex_lock(&fd->metrics_lock); + for (i = 0; i < DS_END; i++) + fd->m[i]->stats = c->stats[i]; + pthread_mutex_unlock(&fd->metrics_lock); + /* Specific compression methods for certain block types */ if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq) method, level)) @@ -949,13 +914,13 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { // NAME: best is generally xz, bzip2, zlib then rans1 int method_rn = method & ~(method_rans | method_ranspr | 1<version >= (3<<8)+1 && fd->use_tok) - method_rn |= fd->use_arith ? (1<use_arith ? (1<block[DS_RN], fd->m[DS_RN], method_rn, level)) return -1; // NS shows strong local correlation as rearrangements are localised - if (s->block[DS_NS] != s->block[0]) + if (s->block[DS_NS] && s->block[DS_NS] != s->block[0]) if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS], method, level)) return -1; @@ -1000,6 +965,98 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { return 0; } +/* + * Allocates a block associated with the cram codec associated with + * data series ds_id or the internal codec_id (depending on codec + * type). + * + * The ds_ids are what end up written to disk as an external block. 
+ * The c_ids are internal and used when daisy-chaining transforms + * such as MAP and RLE. These blocks are also allocated, but + * are ephemeral in nature. (The codecs themselves cannot allocate + * these as the same codec pointer may be operating on multiple slices + * if we're using a multi-slice container.) + * + * Returns 0 on success + * -1 on failure + */ +static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) { + if (!codec) + return 0; + + switch(codec->codec) { + // Codecs which are hard-coded to use the CORE block + case E_GOLOMB: + case E_HUFFMAN: + case E_BETA: + case E_SUBEXP: + case E_GOLOMB_RICE: + case E_GAMMA: + codec->out = s->block[0]; + break; + + // Codecs that emit directly to external blocks + case E_EXTERNAL: + if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) + return -1; + codec->u.external.content_id = ds_id; + codec->out = s->block[ds_id]; + break; + + case E_BYTE_ARRAY_STOP: // Why no sub-codec? + if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) + return -1; + codec->u.byte_array_stop.content_id = ds_id; + codec->out = s->block[ds_id]; + break; + + + // Codecs that contain sub-codecs which may in turn emit to external blocks + case E_BYTE_ARRAY_LEN: { + cram_codec *bal = codec->u.e_byte_array_len.len_codec; + if (cram_allocate_block(bal, s, bal->u.external.content_id)) + return -1; + bal = codec->u.e_byte_array_len.val_codec; + if (cram_allocate_block(bal, s, bal->u.external.content_id)) + return -1; + + break; + } + + case E_XRLE: + if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id)) + //ds_id == DS_QS ? 
DS_QS_len : ds_id)) + return -1; + if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id)) + return -1; + + break; + + case E_XPACK: + if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id)) + return -1; + codec->out = cram_new_block(0, 0); // ephemeral + if (!codec->out) + return -1; + + break; + + case E_XDELTA: + if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id)) + return -1; + codec->out = cram_new_block(0, 0); // ephemeral + if (!codec->out) + return -1; + + break; + + default: + break; + } + + return 0; +} + /* * Encodes a single slice from a container * @@ -1032,7 +1089,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, c->num_records += s->hdr->num_records; int ntags = c->tags_used ? c->tags_used->n_occupied : 0; - s->block = calloc(DS_END + ntags, sizeof(s->block[0])); + s->block = calloc(DS_END + ntags*2, sizeof(s->block[0])); s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t)); if (!s->block || !s->hdr->block_content_ids) return -1; @@ -1065,55 +1122,9 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, /* * All the data-series blocks if appropriate. 
*/ - for (id = DS_BF; id < DS_TN; id++) { - if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL || - h->codecs[id]->codec == E_BYTE_ARRAY_STOP || - h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) { - switch (h->codecs[id]->codec) { - case E_EXTERNAL: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->u.external.content_id = id; - break; - - case E_BYTE_ARRAY_STOP: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->u.byte_array_stop.content_id = id; - break; - - case E_BYTE_ARRAY_LEN: { - cram_codec *cc; - - cc = h->codecs[id]->u.e_byte_array_len.len_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->u.external.content_id; - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->u.external.content_id = eid; - cc->out = s->block[eid]; - } - - cc = h->codecs[id]->u.e_byte_array_len.val_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->u.external.content_id; - if (!s->block[eid]) - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->u.external.content_id = eid; - cc->out = s->block[eid]; - } - break; - } - default: - break; - } - } else { - if (!(id == DS_BB && !h->codecs[DS_BB])) - s->block[id] = s->block[0]; - } - if (h->codecs[id]) - h->codecs[id]->out = s->block[id]; + for (id = DS_QS; id < DS_TN; id++) { + if (cram_allocate_block(h->codecs[id], s, id) < 0) + return -1; } /* @@ -1138,11 +1149,21 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, s->block[0]->comp_size = s->block[0]->uncomp_size; // Make sure the fixed blocks point to the correct sources + if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]); s->block[DS_IN] = s->base_blk; s->base_blk = NULL; + if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]); s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL; + if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]); s->block[DS_RN] = s->name_blk; s->name_blk = NULL; + if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]); 
s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL; + // Finalise any data transforms. + for (id = DS_QS; id < DS_TN; id++) { + if (h->codecs[id] && h->codecs[id]->flush) + h->codecs[id]->flush(h->codecs[id]); + } + // Ensure block sizes are up to date. for (id = 1; id < s->hdr->num_blocks; id++) { if (!s->block[id] || s->block[id] == s->block[0]) @@ -1338,12 +1359,21 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, cr->name = BLOCK_SIZE(s->name_blk); if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) { - BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); - cr->name_len = bam_name_len(b); + if (CRAM_MAJOR_VERS(fd->version) >= 4 + && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) + && cr->mate_line) { + // Dedup read names in V4 + BLOCK_APPEND(s->name_blk, "\0", 1); + cr->name_len = 1; + } else { + BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); + cr->name_len = bam_name_len(b); + } } else { // Can only discard duplicate names if not detached cr->name_len = 0; } + if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0) goto block_err; } @@ -1367,6 +1397,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { int r1, r2, sn, nref; spare_bams *spares; + if (CRAM_MAJOR_VERS(fd->version) == 1) + goto err; + //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;} #define goto_err goto err @@ -1491,7 +1524,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // slice can start aggregating them from the start again. 
if (c->tags_used->n_occupied) { int ntags = c->tags_used->n_occupied; - s->aux_block = calloc(ntags, sizeof(*s->aux_block)); + s->aux_block = calloc(ntags*2, sizeof(*s->aux_block)); if (!s->aux_block) return -1; @@ -1507,8 +1540,11 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (!tm->blk) continue; s->aux_block[s->naux_block++] = tm->blk; tm->blk = NULL; + if (!tm->blk2) continue; + s->aux_block[s->naux_block++] = tm->blk2; + tm->blk2 = NULL; } - assert(s->naux_block <= c->tags_used->n_occupied); + assert(s->naux_block <= 2*c->tags_used->n_occupied); } } @@ -1544,6 +1580,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Compute MD5s */ + int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0; for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; @@ -1570,13 +1607,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== BF ===\n"); h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), c->stats[DS_BF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err; //fprintf(stderr, "=== CF ===\n"); h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), c->stats[DS_CF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err; //fprintf(stderr, "=== RN ===\n"); @@ -1588,105 +1625,104 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (c->pos_sorted) { h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), c->stats[DS_AP], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? E_SLONG : E_INT, + NULL, fd->version, &fd->vv); } else { int p[2] = {0, c->max_apos}; - h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, - fd->version); + h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, + is_v4 ? 
E_SLONG : E_INT, + p, fd->version, &fd->vv); +// cram_xdelta_encoder e; +// e.word_size = is_v4 ? 8 : 4; +// e.sub_encoding = E_EXTERNAL; +// e.sub_codec_dat = (void *)DS_AP; +// +// h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL, +// is_v4 ? E_LONG : E_INT, +// &e, fd->version, &fd->vv); } if (!h->codecs[DS_AP]) goto_err; //fprintf(stderr, "=== RG ===\n"); h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), - c->stats[DS_RG], E_INT, NULL, - fd->version); + c->stats[DS_RG], + is_v4 ? E_SINT : E_INT, + NULL, + fd->version, &fd->vv); if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err; //fprintf(stderr, "=== MQ ===\n"); h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), c->stats[DS_MQ], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err; //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), c->stats[DS_NS], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; //fprintf(stderr, "=== MF ===\n"); h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), c->stats[DS_MF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err; //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), c->stats[DS_TS], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? E_SLONG : E_INT, + NULL, fd->version, &fd->vv); if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; //fprintf(stderr, "=== NP ===\n"); h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), c->stats[DS_NP], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? 
E_LONG : E_INT, + NULL, fd->version, &fd->vv); if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err; //fprintf(stderr, "=== NF ===\n"); h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), c->stats[DS_NF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err; //fprintf(stderr, "=== RL ===\n"); h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), c->stats[DS_RL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err; //fprintf(stderr, "=== FN ===\n"); h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), c->stats[DS_FN], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err; //fprintf(stderr, "=== FC ===\n"); h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), c->stats[DS_FC], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err; //fprintf(stderr, "=== FP ===\n"); h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), c->stats[DS_FP], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err; //fprintf(stderr, "=== DL ===\n"); h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), c->stats[DS_DL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err; //fprintf(stderr, "=== BA ===\n"); h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), c->stats[DS_BA], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err; if (CRAM_MAJOR_VERS(fd->version) >= 3) { @@ -1701,7 +1737,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, 
E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_BB]) goto_err; } else { h->codecs[DS_BB] = NULL; @@ -1710,7 +1746,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== BS ===\n"); h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), c->stats[DS_BS], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err; if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -1724,13 +1760,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TC ===\n"); h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), c->stats[DS_TC], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err; //fprintf(stderr, "=== TN ===\n"); h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), c->stats[DS_TN], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err; } else { h->codecs[DS_TC] = NULL; @@ -1739,32 +1775,32 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TL ===\n"); h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), c->stats[DS_TL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err; //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), c->stats[DS_RI], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; //fprintf(stderr, "=== RS ===\n"); h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), c->stats[DS_RS], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err; //fprintf(stderr, "=== PD ===\n"); h->codecs[DS_PD] = 
cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), c->stats[DS_PD], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err; //fprintf(stderr, "=== HC ===\n"); h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), c->stats[DS_HC], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err; //fprintf(stderr, "=== SC ===\n"); @@ -1773,7 +1809,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); } else { // Appears to be no practical benefit to using this method, // but it may work better if we start mixing SC, IN and BB @@ -1788,7 +1824,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); } if (!h->codecs[DS_SC]) goto_err; } @@ -1798,19 +1834,19 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { int i2[2] = {0, DS_IN}; h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_IN]) goto_err; } h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, (void *)DS_QS, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_QS]) goto_err; { int i2[2] = {0, DS_RN}; h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_RN]) goto_err; } @@ -1829,7 +1865,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->ref_seq_start = c->ref_seq_start; h->ref_seq_span = c->ref_seq_span; h->num_records = c->num_records; - h->AP_delta = c->pos_sorted; + h->qs_seq_orient = c->qs_seq_orient; + h->AP_delta = c->pos_sorted; memcpy(h->substitution_matrix, 
CRAM_SUBST_MATRIX, 20); if (!(c_hdr = cram_encode_compression_header(fd, c, h))) @@ -1852,9 +1889,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { ? c_hdr->uncomp_size : c_hdr->comp_size; slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(c_hdr->content_id) + - itf8_size(c_hdr->comp_size) + - itf8_size(c_hdr->uncomp_size); + fd->vv.varint_size(c_hdr->content_id) + + fd->vv.varint_size(c_hdr->comp_size) + + fd->vv.varint_size(c_hdr->uncomp_size); } c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; @@ -1877,15 +1914,15 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { : s->hdr_block->comp_size; slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->hdr_block->content_id) + - itf8_size(s->hdr_block->comp_size) + - itf8_size(s->hdr_block->uncomp_size); + fd->vv.varint_size(s->hdr_block->content_id) + + fd->vv.varint_size(s->hdr_block->comp_size) + + fd->vv.varint_size(s->hdr_block->uncomp_size); for (j = 0; j < s->hdr->num_blocks; j++) { slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->block[j]->content_id) + - itf8_size(s->block[j]->comp_size) + - itf8_size(s->block[j]->uncomp_size); + fd->vv.varint_size(s->block[j]->content_id) + + fd->vv.varint_size(s->block[j]->comp_size) + + fd->vv.varint_size(s->block[j]->uncomp_size); slice_offset += s->block[j]->method == RAW ? s->block[j]->uncomp_size @@ -2127,147 +2164,6 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, return -1; } -/* - * Encodes auxiliary data, CRAM 1.0 format. 
- * Returns the read-group parsed out of the BAM aux fields on success - * NULL on failure or no rg present (FIXME) - */ -static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr) { - char *aux, *tmp, *rg = NULL; - int aux_size = bam_blk_size(b) - - ((char *)bam_aux(b) - (char *)&bam_ref(b)); - - /* Worst case is 1 nul char on every ??:Z: string, so +33% */ - BLOCK_GROW(s->aux_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_blk); - - aux = (char *)bam_aux(b); - cr->TN_idx = s->nTN; - - while (aux[0] != 0) { - int32_t i32; - int r; - - if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - continue; - } - if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - while (*aux++); - continue; - } - if (aux[0] == 'N' && aux[1] == 'M') { - switch(aux[2]) { - case 'A': case 'C': case 'c': aux+=4; break; - case 'I': case 'i': case 'f': aux+=7; break; - default: - hts_log_error("Unhandled type code for NM tag"); - return NULL; - } - continue; - } - - cr->ntags++; - - i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; - kh_put(m_tagmap, c->tags_used, i32, &r); - if (-1 == r) - return NULL; - - if (s->nTN >= s->aTN) { - s->aTN = s->aTN ? 
s->aTN*2 : 1024; - if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) - return NULL; - } - s->TN[s->nTN++] = i32; - if (cram_stats_add(c->stats[DS_TN], i32) < 0) - goto block_err; - - switch(aux[2]) { - case 'A': case 'C': case 'c': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; - break; - - case 'S': case 's': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'I': case 'i': case 'f': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'd': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'Z': case 'H': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; // stop byte - break; - - case 'B': { - int type = aux[3], blen; - uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + - (((unsigned char *)aux)[5]<< 8) + - (((unsigned char *)aux)[6]<<16) + - (((unsigned char *)aux)[7]<<24)); - // skip TN field - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // We use BYTE_ARRAY_LEN with external length, so store that first - switch (type) { - case 'c': case 'C': - blen = count; - break; - case 's': case 'S': - blen = 2*count; - break; - case 'i': case 'I': case 'f': - blen = 4*count; - break; - default: - hts_log_error("Unknown sub-type '%c' for aux type 'B'", type); - return NULL; - } - - tmp += itf8_put(tmp, blen+5); - - *tmp++=*aux++; // sub-type & length - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // The tag data itself - memcpy(tmp, aux, blen); tmp += blen; aux += blen; - - //cram_stats_add(c->aux_B_stats, blen); - break; - } - default: - hts_log_error("Unknown aux type '%c'", aux[2]); - return NULL; - } - } - if (cram_stats_add(c->stats[DS_TC], cr->ntags) < 0) - 
goto block_err; - - cr->aux = BLOCK_SIZE(s->aux_blk); - cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); - BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); - assert(s->aux_blk->byte <= s->aux_blk->alloc); - - return rg; - - block_err: - return NULL; -} - /* * Encodes auxiliary data. Largely duplicated from above, but done so to * keep it simple and avoid a myriad of version ifs. @@ -2299,6 +2195,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { rg = &aux[3]; while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "RG*", 3); continue; } @@ -2307,6 +2205,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) { while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "MD*", 3); continue; } } @@ -2325,6 +2225,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, hts_log_error("Unhandled type code for NM tag"); return NULL; } + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "NM*", 3); continue; } } @@ -2378,7 +2280,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // string as byte_array_stop c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); break; case 'A': case 'c': case 'C': { @@ -2397,7 +2299,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2417,7 +2319,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } case 
'i': case 'I': case 'f': { @@ -2436,7 +2338,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2456,7 +2358,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2561,8 +2463,15 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->u.e_byte_array_len.len_codec->out = tm->blk; - codec->u.e_byte_array_len.val_codec->out = tm->blk; + if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) { + if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128))) + return NULL; + codec->u.e_byte_array_len.len_codec->out = tm->blk2; + codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk; + } else { + codec->u.e_byte_array_len.len_codec->out = tm->blk; + codec->u.e_byte_array_len.val_codec->out = tm->blk; + } } // skip TN field @@ -2737,6 +2646,12 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { c->s_num_bases = 0; c->n_mapped = 0; + // QO field: 0 implies original orientation, 1 implies sequence orientation + // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff. + // Conversely, on other data sets 0 often gives a better than 1% saving. + // Short of trying both and learning, for now we use 0 for V4, 1 for V3. + c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ?
0 : 1; + return c; } @@ -2822,6 +2737,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, qual = cp = (char *)bam_qual(b); + /* Copy and parse */ if (!(cr->flags & BAM_FUNMAP)) { uint32_t *cig_to, *cig_from; @@ -3048,10 +2964,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); int err = 0; - if (CRAM_MAJOR_VERS(fd->version) == 1) - rg = cram_encode_aux_1_0(fd, b, c, s, cr); - else - rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); + rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); if (err) goto block_err; @@ -3086,7 +2999,19 @@ static int process_one_read(cram_fd *fd, cram_container *c, char *from = (char *)&bam_qual(b)[0]; char *to = &cp[0]; memcpy(to, from, cr->len); - //for (i = 0; i < cr->len; i++) cp[i] = from[i]; + + // Store quality in original orientation for better compression. + if (!c->qs_seq_orient) { + if (cr->flags & BAM_FREVERSE) { + int i, j; + for (i = 0, j = cr->len-1; i < j; i++, j--) { + unsigned char c; + c = to[i]; + to[i] = to[j]; + to[j] = c; + } + } + } } BLOCK_SIZE(s->qual_blk) += cr->len; } else { @@ -3139,11 +3064,6 @@ static int process_one_read(cram_fd *fd, cram_container *c, // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/- // a small amount, if appropriate options set. 
- if ((bam_ins_size(b) && - llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || - (!bam_ins_size(b) && !fd->tlen_zero)) - goto detached; - if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) && !(fd->tlen_zero && bam_mate_pos(b) == 0)) goto detached; @@ -3162,10 +3082,6 @@ static int process_one_read(cram_fd *fd, cram_container *c, !(fd->tlen_zero && p->ref_id == -1)) goto detached; - if ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || - (!p->tlen && !fd->tlen_zero)) - goto detached; - if (p->mate_pos != cr->apos && !(fd->tlen_zero && p->mate_pos == 0)) goto detached; @@ -3191,6 +3107,29 @@ static int process_one_read(cram_fd *fd, cram_container *c, !((p->cram_flags & CRAM_FLAG_DISCARD_NAME)))) goto detached; + // Now check TLEN. We do this last as sometimes it's the + // only thing that differs. In CRAM4 we have a better way + // of handling this that doesn't break detached status + int explicit_tlen = 0; + int tflag1 = ((bam_ins_size(b) && + llabs(bam_ins_size(b) - sign*(aright-aleft+1)) + > fd->tlen_approx) + || (!bam_ins_size(b) && !fd->tlen_zero)); + + int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) + > fd->tlen_approx) + || (!p->tlen && !fd->tlen_zero)); + + if (tflag1 || tflag2) { + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN; + } else { + // Still do detached for unmapped data in CRAM4 as this + // also impacts RNEXT calculation. + goto detached; + } + } + /* * The fields below are unused when encoding this read as it is * no longer detached. In theory they may get referred to when @@ -3201,7 +3140,9 @@ static int process_one_read(cram_fd *fd, cram_container *c, * not emitted. */ cr->mate_pos = p->apos; - cr->tlen = sign*(aright-aleft+1); + cram_stats_add(c->stats[DS_NP], cr->mate_pos); + cr->tlen = explicit_tlen ?
bam_ins_size(b) : sign*(aright-aleft+1); + cram_stats_add(c->stats[DS_TS], cr->tlen); cr->mate_flags = ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; @@ -3210,7 +3151,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (p->cram_flags & CRAM_FLAG_STATS_ADDED) { cram_stats_del(c->stats[DS_NP], p->mate_pos); cram_stats_del(c->stats[DS_MF], p->mate_flags); - cram_stats_del(c->stats[DS_TS], p->tlen); + if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)) + cram_stats_del(c->stats[DS_TS], p->tlen); cram_stats_del(c->stats[DS_NS], p->mate_ref_id); } @@ -3226,6 +3168,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // Clear detached from cr flags cr->cram_flags &= ~CRAM_FLAG_DETACHED; + cr->cram_flags |= explicit_tlen; if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0) goto block_err; @@ -3236,7 +3179,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } p->cram_flags &= ~CRAM_FLAG_DETACHED; - p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; + p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;; if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0) goto block_err; diff --git a/cram/cram_external.c b/cram/cram_external.c index d0fd48cc8..88175103d 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -333,15 +333,17 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, char *op = cp; char *endp = cp + cram_block_get_uncomp_size(o_blk); //fprintf(stderr, "sz = %d\n", (int)(endp-cp)); - int32_t i32; + int32_t i32, err = 0; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); cp += i32; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); cp += i32; op = cp; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); i32 += (cp-op); + if (err) + return -2; //fprintf(stderr, "remaining %d bytes\n", i32); 
cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2); diff --git a/cram/cram_io.c b/cram/cram_io.c index fa8b3e346..3583021f7 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -80,12 +80,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <htscodecs/arith_dynamic.h> #include <htscodecs/tokenise_name3.h> #include <htscodecs/fqzcomp_qual.h> +#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers #else #include "../htscodecs/htscodecs/rANS_static.h" #include "../htscodecs/htscodecs/rANS_static4x16.h" #include "../htscodecs/htscodecs/arith_dynamic.h" #include "../htscodecs/htscodecs/tokenise_name3.h" #include "../htscodecs/htscodecs/fqzcomp_qual.h" +#include "../htscodecs/htscodecs/varint.h" #endif //#define REF_DEBUG @@ -261,6 +263,112 @@ int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { return 5; } +/* + * Stores a value to memory in ITF-8 format. + * + * Returns the number of bytes required to store the number. + * This is a maximum of 5 bytes. + */ +static inline int itf8_put(char *cp, int32_t val) { + unsigned char *up = (unsigned char *)cp; + if (!(val & ~0x00000007f)) { // 1 byte + *up = val; + return 1; + } else if (!(val & ~0x00003fff)) { // 2 byte + *up++ = (val >> 8 ) | 0x80; + *up = val & 0xff; + return 2; + } else if (!(val & ~0x01fffff)) { // 3 byte + *up++ = (val >> 16) | 0xc0; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 3; + } else if (!(val & ~0x0fffffff)) { // 4 byte + *up++ = (val >> 24) | 0xe0; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 4; + } else { // 5 byte + *up++ = 0xf0 | ((val>>28) & 0xff); + *up++ = (val >> 20) & 0xff; + *up++ = (val >> 12) & 0xff; + *up++ = (val >> 4 ) & 0xff; + *up = val & 0x0f; + return 5; + } +} + + +/* 64-bit itf8 variant */ +static inline int ltf8_put(char *cp, int64_t val) { + unsigned char *up = (unsigned char *)cp; + if (!(val & ~((1LL<<7)-1))) { + *up = val; + return 1; + } else if (!(val & ~((1LL<<(6+8))-1))) { + *up++ = (val >> 8 ) | 0x80; + *up = val & 0xff; + return 2; + } else if (!(val &
~((1LL<<(5+2*8))-1))) { + *up++ = (val >> 16) | 0xc0; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 3; + } else if (!(val & ~((1LL<<(4+3*8))-1))) { + *up++ = (val >> 24) | 0xe0; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 4; + } else if (!(val & ~((1LL<<(3+4*8))-1))) { + *up++ = (val >> 32) | 0xf0; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 5; + } else if (!(val & ~((1LL<<(2+5*8))-1))) { + *up++ = (val >> 40) | 0xf8; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 6; + } else if (!(val & ~((1LL<<(1+6*8))-1))) { + *up++ = (val >> 48) | 0xfc; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 7; + } else if (!(val & ~((1LL<<(7*8))-1))) { + *up++ = (val >> 56) | 0xfe; + *up++ = (val >> 48) & 0xff; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 8; + } else { + *up++ = 0xff; + *up++ = (val >> 56) & 0xff; + *up++ = (val >> 48) & 0xff; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 9; + } +} + /* * Encodes and writes a single integer in ITF-8 format. 
* Returns 0 on success @@ -528,6 +636,315 @@ int ltf8_put_blk(cram_block *blk, int64_t val) { return -1; } +static int64_t safe_itf8_get(char **cp, const char *endp, int *err) { + const unsigned char *up = (unsigned char *)*cp; + + if (endp && endp - *cp < 5 && + (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) { + if (err) *err = 1; + return 0; + } + + if (up[0] < 0x80) { + (*cp)++; + return up[0]; + } else if (up[0] < 0xc0) { + (*cp)+=2; + return ((up[0] <<8) | up[1]) & 0x3fff; + } else if (up[0] < 0xe0) { + (*cp)+=3; + return ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; + } else if (up[0] < 0xf0) { + (*cp)+=4; + uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; + return (int32_t)uv; + } else { + (*cp)+=5; + uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); + return (int32_t)uv; + } +} + +static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) { + unsigned char *up = (unsigned char *)*cp; + + if (endp && endp - *cp < 9 && + (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) { + if (err) *err = 1; + return 0; + } + + if (up[0] < 0x80) { + (*cp)++; + return up[0]; + } else if (up[0] < 0xc0) { + (*cp)+=2; + return (((uint64_t)up[0]<< 8) | + (uint64_t)up[1]) & (((1LL<<(6+8)))-1); + } else if (up[0] < 0xe0) { + (*cp)+=3; + return (((uint64_t)up[0]<<16) | + ((uint64_t)up[1]<< 8) | + (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); + } else if (up[0] < 0xf0) { + (*cp)+=4; + return (((uint64_t)up[0]<<24) | + ((uint64_t)up[1]<<16) | + ((uint64_t)up[2]<< 8) | + (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); + } else if (up[0] < 0xf8) { + (*cp)+=5; + return (((uint64_t)up[0]<<32) | + ((uint64_t)up[1]<<24) | + ((uint64_t)up[2]<<16) | + ((uint64_t)up[3]<< 8) | + (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); + } else if (up[0] < 0xfc) { + (*cp)+=6; + return (((uint64_t)up[0]<<40) | + ((uint64_t)up[1]<<32) | + ((uint64_t)up[2]<<24) | + ((uint64_t)up[3]<<16) | + ((uint64_t)up[4]<< 8) | + 
(uint64_t)up[5]) & ((1LL<<(2+5*8))-1); + } else if (up[0] < 0xfe) { + (*cp)+=7; + return (((uint64_t)up[0]<<48) | + ((uint64_t)up[1]<<40) | + ((uint64_t)up[2]<<32) | + ((uint64_t)up[3]<<24) | + ((uint64_t)up[4]<<16) | + ((uint64_t)up[5]<< 8) | + (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); + } else if (up[0] < 0xff) { + (*cp)+=8; + return (((uint64_t)up[1]<<48) | + ((uint64_t)up[2]<<40) | + ((uint64_t)up[3]<<32) | + ((uint64_t)up[4]<<24) | + ((uint64_t)up[5]<<16) | + ((uint64_t)up[6]<< 8) | + (uint64_t)up[7]) & ((1LL<<(7*8))-1); + } else { + (*cp)+=9; + return (((uint64_t)up[1]<<56) | + ((uint64_t)up[2]<<48) | + ((uint64_t)up[3]<<40) | + ((uint64_t)up[4]<<32) | + ((uint64_t)up[5]<<24) | + ((uint64_t)up[6]<<16) | + ((uint64_t)up[7]<< 8) | + (uint64_t)up[8]); + } +} + +// Wrapper for now +int safe_itf8_put(char *cp, const char *cp_end, int32_t val) { + return itf8_put(cp, val); +} + +int safe_ltf8_put(char *cp, const char *cp_end, int64_t val) { + return ltf8_put(cp, val); +} + +int itf8_size(int64_t v) { + return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5); +} + +//----------------------------------------------------------------------------- + +// CRAM v4.0 onwards uses a different variable sized integer encoding +// that is size agnostic. + +// Local interface to varint.h inline version, so we can use in func ptr. +// Note a lot of these use the unsigned interface but take signed int64_t. +// This is because the old CRAM ITF8 interface had signed -1 as unsigned +// 0xffffffff.
+static int uint7_size(int64_t v) { + return var_size_u64(v); +} + +static int64_t uint7_get_32(char **cp, const char *endp, int *err) { + uint32_t val; + int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t sint7_get_32(char **cp, const char *endp, int *err) { + int32_t val; + int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t uint7_get_64(char **cp, const char *endp, int *err) { + uint64_t val; + int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t sint7_get_64(char **cp, const char *endp, int *err) { + int64_t val; + int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int uint7_put_32(char *cp, const char *endp, int32_t val) { + return var_put_u32((uint8_t *)cp, (const uint8_t *)endp, val); +} + +static int sint7_put_32(char *cp, const char *endp, int32_t val) { + return var_put_s32((uint8_t *)cp, (const uint8_t *)endp, val); +} + +static int uint7_put_64(char *cp, const char *endp, int64_t val) { + return var_put_u64((uint8_t *)cp, (const uint8_t *)endp, val); +} + +static int sint7_put_64(char *cp, const char *endp, int64_t val) { + return var_put_s64((uint8_t *)cp, (const uint8_t *)endp, val); +} + +// Put direct to cram_block +static int uint7_put_blk_32(cram_block *blk, int32_t v) { + uint8_t buf[10]; + int sz = var_put_u32(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int sint7_put_blk_32(cram_block *blk, int32_t v) { + uint8_t buf[10]; + int sz = var_put_s32(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int uint7_put_blk_64(cram_block *blk, int64_t v) { + uint8_t buf[10]; + int sz
= var_put_u64(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int sint7_put_blk_64(cram_block *blk, int64_t v) { + uint8_t buf[10]; + int sz = var_put_s64(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +// Decode 32-bits with CRC update from cram_fd +static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { + uint8_t b[5], i = 0; + int c; + uint32_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u32(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 5 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = (v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = v; + return i; +} + +// Decode 64-bits (up to 10 bytes of 7 bits each) with CRC update from cram_fd +static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) { + uint8_t b[10], i = 0; + int c; + uint64_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u64(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 10 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v =
(v<<7) | (c & 0x7f); + } while (i < 10 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = v; + return i; +} + +//----------------------------------------------------------------------------- + /* * Decodes a 32-bit little endian value from fd and stores in val. * @@ -926,6 +1343,9 @@ cram_block *cram_new_block(enum cram_content_type content_type, b->alloc = 0; b->byte = 0; b->bit = 7; // MSB + b->crc32 = 0; + b->idx = 0; + b->m = NULL; return b; } @@ -948,9 +1368,9 @@ cram_block *cram_read_block(cram_fd *fd) { c = b->method; crc = crc32(crc, &c, 1); if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } c = b->content_type; crc = crc32(crc, &c, 1); - if (-1 == itf8_decode_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } - if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } - if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); @@ -1029,13 +1449,18 @@ uint32_t cram_block_size(cram_block *b) { * -1 on failure */ int cram_write_block(cram_fd *fd, cram_block *b) { + char vardata[100]; + int vardata_o = 0; + assert(b->method != RAW || (b->comp_size == b->uncomp_size)); if (hputc(b->method, fd->fp) == EOF) return -1; if (hputc(b->content_type, fd->fp) == EOF) return -1; - if (itf8_encode(fd, b->content_id) == -1) return -1; - if (itf8_encode(fd, b->comp_size) == -1) return -1; - if (itf8_encode(fd, b->uncomp_size) == -1) return -1; + vardata_o += fd->vv.varint_put32(vardata , vardata+100, b->content_id); + vardata_o += fd->vv.varint_put32(vardata+vardata_o,
vardata+100, b->comp_size); + vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size); + if (vardata_o != hwrite(fd->fp, vardata, vardata_o)) + return -1; if (b->data) { if (b->method == RAW) { @@ -1051,15 +1476,15 @@ int cram_write_block(cram_fd *fd, cram_block *b) { } if (CRAM_MAJOR_VERS(fd->version) >= 3) { - unsigned char dat[100], *cp = dat;; + char dat[100], *cp = (char *)dat; uint32_t crc; *cp++ = b->method; *cp++ = b->content_type; - cp += itf8_put((char*)cp, b->content_id); - cp += itf8_put((char*)cp, b->comp_size); - cp += itf8_put((char*)cp, b->uncomp_size); - crc = crc32(0L, dat, cp-dat); + cp += fd->vv.varint_put32(cp, dat+100, b->content_id); + cp += fd->vv.varint_put32(cp, dat+100, b->comp_size); + cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size); + crc = crc32(0L, (uc *)dat, cp-dat); if (b->method == RAW) { b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); @@ -1235,10 +1660,10 @@ int cram_uncompress_block(cram_block *b) { break; } - case NAME_TOK3: { + case TOK3: { uint32_t out_len; uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); - b->orig_method = NAME_TOK3; + b->orig_method = TOK3; b->method = RAW; free(b->data); b->data = cp; @@ -1380,11 +1805,11 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, return (char *)cp; } - case NAME_TOK3: - case NAME_TOKA: { + case TOK3: + case TOKA: { int out_len; int lev = level; - if (method == NAME_TOK3 && lev > 3) + if (method == TOK3 && lev > 3) lev = 3; uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL); *out_size = out_len; @@ -1416,6 +1841,9 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, cram_block *b, cram_metrics *metrics, int method, int level) { + if (!b) + return 0; + char *comp = NULL; size_t comp_size = 0; int strat; @@ -1508,13 +1936,13 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, method = (method|(1<metrics_lock); for (m = 0; m < CRAM_MAX_METHOD; m++) { - if (method & 
(1<version)+256; break; case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; - case NAME_TOK3:strat = 0; break; - case NAME_TOKA:strat = 1; break; + case TOK3: strat = 0; break; + case TOKA: strat = 1; break; default: strat = 0; } @@ -1547,13 +1975,10 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw } } - //fprintf(stderr, "sz_best = %d\n", sz_best); if (c_best) { free(b->data); b->data = (unsigned char *)c_best; - //printf("method_best = %s\n", cram_block_method2str(method_best)); - b->method = method_best; // adjusted to methmap[method_best] later b->comp_size = sz_best; } @@ -1625,7 +2050,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, } // else cost is ignored for (m = 0; m < CRAM_MAX_METHOD; m++) { - if ((!metrics->sz[m]) || (!(method & (1<sz[m]) || (!(method & (1u< metrics->sz[m]) @@ -1669,12 +2094,12 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, int mul = 1+(fd->level>=7); if (++metrics->cnt[m] >= MAXFAILS*mul && (metrics->extra[m] += r) >= MAXDELTA*mul) - method &= ~(1<sz[m] > best_sz) - method &= ~(1<pos_sorted = 1; c->max_apos = 0; c->multi_seq = 0; + c->qs_seq_orient = 1; c->bams = NULL; @@ -3191,7 +3617,7 @@ cram_container *cram_read_container(cram_fd *fd) { memset(&c2, 0, sizeof(c2)); if (CRAM_MAJOR_VERS(fd->version) == 1) { - if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) { + if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { fd->eof = fd->empty_container ? 1 : 2; return NULL; } else { @@ -3212,54 +3638,53 @@ cram_container *cram_read_container(cram_fd *fd) { len = le_int4(c2.length); crc = crc32(0L, (unsigned char *)&len, 4); } - if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. 
These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. - */ -#ifdef LARGE_POS - if ((s = ltf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; - if ((s = ltf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; -#else - int32_t i32; - if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; - c2.ref_seq_start = i32; - if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; - c2.ref_seq_span = i32; -#endif - if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + int64_t i64; + if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i64; + if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i64; + } else { + int32_t i32; + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i32; + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i32; + } + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) == 1) { c2.record_counter = 0; c2.num_bases = 0; } else { if (CRAM_MAJOR_VERS(fd->version) >= 3) { - if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1) + if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1) return NULL; else rd += s; } else { int32_t i32; - if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd += s; c2.record_counter = i32; } - if ((s = ltf8_decode_crc(fd, 
&c2.num_bases, &crc))== -1) + if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1) return NULL; else rd += s; } - if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc)) == -1) + return NULL; + else + rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1) + return NULL; + else + rd+=s; if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t)) return NULL; @@ -3275,7 +3700,7 @@ cram_container *cram_read_container(cram_fd *fd) { return NULL; } for (i = 0; i < c->num_landmarks; i++) { - if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) { + if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) { cram_free_container(c); return NULL; } else { @@ -3337,7 +3762,7 @@ int cram_container_size(cram_container *c) { */ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) { - unsigned char *cp = (unsigned char *)dat; + char *cp = (char *)dat; int i; // Check the input buffer is large enough according to our stated @@ -3346,41 +3771,39 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) return -1; if (CRAM_MAJOR_VERS(fd->version) == 1) { - cp += itf8_put((char*)cp, c->length); + cp += itf8_put(cp, c->length); } else { *(int32_t *)cp = le_int4(c->length); cp += 4; } if (c->multi_seq) { - cp += itf8_put((char*)cp, -2); - cp += itf8_put((char*)cp, 0); - cp += itf8_put((char*)cp, 0); + cp += fd->vv.varint_put32(cp, NULL, -2); + cp += fd->vv.varint_put32(cp, NULL, 0); + cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += itf8_put((char*)cp, c->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put((char*)cp, c->ref_seq_start); - cp += ltf8_put((char*)cp, c->ref_seq_span); -#else - cp += itf8_put((char*)cp, c->ref_seq_start); - cp += itf8_put((char*)cp, c->ref_seq_span); 
-#endif + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); + } } - cp += itf8_put((char*)cp, c->num_records); + cp += fd->vv.varint_put32(cp, NULL, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { - cp += itf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put64(cp, NULL, c->record_counter); } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += ltf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->record_counter); } - - cp += itf8_put((char*)cp, c->num_blocks); - cp += itf8_put((char*)cp, c->num_landmarks); + cp += fd->vv.varint_put64(cp, NULL, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); + cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); for (i = 0; i < c->num_landmarks; i++) - cp += itf8_put((char*)cp, c->landmark[i]); + cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); if (CRAM_MAJOR_VERS(fd->version) >= 3) { - c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat); + c->crc32 = crc32(0L, (uc *)dat, cp-dat); cp[0] = c->crc32 & 0xff; cp[1] = (c->crc32 >> 8) & 0xff; cp[2] = (c->crc32 >> 16) & 0xff; @@ -3388,7 +3811,7 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += 4; } - *size = (char *)cp-dat; // actual used size + *size = cp-dat; // actual used size return 0; } @@ -3401,50 +3824,49 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) * -1 on failure */ int cram_write_container(cram_fd *fd, cram_container *c) { - char buf_a[1024], *buf = buf_a; - unsigned char *cp; + char buf_a[1024], *buf = buf_a, *cp; int i; - if (55 + c->num_landmarks * 5 >= 1024) - buf = 
malloc(55 + c->num_landmarks * 5); - cp = (unsigned char *)buf; + if (61 + c->num_landmarks * 10 >= 1024) { + buf = malloc(61 + c->num_landmarks * 10); + if (!buf) + return -1; + } + cp = buf; if (CRAM_MAJOR_VERS(fd->version) == 1) { - cp += itf8_put((char*)cp, c->length); + cp += itf8_put(cp, c->length); } else { *(int32_t *)cp = le_int4(c->length); cp += 4; } if (c->multi_seq) { - cp += itf8_put((char*)cp, -2); - cp += itf8_put((char*)cp, 0); - cp += itf8_put((char*)cp, 0); + cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); + cp += fd->vv.varint_put32(cp, NULL, 0); + cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += itf8_put((char*)cp, c->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put((char*)cp, c->ref_seq_start); - cp += ltf8_put((char*)cp, c->ref_seq_span); -#else - cp += itf8_put((char*)cp, c->ref_seq_start); - cp += itf8_put((char*)cp, c->ref_seq_span); -#endif - } - cp += itf8_put((char*)cp, c->num_records); - if (CRAM_MAJOR_VERS(fd->version) == 2) { - cp += itf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); - } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += ltf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); + } } - - cp += itf8_put((char*)cp, c->num_blocks); - cp += itf8_put((char*)cp, c->num_landmarks); + cp += fd->vv.varint_put32(cp, NULL, c->num_records); + if (CRAM_MAJOR_VERS(fd->version) >= 3) + cp += fd->vv.varint_put64(cp, NULL, c->record_counter); + else + cp += fd->vv.varint_put32(cp, NULL, c->record_counter); + cp += fd->vv.varint_put64(cp, NULL, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); + cp += 
fd->vv.varint_put32(cp, NULL, c->num_landmarks); for (i = 0; i < c->num_landmarks; i++) - cp += itf8_put((char*)cp, c->landmark[i]); + cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); if (CRAM_MAJOR_VERS(fd->version) >= 3) { - c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf); + c->crc32 = crc32(0L, (uc *)buf, cp-buf); cp[0] = c->crc32 & 0xff; cp[1] = (c->crc32 >> 8) & 0xff; cp[2] = (c->crc32 >> 16) & 0xff; @@ -3452,7 +3874,7 @@ int cram_write_container(cram_fd *fd, cram_container *c) { cp += 4; } - if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) { + if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { if (buf != buf_a) free(buf); return -1; @@ -4044,8 +4466,8 @@ cram_file_def *cram_read_file_def(cram_fd *fd) { return NULL; } - if (def->major_version > 3) { - hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x or 3.x, got %d.%d", + if (def->major_version > 4) { + hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d", def->major_version, def->minor_version); free(def); return NULL; @@ -4134,9 +4556,9 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { } len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); /* Extract header from 1st block */ if (-1 == int32_get_blk(b, &header_len) || @@ -4163,9 +4585,9 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { return NULL; } len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); cram_free_block(b); } @@ -4359,9 +4781,9 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { if (blank_block) { c->length = b->comp_size + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - 
itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); c->num_blocks = 2; c->num_landmarks = 2; @@ -4376,8 +4798,8 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { // Plus extra storage for uncompressed secondary blank block padded_length = MIN(c->length*.5, 10000); c->length += padded_length + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - itf8_size(padded_length)*2; + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(padded_length)*2; } else { // Pad the block instead. c->num_blocks = 1; @@ -4390,9 +4812,9 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { c->length = b->comp_size + padded_length + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); if (NULL == (pads = calloc(1, padded_length))) { cram_free_block(b); @@ -4453,6 +4875,51 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { * The top-level cram opening, closing and option handling */ +/* + * Sets CRAM variable sized integer decode function tables. + * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64. + * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit + * (or anything inbetween), but also now supports signed values. + * + * Version is the CRAM major version number. 
+ * vv is the vector table (probably &cram_fd->vv) + */ +static void cram_init_varint(varint_vec *vv, int version) { + if (version >= 4) { + vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic + vv->varint_get32s = sint7_get_32; + vv->varint_get64 = uint7_get_64; + vv->varint_get64s = sint7_get_64; + vv->varint_put32 = uint7_put_32; + vv->varint_put32s = sint7_put_32; + vv->varint_put64 = uint7_put_64; + vv->varint_put64s = sint7_put_64; + vv->varint_put32_blk = uint7_put_blk_32; + vv->varint_put32s_blk = sint7_put_blk_32; + vv->varint_put64_blk = uint7_put_blk_64; + vv->varint_put64s_blk = sint7_put_blk_64; + vv->varint_size = uint7_size; + vv->varint_decode32_crc = uint7_decode_crc32; + vv->varint_decode64_crc = uint7_decode_crc64; + } else { + vv->varint_get32 = safe_itf8_get; + vv->varint_get32s = safe_itf8_get; + vv->varint_get64 = safe_ltf8_get; + vv->varint_get64s = safe_ltf8_get; + vv->varint_put32 = safe_itf8_put; + vv->varint_put32s = safe_itf8_put; + vv->varint_put64 = safe_ltf8_put; + vv->varint_put64s = safe_ltf8_put; + vv->varint_put32_blk = itf8_put_blk; + vv->varint_put32s_blk = itf8_put_blk; + vv->varint_put64_blk = ltf8_put_blk; + vv->varint_put64s_blk = ltf8_put_blk; + vv->varint_size = itf8_size; + vv->varint_decode32_crc = itf8_decode_crc; + vv->varint_decode64_crc = ltf8_decode_crc; + } +} + /* * Initialises the lookup tables. 
These could be global statics, but they're * clumsy to setup in a multi-threaded environment unless we generate @@ -4535,6 +5002,8 @@ static void cram_init_tables(cram_fd *fd) { fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; } + + cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version)); } // Default version numbers for CRAM @@ -4602,6 +5071,8 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->version = fd->file_def->major_version * 256 + fd->file_def->minor_version; + cram_init_tables(fd); + if (!(fd->header = cram_read_SAM_hdr(fd))) { cram_free_file_def(fd->file_def); goto err; @@ -4625,12 +5096,11 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { strncpy(def->file_id, filename, 20); fd->version = major_version * 256 + minor_version; + cram_init_tables(fd); /* SAM header written later along with this file_def */ } - cram_init_tables(fd); - fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); if (!fd->prefix) goto err; @@ -4754,6 +5224,88 @@ int cram_flush(cram_fd *fd) { return 0; } +/* + * Writes an EOF block to a CRAM file. 
+ * + * Returns 0 on success + * -1 on failure + */ +int cram_write_eof_block(cram_fd *fd) { + // EOF block is a container with special values to aid detection + if (CRAM_MAJOR_VERS(fd->version) >= 2) { + // Empty container with + // ref_seq_id -1 + // start pos 0x454f46 ("EOF") + // span 0 + // nrec 0 + // counter 0 + // nbases 0 + // 1 block (landmark 0) + // (CRC32) + cram_container c; + memset(&c, 0, sizeof(c)); + c.ref_seq_id = -1; + c.ref_seq_start = 0x454f46; // "EOF" + c.ref_seq_span = 0; + c.record_counter = 0; + c.num_bases = 0; + c.num_blocks = 1; + int32_t land[1] = {0}; + c.landmark = land; + + // An empty compression header block with + // method raw (0) + // type comp header (1) + // content id 0 + // block contents size 6 + // raw size 6 + // empty preservation map (01 00) + // empty data series map (01 00) + // empty tag map (01 00) + // block CRC + cram_block_compression_hdr ch; + memset(&ch, 0, sizeof(ch)); + c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch); + + c.length = c.comp_hdr_block->byte // Landmark[0] + + 5 // block struct + + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC + if (cram_write_container(fd, &c) < 0 || + cram_write_block(fd, c.comp_hdr_block) < 0) { + cram_close(fd); + cram_free_block(c.comp_hdr_block); + return -1; + } + cram_free_block(c.comp_hdr_block); + + // V2.1 bytes + // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id + // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + + // V3.0 bytes: + // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id + // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // 05 bd d9 4f // CRC32 + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + // ee 63 01 4b // CRC32 + + // V4.0 bytes: + // 0f 00 00 00 8f ff ff ff // Cont HDR: size, ref seq id + 
// 82 95 9e 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // ac d6 05 bc // CRC32 + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + // ee 63 01 4b // CRC32 + } + + return 0; +} /* * Closes a CRAM file. * Returns 0 on success @@ -4797,25 +5349,8 @@ int cram_close(cram_fd *fd) { if (fd->mode == 'w') { /* Write EOF block */ - if (CRAM_MAJOR_VERS(fd->version) == 3) { - if (38 != hwrite(fd->fp, - "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR - "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR - "\x00\x01\x00" // Cont HDR - "\x05\xbd\xd9\x4f" // CRC32 - "\x00\x01\x00\x06\x06" // Comp.HDR blk - "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk - "\xee\x63\x01\x4b", // CRC32 - 38)) - return -1; - } else { - if (30 != hwrite(fd->fp, - "\x0b\x00\x00\x00\xff\xff\xff\xff" - "\x0f\xe0\x45\x4f\x46\x00\x00\x00" - "\x00\x01\x00\x00\x01\x00\x06\x06" - "\x01\x00\x01\x00\x01\x00", 30)) - return -1; - } + if (0 != cram_write_eof_block(fd)) + return -1; } for (bl = fd->bl; bl; bl = next) { @@ -5048,8 +5583,9 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } if (!((major == 1 && minor == 0) || (major == 2 && (minor == 0 || minor == 1)) || - (major == 3 && (minor == 0 || minor == 1)))) { - hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0 or 3.1"); + (major == 3 && (minor == 0 || minor == 1)) || + (major == 4 && minor == 0))) { + hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0"); errno = EINVAL; return -1; } @@ -5066,8 +5602,11 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; - fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) >= 3 && - CRAM_MINOR_VERS(fd->version) >= 1)) ? 1 : 0; + fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 && + CRAM_MINOR_VERS(fd->version) >= 1) || + CRAM_MAJOR_VERS(fd->version) >= 4) ? 
1 : 0; + cram_init_tables(fd); + break; } diff --git a/cram/cram_io.h b/cram/cram_io.h index 3954a49af..7d787e44c 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -68,308 +68,9 @@ extern "C" { */ int itf8_decode(cram_fd *fd, int32_t *val); -static inline int itf8_get(char *cp, int32_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = ((up[0] <<8) | up[1]) & 0x3fff; - return 2; - } else if (up[0] < 0xe0) { - *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; - return 3; - } else if (up[0] < 0xf0) { - *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; - return 4; - } else { - *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); - return 5; - } -} - -/* - * Stores a value to memory in ITF-8 format. - * - * Returns the number of bytes required to store the number. - * This is a maximum of 5 bytes. - */ -static inline int itf8_put(char *cp, int32_t val) { - unsigned char *up = (unsigned char *)cp; - if (!(val & ~0x00000007f)) { // 1 byte - *up = val; - return 1; - } else if (!(val & ~0x00003fff)) { // 2 byte - *up++ = (val >> 8 ) | 0x80; - *up = val & 0xff; - return 2; - } else if (!(val & ~0x01fffff)) { // 3 byte - *up++ = (val >> 16) | 0xc0; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 3; - } else if (!(val & ~0x0fffffff)) { // 4 byte - *up++ = (val >> 24) | 0xe0; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 4; - } else { // 5 byte - *up++ = 0xf0 | ((val>>28) & 0xff); - *up++ = (val >> 20) & 0xff; - *up++ = (val >> 12) & 0xff; - *up++ = (val >> 4 ) & 0xff; - *up = val & 0x0f; - return 5; - } -} - - -/* 64-bit itf8 variant */ -static inline int ltf8_put(char *cp, int64_t val) { - unsigned char *up = (unsigned char *)cp; - if (!(val & ~((1LL<<7)-1))) { - *up = val; - return 1; - } else if (!(val & ~((1LL<<(6+8))-1))) { - *up++ = (val >> 8 ) | 0x80; - 
*up = val & 0xff; - return 2; - } else if (!(val & ~((1LL<<(5+2*8))-1))) { - *up++ = (val >> 16) | 0xc0; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 3; - } else if (!(val & ~((1LL<<(4+3*8))-1))) { - *up++ = (val >> 24) | 0xe0; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 4; - } else if (!(val & ~((1LL<<(3+4*8))-1))) { - *up++ = (val >> 32) | 0xf0; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 5; - } else if (!(val & ~((1LL<<(2+5*8))-1))) { - *up++ = (val >> 40) | 0xf8; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 6; - } else if (!(val & ~((1LL<<(1+6*8))-1))) { - *up++ = (val >> 48) | 0xfc; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 7; - } else if (!(val & ~((1LL<<(7*8))-1))) { - *up++ = (val >> 56) | 0xfe; - *up++ = (val >> 48) & 0xff; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 8; - } else { - *up++ = 0xff; - *up++ = (val >> 56) & 0xff; - *up++ = (val >> 48) & 0xff; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 9; - } -} - -static inline int ltf8_get(char *cp, int64_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = (((uint64_t)up[0]<< 8) | - (uint64_t)up[1]) & (((1LL<<(6+8)))-1); - return 2; - } else if (up[0] < 0xe0) { - *val_p = (((uint64_t)up[0]<<16) | - ((uint64_t)up[1]<< 8) | - (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); - return 3; - } 
else if (up[0] < 0xf0) { - *val_p = (((uint64_t)up[0]<<24) | - ((uint64_t)up[1]<<16) | - ((uint64_t)up[2]<< 8) | - (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); - return 4; - } else if (up[0] < 0xf8) { - *val_p = (((uint64_t)up[0]<<32) | - ((uint64_t)up[1]<<24) | - ((uint64_t)up[2]<<16) | - ((uint64_t)up[3]<< 8) | - (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); - return 5; - } else if (up[0] < 0xfc) { - *val_p = (((uint64_t)up[0]<<40) | - ((uint64_t)up[1]<<32) | - ((uint64_t)up[2]<<24) | - ((uint64_t)up[3]<<16) | - ((uint64_t)up[4]<< 8) | - (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); - return 6; - } else if (up[0] < 0xfe) { - *val_p = (((uint64_t)up[0]<<48) | - ((uint64_t)up[1]<<40) | - ((uint64_t)up[2]<<32) | - ((uint64_t)up[3]<<24) | - ((uint64_t)up[4]<<16) | - ((uint64_t)up[5]<< 8) | - (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); - return 7; - } else if (up[0] < 0xff) { - *val_p = (((uint64_t)up[1]<<48) | - ((uint64_t)up[2]<<40) | - ((uint64_t)up[3]<<32) | - ((uint64_t)up[4]<<24) | - ((uint64_t)up[5]<<16) | - ((uint64_t)up[6]<< 8) | - (uint64_t)up[7]) & ((1LL<<(7*8))-1); - return 8; - } else { - *val_p = (((uint64_t)up[1]<<56) | - ((uint64_t)up[2]<<48) | - ((uint64_t)up[3]<<40) | - ((uint64_t)up[4]<<32) | - ((uint64_t)up[5]<<24) | - ((uint64_t)up[6]<<16) | - ((uint64_t)up[7]<< 8) | - (uint64_t)up[8]); - return 9; - } -} - -#define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5) - - -/* Version of itf8_get that checks it hasn't run out of input */ - extern const int itf8_bytes[16]; extern const int ltf8_bytes[256]; -static inline int safe_itf8_get(const char *cp, const char *endp, - int32_t *val_p) { - const unsigned char *up = (unsigned char *)cp; - - if (endp - cp < 5 && - (cp >= endp || endp - cp < itf8_bytes[up[0]>>4])) { - *val_p = 0; - return 0; - } - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = ((up[0] <<8) | up[1]) & 0x3fff; - return 2; - } else if (up[0] < 0xe0) { - *val_p = 
((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; - return 3; - } else if (up[0] < 0xf0) { - *val_p = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; - return 4; - } else { - uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); - *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1; - return 5; - } -} - -static inline int safe_ltf8_get(const char *cp, const char *endp, - int64_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (endp - cp < 9 && - (cp >= endp || endp - cp < ltf8_bytes[up[0]])) return 0; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = (((uint64_t)up[0]<< 8) | - (uint64_t)up[1]) & (((1LL<<(6+8)))-1); - return 2; - } else if (up[0] < 0xe0) { - *val_p = (((uint64_t)up[0]<<16) | - ((uint64_t)up[1]<< 8) | - (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); - return 3; - } else if (up[0] < 0xf0) { - *val_p = (((uint64_t)up[0]<<24) | - ((uint64_t)up[1]<<16) | - ((uint64_t)up[2]<< 8) | - (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); - return 4; - } else if (up[0] < 0xf8) { - *val_p = (((uint64_t)up[0]<<32) | - ((uint64_t)up[1]<<24) | - ((uint64_t)up[2]<<16) | - ((uint64_t)up[3]<< 8) | - (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); - return 5; - } else if (up[0] < 0xfc) { - *val_p = (((uint64_t)up[0]<<40) | - ((uint64_t)up[1]<<32) | - ((uint64_t)up[2]<<24) | - ((uint64_t)up[3]<<16) | - ((uint64_t)up[4]<< 8) | - (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); - return 6; - } else if (up[0] < 0xfe) { - *val_p = (((uint64_t)up[0]<<48) | - ((uint64_t)up[1]<<40) | - ((uint64_t)up[2]<<32) | - ((uint64_t)up[3]<<24) | - ((uint64_t)up[4]<<16) | - ((uint64_t)up[5]<< 8) | - (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); - return 7; - } else if (up[0] < 0xff) { - *val_p = (((uint64_t)up[1]<<48) | - ((uint64_t)up[2]<<40) | - ((uint64_t)up[3]<<32) | - ((uint64_t)up[4]<<24) | - ((uint64_t)up[5]<<16) | - ((uint64_t)up[6]<< 8) | - (uint64_t)up[7]) & 
((1LL<<(7*8))-1); - return 8; - } else { - *val_p = (((uint64_t)up[1]<<56) | - ((uint64_t)up[2]<<48) | - ((uint64_t)up[3]<<40) | - ((uint64_t)up[4]<<32) | - ((uint64_t)up[5]<<24) | - ((uint64_t)up[6]<<16) | - ((uint64_t)up[7]<< 8) | - (uint64_t)up[8]); - return 9; - } -} - /*! Pushes a value in ITF8 format onto the end of a block. * * This shouldn't be used for high-volume data as it is not the fastest @@ -544,8 +245,10 @@ static inline int block_append(cram_block *b, const void *s, size_t len) { if (block_grow(b, len) < 0) return -1; - memcpy(BLOCK_END(b), s, len); - BLOCK_SIZE(b) += len; + if (len) { + memcpy(BLOCK_END(b), s, len); + BLOCK_SIZE(b) += len; + } return 0; } diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 1c51b09f7..6a0dc3fba 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -111,7 +111,10 @@ enum cram_encoding { E_SUBEXP = 7, E_GOLOMB_RICE = 8, E_GAMMA = 9, - E_NUM_CODECS = 10, /* Number of codecs, not a real one. */ + E_XPACK = 11, // Transform to sub-codec + E_XRLE = 12, // Transform to sub-codec + E_XDELTA = 13, // Transform to sub-codec + E_NUM_CODECS, /* Total number of codecs, not a real one. */ }; enum cram_external_type { @@ -120,6 +123,8 @@ enum cram_external_type { E_BYTE = 3, E_BYTE_ARRAY = 4, E_BYTE_ARRAY_BLOCK = 5, + E_SINT = 6, // signed INT + E_SLONG = 7, // signed LONG }; /* External IDs used by this implementation (only assumed during writing) */ @@ -283,6 +288,7 @@ struct cram_block_compression_hdr { // indexed by ref-base and subst. code char substitution_matrix[5][4]; int no_ref; + int qs_seq_orient; // 1 => same as seq. 
0 => original orientation // TD Dictionary as a concatenated block cram_block *TD_blk; // Tag Dictionary @@ -299,6 +305,9 @@ struct cram_block_compression_hdr { char *uncomp; // A single block of uncompressed data size_t uncomp_size, uncomp_alloc; + + // Total codec count, used for index to block_by_id for transforms + int ncodecs; }; typedef struct cram_map { @@ -313,6 +322,7 @@ typedef struct cram_map { typedef struct cram_tag_map { struct cram_codec *codec; cram_block *blk; + cram_block *blk2; cram_metrics *m; } cram_tag_map; @@ -378,6 +388,7 @@ struct cram_container { int last_slice; // number of reads in last slice (0 for 1st) int multi_seq; // true if packing multi seqs per cont/slice int unsorted; // true is AP_delta is 0. + int qs_seq_orient; // 1 => same as seq. 0 => original orientation /* Copied from fd before encoding, to allow multi-threading */ int ref_start, first_base, last_base, ref_id, ref_end; @@ -418,6 +429,7 @@ typedef struct cram_record { int32_t mate_ref_id; int64_t mate_pos; // NP int64_t tlen; // TS + int64_t explicit_tlen;// TS, but PNEXT/RNEXT still need auto-computing // Auxiliary data int32_t ntags; // TC @@ -663,6 +675,35 @@ typedef struct spare_bams { struct spare_bams *next; } spare_bams; +struct cram_fd; +typedef struct varint_vec { + // Returns number of bytes decoded from fd, 0 on error + int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); + int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc); + + // Returns the value and increments *cp. Sets err to 1 iff an error occurs. + // NOTE: Does not set err to 0 on success. + int64_t (*varint_get32) (char **cp, const char *endp, int *err); + int64_t (*varint_get32s)(char **cp, const char *endp, int *err); + int64_t (*varint_get64) (char **cp, const char *endp, int *err); + int64_t (*varint_get64s)(char **cp, const char *endp, int *err); + + // Returns the number of bytes written, <= 0 on error. 
+ int (*varint_put32) (char *cp, const char *endp, int32_t val_p); + int (*varint_put32s)(char *cp, const char *endp, int32_t val_p); + int (*varint_put64) (char *cp, const char *endp, int64_t val_p); + int (*varint_put64s)(char *cp, const char *endp, int64_t val_p); + + // Returns the number of bytes written, <= 0 on error. + int (*varint_put32_blk) (cram_block *blk, int32_t val_p); + int (*varint_put32s_blk)(cram_block *blk, int32_t val_p); + int (*varint_put64_blk) (cram_block *blk, int64_t val_p); + int (*varint_put64s_blk)(cram_block *blk, int64_t val_p); + + // Returns number of bytes needed to encode 'val' + int (*varint_size)(int64_t val); +} varint_vec; + struct cram_fd { struct hFILE *fp; int mode; // 'r' or 'w' @@ -757,6 +798,10 @@ struct cram_fd { int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) BGZF *idxfp; // File pointer for on-the-fly index creation + + // variable integer decoding callbacks. + // This changed in CRAM4.0 to a data-size agnostic encoding. + varint_vec vv; }; // Translation of required fields to cram data series @@ -838,7 +883,8 @@ enum cram_fields { #define CRAM_FLAG_DETACHED (1<<1) #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) #define CRAM_FLAG_NO_SEQ (1<<3) -#define CRAM_FLAG_MASK ((1<<4)-1) +#define CRAM_FLAG_EXPLICIT_TLEN (1<<4) +#define CRAM_FLAG_MASK ((1<<5)-1) /* Internal only */ #define CRAM_FLAG_STATS_ADDED (1<<30) diff --git a/hts.c b/hts.c index 8e4552f87..69b736121 100644 --- a/hts.c +++ b/hts.c @@ -1169,7 +1169,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) fp->fp.cram = cram_dopen(hfile, fn, simple_mode); if (fp->fp.cram == NULL) goto error; if (!fp->is_write) - cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); + cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto fp->is_cram = 1; break; diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index d947f985f..d40bbd095 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -3,6 +3,7 @@ HTSCODECS_SOURCES = 
$(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ + $(HTSPREFIX)htscodecs/htscodecs/rle.c \ $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) diff --git a/htscodecs_external.mk b/htscodecs_external.mk index f8d4d7d4a..f1d82faf8 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -6,6 +6,7 @@ htscodecs_fqzcomp_qual_h = htscodecs_pack_h = htscodecs_rANS_static_h = htscodecs_rANS_static4x16_h = +htscodecs_rle_h = htscodecs_tokenise_name3_h = htscodecs_varint_h = diff --git a/htslib/cram.h b/htslib/cram.h index 890896388..5446945e4 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -48,16 +48,22 @@ extern "C" { #endif enum cram_block_method { + // Public methods as defined in the CRAM spec. BM_ERROR = -1, + + // CRAM 2.x and 3.0 RAW = 0, GZIP = 1, BZIP2 = 2, LZMA = 3, RANS = 4, RANS0 = RANS, + + // CRAM 3.1 onwards RANSPR = 5, RANS_PR0 = RANSPR, ARITH = 6, ARITH_PR0 = ARITH, FQZ = 7, - TOK3 = 8, NAME_TOK3 = TOK3, + TOK3 = 8, + // BSC = 9, ZSTD = 10 // Methods not externalised, but used in metrics. // Externally they become one of the above methods. @@ -78,8 +84,8 @@ enum cram_block_method { RANS_PR192, // O0 + RLE + pack RANS_PR193, // O1 + RLE + pack - //NAME_TOK3, // tok+rans - NAME_TOKA, // tok+arith + //TOK3, // tok+rans + TOKA, // tok+arith //ARITH_PR0, // Order 0 ARITH_PR1, // Order 1 diff --git a/test/test.pl b/test/test.pl index 823a414c5..8def186d9 100755 --- a/test/test.pl +++ b/test/test.pl @@ -610,6 +610,34 @@ sub test_view testv $opts, "./test_view $tv_args $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + ## Experimental CRAM 4.0 support. 
+ # SAM -> CRAM40u -> SAM + foreach my $profile (qw/fast normal small archive/) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=4.0 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } + + # BAM -> CRAM40 -> BAM -> SAM + $cram = "$bam.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 $bam > $cram"; + testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; + testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; + + # CRAM40 -> CRAM30 + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram > $cram.cram"; + + # CRAM30 -> CRAM40 + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 $cram.cram > $cram"; + + # CRAM40 -> CRAM40 + multi-slice + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; + testv $opts, "./test_view $tv_args $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + # Java pre-made CRAM -> SAM my $jcram = "${base}_java.cram"; if (-e $jcram) { From 192c8c0f78c657f68122ab3ed64edff64fda024d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 29 Sep 2020 15:28:34 +0100 Subject: [PATCH 043/114] Added a "pos_delta" option for forcing AP data series delta. Normally sorted data deltas the positions and unsorted does not. This allows us to explicitly reenable the AP=1 container header field to do deltaing, as it's sometimes useful on unsorted data. Eg pairs are normally neighbouring position, so it's still a bit better to compress than leaving them as-is. 
--- cram/cram_decode.c | 10 +++++++++- cram/cram_encode.c | 5 +++-- cram/cram_io.c | 5 +++++ cram/cram_structs.h | 6 ++++++ hts.c | 4 ++++ htslib/hts.h | 1 + 6 files changed, 28 insertions(+), 3 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 06e297331..8735aafa5 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2565,8 +2565,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->apos = i32; } if (r) goto block_err;; - if (c->comp_hdr->AP_delta) + if (c->comp_hdr->AP_delta) { + if (cr->apos < 0 && c->unsorted == 0) { + // cache locally in c->unsorted so we don't have an + // excessive number of locks + pthread_mutex_lock(&fd->ref_lock); + c->unsorted = fd->unsorted = 1; + pthread_mutex_unlock(&fd->ref_lock); + } cr->apos += s->last_apos; + } s->last_apos= cr->apos; } else { cr->apos = c->ref_seq_start; diff --git a/cram/cram_encode.c b/cram/cram_encode.c index cdd73b8a7..bb5aaea76 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1866,6 +1866,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->ref_seq_span = c->ref_seq_span; h->num_records = c->num_records; h->qs_seq_orient = c->qs_seq_orient; + // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1) h->AP_delta = c->pos_sorted; memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); @@ -2707,7 +2708,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, c->num_bases += cr->len; cr->apos = bam_pos(b)+1; if (c->pos_sorted) { - if (cr->apos < s->last_apos) { + if (cr->apos < s->last_apos && !fd->ap_delta) { c->pos_sorted = 0; } else { if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0) @@ -3320,7 +3321,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { // We detected we need multi-seq fd->multi_seq = 1; c->multi_seq = 1; - c->pos_sorted = 0; // required atm for multi_seq slices + c->pos_sorted = 0; if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); diff --git a/cram/cram_io.c 
b/cram/cram_io.c index 3583021f7..1f118aae4 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5121,6 +5121,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->slices_per_container = SLICE_PER_CNT; fd->embed_ref = 0; fd->no_ref = 0; + fd->ap_delta = 0; fd->ignore_md5 = 0; fd->lossy_read_names = 0; fd->use_bz2 = 0; @@ -5494,6 +5495,10 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->no_ref = va_arg(args, int); break; + case CRAM_OPT_POS_DELTA: + fd->ap_delta = va_arg(args, int); + break; + case CRAM_OPT_IGNORE_MD5: fd->ignore_md5 = va_arg(args, int); break; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 6a0dc3fba..a54e2ec6f 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -802,6 +802,12 @@ struct cram_fd { // variable integer decoding callbacks. // This changed in CRAM4.0 to a data-size agnostic encoding. varint_vec vv; + + // Force AP delta even on non positional sorted data. + // This can be beneficial for pairs where pairs are nearby each other. + // We suffer with delta to unrelated things (previous pair), but gain + // in delta between them. (Ideal would be a per read setting.) 
+ int ap_delta; }; // Translation of required fields to cram data series diff --git a/hts.c b/hts.c index 69b736121..72041d0d5 100644 --- a/hts.c +++ b/hts.c @@ -745,6 +745,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "NO_REF") == 0) o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val); + else if (strcmp(o->arg, "pos_delta") == 0 || + strcmp(o->arg, "POS_DELTA") == 0) + o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val); + else if (strcmp(o->arg, "ignore_md5") == 0 || strcmp(o->arg, "IGNORE_MD5") == 0) o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val); diff --git a/htslib/hts.h b/htslib/hts.h index 3c6a3dcb6..0f2fdd021 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -319,6 +319,7 @@ enum hts_fmt_option { CRAM_OPT_USE_TOK, CRAM_OPT_USE_FQZ, CRAM_OPT_USE_ARITH, + CRAM_OPT_POS_DELTA, // force delta for AP, even on non-pos sorted data // General purpose HTS_OPT_COMPRESSION_LEVEL = 100, From b8c3bafffea118eda042383c16cdb4c01fec9f01 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 30 Sep 2020 14:16:51 +0100 Subject: [PATCH 044/114] An alternative (but still commented out) strategy for multi-base vars. This does some basic analysis to spot grouped variants and if and only if long enough and scoring +ve will it switch to 'b'(?) feature. --- cram/cram_encode.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index bb5aaea76..c46460144 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -2804,6 +2804,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (!sp[l]) break; if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { +#if 0 // Disabled for the time being as it doesn't // seem to gain us much. int ol=l; @@ -2821,6 +2822,43 @@ static int process_one_read(cram_fd *fd, cram_container *c, qp[l], rp[l])) return -1; } +#else + // With urmap pushed to the limit and lots + // of unaligned data (should be soft-clipped) + // this saves ~2-7%. Worth it? 
+ int nl = l; + int max_end = nl, max_score = 0, score = 0; + while (nl < end) { + if (rp[nl] != sp[nl]) { + score += 3; + if (max_score < score) { + max_score = score; + max_end = nl; + } + } else { + score--; + if (score < -2 || + max_score - score > 7) + break; + } + nl++; + } + if (max_score > 20) { + cram_add_bases(fd, c, s, cr, spos+l, + max_end-l, &seq[spos+l]); + l = max_end-1; + } else { + while (l < nl) { + if (rp[l] != sp[l]) + cram_add_substitution(fd, c, s, + cr, spos+l, + sp[l], qp[l], + rp[l]); + l++; + } + l--; + } +#endif } else { if (cram_add_substitution(fd, c, s, cr, spos+l, sp[l], qp[l], rp[l])) From 1a69f852929b667a65b09f0226ca588c6080c118 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 23 Oct 2020 17:53:06 +0100 Subject: [PATCH 045/114] Small cram encoding speed improvements. - We no longer keep GZIP_RLE used when libdeflate is compiled in because it doesn't support strategy in the same manner as zlib. Instead we use GZIP level 1. - Adjusted the fixed small-block compensation code. It's now equiv to a 60 byte difference required before changing up to the next tier in complexity. This had no impact on my novaseq test normally, but with bzip2+lzma added in it saved 6% CPU (at a cost of 0.14% file growth). 
--- cram/cram_io.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 1f118aae4..3cea2ba4f 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1066,6 +1066,9 @@ static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, if (level >= 8) level += level/8; // 8->10, 9->12 if (level > 12) level = 12; + if (strat == Z_RLE) // not supported by libdeflate + level = 1; + struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); if (!z) { hts_log_error("Call to libdeflate_alloc_compressor failed"); @@ -1939,6 +1942,15 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, if (method & (1u<metrics_lock); for (m = 0; m < CRAM_MAX_METHOD; m++) { @@ -1986,7 +1998,11 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, // Accumulate stats for all methods tried pthread_mutex_lock(&fd->metrics_lock); for (m = 0; m < CRAM_MAX_METHOD; m++) - metrics->sz[m] += sz[m]+50; // don't be overly sure on small blocks + // don't be overly sure on small blocks. + // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1 + // needs to be at least 60 bytes smaller to overcome the + // fixed size addition. + metrics->sz[m] += sz[m]+2000; // When enough trials performed, find the best on average if (--metrics->trial == 0) { @@ -2058,7 +2074,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, } if (best_method != metrics->method) { - metrics->trial = (NTRIALS+1)/2; // be sure + //metrics->trial = (NTRIALS+1)/2; // be sure //metrics->next_trial /= 1.5; metrics->consistency = 0; } else { From 46ec44cf2184740aaac91aaaa358fda2ee792e67 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Oct 2020 10:33:14 +0000 Subject: [PATCH 046/114] CRAM 4 updates. Same as io_lib's eda7cfe and 16c19ee. 
This replaces the length field in containers with a variable size quantity instead of 32-bit int, for consistency with other fields, and marks a few fields as using a signed encoding strategy (necessary for the next commit). [The CRAM 4 specification moved on slightly since the original date of this PR.] --- .cirrus.yml | 2 +- cram/cram_decode.c | 11 +++++--- cram/cram_encode.c | 10 ++++--- cram/cram_io.c | 67 +++++++++++++++++++++++++++++++++++++++++---- cram/cram_structs.h | 1 + htscodecs | 2 +- 6 files changed, 78 insertions(+), 15 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index d4256b11a..6b9bcd8cf 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -4,7 +4,7 @@ # Sadly though there is still a finite limit to macOS of one instance. # Can we cull our Mac test to just one instance? -timeout_in: 10m +timeout_in: 20m #-------------------------------------------------- # Template: build libdeflate dependency diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 8735aafa5..e4ee8986b 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -392,7 +392,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'C' && key[1] == 'F') { ds_id = DS_CF; type = E_INT; } else if (key[0] == 'R' && key[1] == 'I') { - ds_id = DS_RI; type = E_INT; + ds_id = DS_RI; type = E_SINT; } else if (key[0] == 'R' && key[1] == 'L') { ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { @@ -404,7 +404,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'M' && key[1] == 'F') { ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { - ds_id = DS_NS; type = E_INT; + ds_id = DS_NS; type = E_SINT; } else if (key[0] == 'N' && key[1] == 'P') { ds_id = DS_NP; type = is_v4 ? 
E_LONG : E_INT; @@ -968,7 +968,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { hdr->content_type = b->content_type; if (b->content_type == MAPPED_SLICE) { - hdr->ref_seq_id = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err); if (CRAM_MAJOR_VERS(fd->version) >= 4) { hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); hdr->ref_seq_span = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); @@ -2298,7 +2298,10 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, return -1; ref_id = s->hdr->ref_seq_id; - embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; + if (CRAM_MAJOR_VERS(fd->version) < 4) + embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; + else + embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0; if (ref_id >= 0) { if (embed_ref) { diff --git a/cram/cram_encode.c b/cram/cram_encode.c index c46460144..89c79fa8a 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -508,7 +508,7 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { return NULL; } - cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_id); + cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id); if (CRAM_MAJOR_VERS(fd->version) >= 4) { cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start); cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span); @@ -1084,7 +1084,9 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, */ /* Create cram slice header */ - s->hdr->ref_base_id = embed_ref ? DS_ref : -1; + s->hdr->ref_base_id = embed_ref && s->hdr->ref_seq_span > 0 + ? DS_ref + : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 
0 : -1); s->hdr->record_counter = c->num_records + c->record_counter; c->num_records += s->hdr->num_records; @@ -1659,7 +1661,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), - c->stats[DS_NS], E_INT, NULL, + c->stats[DS_NS], E_SINT, NULL, fd->version, &fd->vv); if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; @@ -1781,7 +1783,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), - c->stats[DS_RI], E_INT, NULL, + c->stats[DS_RI], E_SINT, NULL, fd->version, &fd->vv); if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; diff --git a/cram/cram_io.c b/cram/cram_io.c index 3cea2ba4f..7cbd41c05 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -898,6 +898,52 @@ static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { return i; } +// Decode 32-bits with CRC update from cram_fd +static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { + uint8_t b[5], i = 0; + int c; + uint32_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u32(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 5 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = (v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = (v>>1) ^ -(v&1); + return i; +} + + // Decode 64-bits with CRC update from cram_fd 
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) { uint8_t b[10], i = 0; @@ -3639,7 +3685,7 @@ cram_container *cram_read_container(cram_fd *fd) { } else { rd+=s; } - } else { + } else if (CRAM_MAJOR_VERS(fd->version) < 4) { uint32_t len; if ((s = int32_decode(fd, &c2.length)) == -1) { if (CRAM_MAJOR_VERS(fd->version) == 2 && @@ -3653,8 +3699,15 @@ cram_container *cram_read_container(cram_fd *fd) { } len = le_int4(c2.length); crc = crc32(0L, (unsigned char *)&len, 4); + } else { + if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { + fd->eof = fd->empty_container ? 1 : 2; + return NULL; + } else { + rd+=s; + } } - if ((s = fd->vv.varint_decode32_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) >= 4) { int64_t i64; if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s; @@ -3797,7 +3850,7 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += fd->vv.varint_put32(cp, NULL, 0); cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); if (CRAM_MAJOR_VERS(fd->version) >= 4) { cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); @@ -3852,16 +3905,18 @@ int cram_write_container(cram_fd *fd, cram_container *c) { if (CRAM_MAJOR_VERS(fd->version) == 1) { cp += itf8_put(cp, c->length); - } else { + } else if (CRAM_MAJOR_VERS(fd->version) <= 3) { *(int32_t *)cp = le_int4(c->length); cp += 4; + } else { + cp += fd->vv.varint_put32(cp, NULL, c->length); } if (c->multi_seq) { cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); cp += fd->vv.varint_put32(cp, NULL, 0); cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + cp += 
fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); if (CRAM_MAJOR_VERS(fd->version) >= 4) { cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); @@ -4916,6 +4971,7 @@ static void cram_init_varint(varint_vec *vv, int version) { vv->varint_put64s_blk = sint7_put_blk_64; vv->varint_size = uint7_size; vv->varint_decode32_crc = uint7_decode_crc32; + vv->varint_decode32s_crc = sint7_decode_crc32; vv->varint_decode64_crc = uint7_decode_crc64; } else { vv->varint_get32 = safe_itf8_get; @@ -4932,6 +4988,7 @@ static void cram_init_varint(varint_vec *vv, int version) { vv->varint_put64s_blk = ltf8_put_blk; vv->varint_size = itf8_size; vv->varint_decode32_crc = itf8_decode_crc; + vv->varint_decode32s_crc = itf8_decode_crc; vv->varint_decode64_crc = ltf8_decode_crc; } } diff --git a/cram/cram_structs.h b/cram/cram_structs.h index a54e2ec6f..5c39eca56 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -679,6 +679,7 @@ struct cram_fd; typedef struct varint_vec { // Returns number of bytes decoded from fd, 0 on error int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); + int (*varint_decode32s_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc); // Returns the value and increments *cp. Sets err to 1 iff an error occurs. diff --git a/htscodecs b/htscodecs index 99ed6bcca..dca826bbc 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 99ed6bcca2192a3c210fac04939c7e51a0b15a3c +Subproject commit dca826bbc598d6dcae450e92e60dcf789454999d From 96c64aa1f56fdb724565d7a56798515d27cbb1f9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Oct 2020 15:26:40 +0000 Subject: [PATCH 047/114] Next update for CRAM 4.0; now matching io_lib. This is the htslib equivalent to io_lib's 090dfd3f and d86b008. It is now in adherence to the draft CRAM 4 spec.
It removes the E_SINT and E_SLONG types and replaces E_EXTERNAL with E_BYTE, E_VARINT_SIGNED and E_VARINT_UNSIGNED. Integer data types aren't explicitly signed or unsigned but simply use the codec that works for them. So if all ref-ids are >= 0 it'll be VARINT_UNSIGNED, otherwise VARINT_SIGNED. This future proofs things by adding introspection to the format. Also removed HUFFMAN from CRAM 4. It was only being used for storing constant values, so we now have CONST_BYTE and CONST_INT instead (which always uses a signed encoding). This should be the final removal of ever encoding negatives as if they were positives (eg -1 as 0xFFFFFFFF) and so in turn removes all size limitations on values. --- cram/cram_codecs.c | 700 ++++++++++++++++++++++++++++++++++++++------ cram/cram_codecs.h | 15 + cram/cram_decode.c | 6 +- cram/cram_encode.c | 77 +++-- cram/cram_stats.c | 16 +- cram/cram_structs.h | 34 ++- 6 files changed, 726 insertions(+), 122 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index a598f3964..783fc393d 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -324,6 +324,24 @@ static char *cram_extract_block(cram_block *b, int size) { /* * --------------------------------------------------------------------------- * EXTERNAL + * + * In CRAM 3.0 and earlier, E_EXTERNAL uses the data type to determine the + * size of the object being returned. This type is hard coded in the + * spec document (changing from uint32 to uint64 requires a spec change) + * and there is no data format introspection so implementations have + * to determine which size to use based on version numbers. It also + * doesn't support signed data. + * + * With CRAM 4.0 onwards the size and sign of the data is no longer stated + * explicitly in the specification. Instead EXTERNAL is replaced by three + * new encodings, for bytes and signed / unsigned integers which use a + * variable sized encoding.
+ * + * For simplicity we use the same encode and decode functions for + * bytes (CRAM4) and external (CRAM3). Given we already had code to + * replace codec + type into a function pointer it makes little + * difference how we ended up at that function. However we disallow + * this codec to operate on integer data for CRAM4 onwards. */ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { @@ -345,26 +363,6 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, return err ? -1 : 0; } -int cram_external_decode_sint(cram_slice *slice, cram_codec *c, - cram_block *in, char *out, int *out_size) { - char *cp; - cram_block *b; - - /* Find the external block */ - b = cram_get_block_by_id(slice, c->u.external.content_id); - if (!b) - return *out_size?-1:0; - - cp = (char *)b->data + b->idx; - // E_INT and E_LONG are guaranteed single item queries - int err = 0; - *(int32_t *)out = c->vv->varint_get32s(&cp, (char *)b->data + b->uncomp_size, &err); - b->idx = cp - (char *)b->data; - *out_size = 1; - - return err ? -1 : 0; -} - int cram_external_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { char *cp; @@ -385,26 +383,6 @@ int cram_external_decode_long(cram_slice *slice, cram_codec *c, return err ? -1 : 0; } -int cram_external_decode_slong(cram_slice *slice, cram_codec *c, - cram_block *in, char *out, int *out_size) { - char *cp; - cram_block *b; - - /* Find the external block */ - b = cram_get_block_by_id(slice, c->u.external.content_id); - if (!b) - return *out_size?-1:0; - - cp = (char *)b->data + b->idx; - // E_INT and E_LONG are guaranteed single item queries - int err = 0; - *(int64_t *)out = c->vv->varint_get64s(&cp, (char *)b->data + b->uncomp_size, &err); - b->idx = cp - (char *)b->data; - *out_size = 1; - - return err ? 
-1 : 0; -} - int cram_external_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { @@ -471,6 +449,7 @@ cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) { cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c = NULL; @@ -483,18 +462,35 @@ cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, return NULL; c->codec = E_EXTERNAL; - if (option == E_INT) - c->decode = cram_external_decode_int; - else if (option == E_SINT) - c->decode = cram_external_decode_sint; - else if (option == E_LONG) - c->decode = cram_external_decode_long; - else if (option == E_SLONG) - c->decode = cram_external_decode_slong; - else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->decode = cram_external_decode_char; - else - c->decode = cram_external_decode_block; + if (CRAM_MAJOR_VERS(version) >= 4) { + // Version 4 does not permit integer data to be encoded as a + // series of bytes. This is used purely for bytes, either + // singular or declared as arrays + switch (codec) { + case E_EXTERNAL: + if (option == E_BYTE_ARRAY_BLOCK) + c->decode = cram_external_decode_block; + else if (option == E_BYTE || option == E_BYTE_ARRAY) + c->decode = cram_external_decode_char; + else + return NULL; + break; + default: + return NULL; + } + } else { + // CRAM 3 and earlier encodes integers as EXTERNAL. We need + // use the option field to indicate the input data format so + // we know which serialisation format to use. 
+ if (option == E_INT) + c->decode = cram_external_decode_int; + else if (option == E_LONG) + c->decode = cram_external_decode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_external_decode_char; + else + c->decode = cram_external_decode_block; + } c->free = cram_external_decode_free; c->size = cram_external_decode_size; c->get_block = cram_external_get_block; @@ -578,6 +574,7 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, } cram_codec *cram_external_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -588,18 +585,32 @@ cram_codec *cram_external_encode_init(cram_stats *st, return NULL; c->codec = E_EXTERNAL; c->free = cram_external_encode_free; - if (option == E_INT) - c->encode = cram_external_encode_int; - else if (option == E_SINT) - c->encode = cram_external_encode_sint; - else if (option == E_LONG) - c->encode = cram_external_encode_long; - else if (option == E_SLONG) - c->encode = cram_external_encode_slong; - else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->encode = cram_external_encode_char; - else - abort(); + if (CRAM_MAJOR_VERS(version) >= 4) { + // Version 4 does not permit integer data to be encoded as a + // series of bytes. This is used purely for bytes, either + // singular or declared as arrays + switch (codec) { + case E_EXTERNAL: + if (option != E_BYTE && option != E_BYTE_ARRAY) + return NULL; + c->encode = cram_external_encode_char; + break; + default: + return NULL; + } + } else { + // CRAM 3 and earlier encodes integers as EXTERNAL. We need + // use the option field to indicate the input data format so + // we know which serialisation format to use. 
+ if (option == E_INT) + c->encode = cram_external_encode_int; + else if (option == E_LONG) + c->encode = cram_external_encode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->encode = cram_external_encode_char; + else + abort(); + } c->store = cram_external_encode_store; c->flush = NULL; @@ -608,6 +619,410 @@ cram_codec *cram_external_encode_init(cram_stats *st, return c; } +/* + * --------------------------------------------------------------------------- + * VARINT + * + * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8 + * format as well as bytes. In CRAM 4 EXTERNAL is only for bytes and + * byte arrays, with two dedicated encodings for integers: + * VARINT_SIGNED and VARINT_UNSIGNED. These also differ a little to + * EXTERNAL with the addition of an offset field, meaning we can store + * values in, say, the range -2 to 1 million without needing to use + * a signed zig-zag transformation. + */ +int cram_varint_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? 
-1 : 0; +} + +int cram_varint_decode_sint(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32s(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_varint_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_varint_decode_slong(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64s(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? 
-1 : 0; +} + +void cram_varint_decode_free(cram_codec *c) { + if (c) + free(c); +} + +int cram_varint_decode_size(cram_slice *slice, cram_codec *c) { + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return -1; + + return b->uncomp_size; +} + +cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) { + return cram_get_block_by_id(slice, c->u.varint.content_id); +} + +cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + + // Function pointer choice is theoretically by codec type. + // Given we have some vars as int32 and some as int64 we + // use option too for sizing, although on disk format + // does not change. + switch(codec) { + case E_VARINT_UNSIGNED: + c->decode = (option == E_INT) + ? cram_varint_decode_int + : cram_varint_decode_long; + break; + case E_VARINT_SIGNED: + c->decode = (option == E_INT) + ? cram_varint_decode_sint + : cram_varint_decode_slong; + break; + default: + return NULL; + } + + c->free = cram_varint_decode_free; + c->size = cram_varint_decode_size; + c->get_block = cram_varint_get_block; + + c->u.varint.content_id = vv->varint_get32 (&cp, NULL, NULL); + c->u.varint.offset = vv->varint_get64s(&cp, NULL, NULL); + + if (cp - data != size) { + fprintf(stderr, "Malformed varint header stream\n"); + free(c); + return NULL; + } + + c->u.varint.type = option; + + return c; +} + +int cram_varint_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint32_t *i32 = (uint32_t *)in; + return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0 + ? 
0 : -1; +} + +int cram_varint_encode_sint(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int32_t *i32 = (int32_t *)in; + return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +int cram_varint_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint64_t *i64 = (uint64_t *)in; + return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +int cram_varint_encode_slong(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *i64 = (int64_t *)in; + return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +void cram_varint_encode_free(cram_codec *c) { + if (!c) + return; + free(c); +} + +int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + char tmp[99], *tp = tmp; + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id); + tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset); + len += c->vv->varint_put32_blk(b, c->codec); + len += c->vv->varint_put32_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + return len; + + block_err: + return -1; +} + +cram_codec *cram_varint_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->u.e_varint.offset = 0; + if (st) { + // Marginal difference so far! Not worth the hassle? + if (st->min_val < 0 && st->min_val >= -127 + && st->max_val / -st->min_val > 100) { + c->u.e_varint.offset = -st->min_val; + codec = E_VARINT_UNSIGNED; + } else if (st->min_val > 0) { + c->u.e_varint.offset = -st->min_val; + } + } + + c->codec = codec; + c->free = cram_varint_encode_free; + + // Function pointer choice is theoretically by codec type. 
+ // Given we have some vars as int32 and some as int64 we + // use option too for sizing, although on disk format + // does not change. + switch (codec) { + case E_VARINT_UNSIGNED: + c->encode = (option == E_INT) + ? cram_varint_encode_int + : cram_varint_encode_long; + break; + case E_VARINT_SIGNED: + c->encode = (option == E_INT) + ? cram_varint_encode_sint + : cram_varint_encode_slong; + break; + default: + return NULL; + } + c->store = cram_varint_encode_store; + c->flush = NULL; + + c->u.e_varint.content_id = (size_t)dat; + + return c; +} +/* + * --------------------------------------------------------------------------- + * CONST_BYTE and CONST_INT + */ +int cram_const_decode_byte(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out[i] = c->u.xconst.val; + + return 0; +} + +int cram_const_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = c->u.xconst.val; + + return 0; +} + +int cram_const_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = c->u.xconst.val; + + return 0; +} + +void cram_const_decode_free(cram_codec *c) { + if (c) + free(c); +} + +int cram_const_decode_size(cram_slice *slice, cram_codec *c) { + return 0; +} + +cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + if (codec == E_CONST_BYTE) + c->decode = cram_const_decode_byte; + else if (option == E_INT) + c->decode = cram_const_decode_int; + else + c->decode = cram_const_decode_long; + 
c->free = cram_const_decode_free; + c->size = cram_const_decode_size; + c->get_block = NULL; + + c->u.xconst.val = vv->varint_get64s(&cp, NULL, NULL); + + if (cp - data != size) { + fprintf(stderr, "Malformed const header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_const_encode(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return 0; +} + +int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + char tmp[99], *tp = tmp; + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val); + len += c->vv->varint_put32_blk(b, c->codec); + len += c->vv->varint_put32_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + return len; + + block_err: + return -1; +} + +cram_codec *cram_const_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + c->free = cram_const_decode_free; // as as decode + c->encode = cram_const_encode; // a nop + c->store = cram_const_encode_store; + c->flush = NULL; + c->u.e_xconst.val = st->min_val; + + return c; +} + /* * --------------------------------------------------------------------------- * BETA @@ -678,6 +1093,7 @@ void cram_beta_decode_free(cram_codec *c) { cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -780,6 +1196,7 @@ void cram_beta_encode_free(cram_codec *c) { } cram_codec *cram_beta_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -964,6 +1381,7 @@ cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) { cram_codec 
*cram_xpack_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -1127,6 +1545,7 @@ void cram_xpack_encode_free(cram_codec *c) { } cram_codec *cram_xpack_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -1285,6 +1704,7 @@ cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) { cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -1524,9 +1944,10 @@ void cram_xdelta_encode_free(cram_codec *c) { } cram_codec *cram_xdelta_encode_init(cram_stats *st, - enum cram_external_type option, - void *dat, - int version, varint_vec *vv) { + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { cram_codec *c; if (!(c = malloc(sizeof(*c)))) @@ -1685,6 +2106,7 @@ void cram_xrle_decode_free(cram_codec *c) { cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -1906,9 +2328,10 @@ void cram_xrle_encode_free(cram_codec *c) { } cram_codec *cram_xrle_encode_init(cram_stats *st, - enum cram_external_type option, - void *dat, - int version, varint_vec *vv) { + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { cram_codec *c; if (!(c = malloc(sizeof(*c)))) @@ -1998,6 +2421,7 @@ void cram_subexp_decode_free(cram_codec *c) { cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -2063,6 +2487,7 @@ void cram_gamma_decode_free(cram_codec *c) { 
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c = NULL; @@ -2279,6 +2704,7 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, */ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { int32_t ncodes = 0, i, j; @@ -2313,6 +2739,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, h->free = cram_huffman_decode_free; h->u.huffman.ncodes = ncodes; + h->u.huffman.option = option; if (ncodes) { codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes)); if (!codes) { @@ -2327,15 +2754,9 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, if (option == E_LONG) { for (i = 0; i < ncodes; i++) codes[i].symbol = vv->varint_get64(&cp, data_end, &err); - } else if (option == E_SLONG) { - for (i = 0; i < ncodes; i++) - codes[i].symbol = vv->varint_get64s(&cp, data_end, &err); } else if (option == E_INT || option == E_BYTE) { for (i = 0; i < ncodes; i++) codes[i].symbol = vv->varint_get32(&cp, data_end, &err); - } else if (option == E_SINT) { - for (i = 0; i < ncodes; i++) - codes[i].symbol = vv->varint_get32s(&cp, data_end, &err); } else { free(h); return NULL; @@ -2641,6 +3062,7 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, } cram_codec *cram_huffman_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -2873,6 +3295,7 @@ void cram_byte_array_len_decode_free(cram_codec *c) { cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -2989,6 +3412,7 @@ int 
cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, } cram_codec *cram_byte_array_len_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -3110,6 +3534,7 @@ void cram_byte_array_stop_decode_free(cram_codec *c) { cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c = NULL; @@ -3210,6 +3635,7 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, } cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -3246,6 +3672,12 @@ const char *cram_encoding2str(enum cram_encoding t) { case E_SUBEXP: return "SUBEXP"; case E_GOLOMB_RICE: return "GOLOMB_RICE"; case E_GAMMA: return "GAMMA"; + + case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED"; + case E_VARINT_SIGNED: return "VARINT_SIGNED"; + case E_CONST_BYTE: return "CONST_BYTE"; + case E_CONST_INT: return "CONST_INT"; + case E_NUM_CODECS: default: return "?"; } @@ -3254,18 +3686,39 @@ const char *cram_encoding2str(enum cram_encoding t) { static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) = { - NULL, + // CRAM 3.0 valid codecs + NULL, // null codec cram_external_decode_init, - NULL, + NULL, // golomb cram_huffman_decode_init, cram_byte_array_len_decode_init, cram_byte_array_stop_decode_init, cram_beta_decode_init, cram_subexp_decode_init, - NULL, + NULL, // golomb rice cram_gamma_decode_init, + + // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + 
NULL, // was xbyte + cram_varint_decode_init, // varint unsigned + cram_varint_decode_init, // varint signed + cram_const_decode_init, // const byte + cram_const_decode_init, // const int + + // Gap to CRAM 4 transfomrations; 45 to 49 inclusive + NULL, NULL, NULL, NULL, NULL, + + NULL, // xhuffman + cram_xpack_decode_init, + cram_xrle_decode_init, + cram_xdelta_decode_init, }; cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, @@ -3274,7 +3727,8 @@ cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, enum cram_external_type option, int version, varint_vec *vv) { if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) { - cram_codec *r = decode_init[codec](hdr, data, size, option, version, vv); + cram_codec *r = decode_init[codec](hdr, data, size, codec, + option, version, vv); if (r) { r->vv = vv; r->codec_id = hdr->ncodecs++; @@ -3287,19 +3741,40 @@ cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, } static cram_codec *(*encode_init[])(cram_stats *stx, + enum cram_encoding codec, enum cram_external_type option, void *opt, int version, varint_vec *vv) = { - NULL, - cram_external_encode_init, - NULL, + // CRAM 3.0 valid codecs + NULL, // null codec + cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4 + NULL, // golomb cram_huffman_encode_init, cram_byte_array_len_encode_init, cram_byte_array_stop_encode_init, cram_beta_encode_init, - NULL, //cram_subexp_encode_init, - NULL, - NULL, //cram_gamma_encode_init, + NULL, // subexponential (we support decode only) + NULL, // golomb rice + NULL, // gamma (we support decode only) + + // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, // was xbyte + cram_varint_encode_init, // varint unsigned + cram_varint_encode_init, // varint signed + cram_const_encode_init, // 
const byte + cram_const_encode_init, // const int + + // Gap to CRAM 4 transfomrations; 45 to 49 inclusive + NULL, NULL, NULL, NULL, NULL, + + NULL, // xhuffman + cram_xpack_encode_init, + cram_xrle_encode_init, + cram_xdelta_encode_init, }; cram_codec *cram_encoder_init(enum cram_encoding codec, @@ -3310,9 +3785,20 @@ cram_codec *cram_encoder_init(enum cram_encoding codec, if (st && !st->nvals) return NULL; + // cram_stats_encoding assumes integer data, but if option + // is E_BYTE then tweak the requested encoding. This ought + // to be fixed in cram_stats_encoding instead. + if (option == E_BYTE || option == E_BYTE_ARRAY || + option == E_BYTE_ARRAY_BLOCK) { + if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED) + codec = E_EXTERNAL; + else if (codec == E_CONST_INT) + codec = E_CONST_BYTE; + } + if (encode_init[codec]) { cram_codec *r; - if ((r = encode_init[codec](st, option, dat, version, vv))) + if ((r = encode_init[codec](st, codec, option, dat, version, vv))) r->out = NULL; if (!r) { hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec)); @@ -3335,29 +3821,42 @@ int cram_codec_to_id(cram_codec *c, int *id2) { int bnum1, bnum2 = -2; switch (c->codec) { + case E_CONST_INT: + case E_CONST_BYTE: + bnum1 = -2; // no blocks used + case E_HUFFMAN: bnum1 = c->u.huffman.ncodes == 1 ? 
-2 : -1; break; + case E_GOLOMB: case E_BETA: case E_SUBEXP: case E_GOLOMB_RICE: case E_GAMMA: + // CORE block bnum1 = -1; break; + case E_EXTERNAL: + case E_VARINT_UNSIGNED: + case E_VARINT_SIGNED: bnum1 = c->u.external.content_id; break; + case E_BYTE_ARRAY_LEN: bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL); bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL); break; + case E_BYTE_ARRAY_STOP: bnum1 = c->u.byte_array_stop.content_id; break; + case E_NULL: bnum1 = -2; break; + default: hts_log_error("Unknown codec type %d", c->codec); bnum1 = -1; @@ -3384,6 +3883,12 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { int j; switch (c->codec) { + case E_CONST_INT: + case E_CONST_BYTE: + // shares struct with decode + c->store = cram_const_encode_store; + break; + case E_EXTERNAL: // shares struct with decode c->free = cram_external_encode_free; @@ -3400,6 +3905,23 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return -1; break; + case E_VARINT_SIGNED: + case E_VARINT_UNSIGNED: + // shares struct with decode + c->free = cram_varint_encode_free; + c->store = cram_varint_encode_store; + if (c->decode == cram_varint_decode_int) + c->encode = cram_varint_encode_int; + else if (c->decode == cram_varint_decode_sint) + c->encode = cram_varint_encode_sint; + else if (c->decode == cram_varint_decode_long) + c->encode = cram_varint_encode_long; + else if (c->decode == cram_varint_decode_slong) + c->encode = cram_varint_encode_slong; + else + return -1; + break; + case E_HUFFMAN: { // New structure, so switch. 
// FIXME: we huffman and e_huffman structs amended, we could @@ -3411,6 +3933,7 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { t->store = cram_huffman_encode_store; t->u.e_huffman.codes = c->u.huffman.codes; t->u.e_huffman.nvals = c->u.huffman.ncodes; + t->u.e_huffman.option = c->u.huffman.option; for (j = 0; j < t->u.e_huffman.nvals; j++) { int32_t sym = t->u.e_huffman.codes[j].symbol; if (sym >= -1 && sym < MAX_HUFF) @@ -3475,7 +3998,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { case E_BYTE_ARRAY_LEN: { cram_codec *t = malloc(sizeof(*t)); if (!t) return -1; - t->codec = E_BYTE_ARRAY_LEN; + t->vv = c->vv; + t->codec = E_BYTE_ARRAY_LEN; t->free = cram_byte_array_len_encode_free; t->store = cram_byte_array_len_encode_store; t->encode = cram_byte_array_len_encode; diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 850a2a92e..56b065255 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -58,6 +58,7 @@ typedef struct { typedef struct { int ncodes; cram_huffman_code *codes; + int option; } cram_huffman_decoder; #define MAX_HUFF 128 @@ -127,6 +128,12 @@ typedef struct { enum cram_external_type type; } cram_external_decoder; +typedef struct { + int32_t content_id; + int64_t offset; + enum cram_external_type type; +} cram_varint_decoder; + typedef struct { struct cram_codec *len_codec; struct cram_codec *val_codec; @@ -146,6 +153,10 @@ typedef struct { struct cram_codec *val_codec; } cram_byte_array_len_encoder; +typedef struct { + int64_t val; +} cram_const_codec; + /* * A generic codec structure. 
*/ @@ -176,6 +187,8 @@ typedef struct cram_codec { cram_xpack_decoder xpack; cram_xrle_decoder xrle; cram_xdelta_decoder xdelta; + cram_const_codec xconst; + cram_varint_decoder varint; cram_huffman_encoder e_huffman; cram_external_decoder e_external; @@ -185,6 +198,8 @@ typedef struct cram_codec { cram_xpack_decoder e_xpack; cram_xrle_decoder e_xrle; cram_xdelta_decoder e_xdelta; + cram_const_codec e_xconst; + cram_varint_decoder e_varint; } u; } cram_codec; diff --git a/cram/cram_decode.c b/cram/cram_decode.c index e4ee8986b..41203ec7f 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -392,7 +392,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'C' && key[1] == 'F') { ds_id = DS_CF; type = E_INT; } else if (key[0] == 'R' && key[1] == 'I') { - ds_id = DS_RI; type = E_SINT; + ds_id = DS_RI; type = E_INT; } else if (key[0] == 'R' && key[1] == 'L') { ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { @@ -400,11 +400,11 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, type = is_v4 ? E_SLONG : E_INT; } else if (key[0] == 'R' && key[1] == 'G') { ds_id = DS_RG; - type = is_v4 ? E_SINT : E_INT; + type = E_INT; } else if (key[0] == 'M' && key[1] == 'F') { ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { - ds_id = DS_NS; type = E_SINT; + ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { ds_id = DS_NP; type = is_v4 ? 
E_LONG : E_INT; diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 89c79fa8a..3d9235ac8 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -995,8 +995,16 @@ static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) { codec->out = s->block[0]; break; + // Codecs which don't use external blocks + case E_CONST_BYTE: + case E_CONST_INT: + codec->out = NULL; + break; + // Codecs that emit directly to external blocks case E_EXTERNAL: + case E_VARINT_UNSIGNED: + case E_VARINT_SIGNED: if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) return -1; codec->u.external.content_id = ds_id; @@ -1624,15 +1632,25 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // fd->version); //fprintf(stderr, "=== AP ===\n"); - if (c->pos_sorted) { - h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), - c->stats[DS_AP], - is_v4 ? E_SLONG : E_INT, - NULL, fd->version, &fd->vv); + if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) { + if (c->pos_sorted) + h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), + c->stats[DS_AP], + is_v4 ? E_LONG : E_INT, + NULL, fd->version, &fd->vv); + else + // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT. + h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED + : E_EXTERNAL, + NULL, + is_v4 ? E_LONG : E_INT, + NULL, fd->version, &fd->vv); } else { + // Removed BETA in v4.0. + // Should we consider dropping use of it for 3.0 too? int p[2] = {0, c->max_apos}; h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, - is_v4 ? E_SLONG : E_INT, + is_v4 ? E_LONG : E_INT, p, fd->version, &fd->vv); // cram_xdelta_encoder e; // e.word_size = is_v4 ? 8 : 4; @@ -1648,7 +1666,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== RG ===\n"); h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), c->stats[DS_RG], - is_v4 ? 
E_SINT : E_INT, + E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err; @@ -1661,7 +1679,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), - c->stats[DS_NS], E_SINT, NULL, + c->stats[DS_NS], E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; @@ -1674,7 +1692,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), c->stats[DS_TS], - is_v4 ? E_SLONG : E_INT, + is_v4 ? E_LONG : E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; @@ -1730,7 +1748,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (CRAM_MAJOR_VERS(fd->version) >= 3) { cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)DS_BB_len; //e.len_dat = (void *)DS_BB; @@ -1783,7 +1803,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), - c->stats[DS_RI], E_SINT, NULL, + c->stats[DS_RI], E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; @@ -1818,7 +1838,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // elements into the same external block. cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? 
E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)DS_SC_len; e.val_encoding = E_EXTERNAL; @@ -2291,8 +2313,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 1) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2311,8 +2338,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 2) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2330,8 +2362,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 4) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2353,7 +2390,9 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // too. cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)sk; // or key+128 for len? 
e.val_encoding = E_EXTERNAL; diff --git a/cram/cram_stats.c b/cram/cram_stats.c index 20ef51dc7..3ceda0db1 100644 --- a/cram/cram_stats.c +++ b/cram/cram_stats.c @@ -193,6 +193,8 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { } st->nvals = nvals; + st->min_val = min_val; + st->max_val = max_val; assert(ntot == st->nsamp); free(vals); @@ -202,7 +204,19 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { * Simple policy that everything is external unless it can be * encoded using zero bits as a unary item huffman table. */ - return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + // Note, we're assuming integer data here as we don't have the + // type passed in. Cram_encoder_init does know the type and + // will convert to E_CONST_BYTE or E_EXTERNAL as appropriate. + if (nvals == 1) + return E_CONST_INT; + else if (nvals == 0 || min_val < 0) + return E_VARINT_SIGNED; + else + return E_VARINT_UNSIGNED; + } else { + return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; + } } void cram_stats_free(cram_stats *st) { diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 5c39eca56..b327a06cf 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -97,24 +97,36 @@ typedef struct cram_stats { khash_t(m_i2i) *h; int nsamp; // total number of values added int nvals; // total number of unique values added + int64_t min_val, max_val; } cram_stats; /* NB: matches java impl, not the spec */ enum cram_encoding { E_NULL = 0, - E_EXTERNAL = 1, - E_GOLOMB = 2, - E_HUFFMAN = 3, + E_EXTERNAL = 1, // Only for BYTE type in CRAM 4 + E_GOLOMB = 2, // Not in CRAM 4 + E_HUFFMAN = 3, // Not in CRAM 4 E_BYTE_ARRAY_LEN = 4, E_BYTE_ARRAY_STOP = 5, - E_BETA = 6, - E_SUBEXP = 7, - E_GOLOMB_RICE = 8, - E_GAMMA = 9, - E_XPACK = 11, // Transform to sub-codec - E_XRLE = 12, // Transform to sub-codec - E_XDELTA = 13, // Transform to sub-codec - E_NUM_CODECS, /* Total number of codecs, not a real one. 
*/ + E_BETA = 6, // Not in CRAM 4 + E_SUBEXP = 7, // Not in CRAM 4 + E_GOLOMB_RICE = 8, // Not in CRAM 4 + E_GAMMA = 9, // Not in CRAM 4 + + // CRAM 4 specific codecs + E_VARINT_UNSIGNED = 41, // Specialisation of EXTERNAL + E_VARINT_SIGNED = 42, // Specialisation of EXTERNAL + E_CONST_BYTE = 43, // Alternative to HUFFMAN with 1 symbol + E_CONST_INT = 44, // Alternative to HUFFMAN with 1 symbol + + // More experimental ideas, not documented in spec yet + E_XHUFFMAN = 50, // To external block + E_XPACK = 51, // Transform to sub-codec + E_XRLE = 52, // Transform to sub-codec + E_XDELTA = 53, // Transform to sub-codec + + // Total number of codecs, not a real one. + E_NUM_CODECS, }; enum cram_external_type { From 7068ac84636da735e919173d65d021aeb1afd0e0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 11 Nov 2020 14:27:06 +0000 Subject: [PATCH 048/114] Remove metrics->stats pointer. This has been replaced by an "unpackable" flag. This avoids cram_compress_block2 from needing to access metrics->stats, which is a pointer to the container->stats struct. In rare cases this could be freed prior to the compression completing. 
--- cram/cram_encode.c | 3 ++- cram/cram_io.c | 7 +++++-- cram/cram_structs.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 3d9235ac8..315279dae 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -855,7 +855,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { pthread_mutex_lock(&fd->metrics_lock); for (i = 0; i < DS_END; i++) - fd->m[i]->stats = c->stats[i]; + if (c->stats[i] && c->stats[i]->nvals > 16) + fd->m[i]->unpackable = 1; pthread_mutex_unlock(&fd->metrics_lock); /* Specific compression methods for certain block types */ diff --git a/cram/cram_io.c b/cram/cram_io.c index 7cbd41c05..65ff7980e 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1949,7 +1949,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, if (metrics) { pthread_mutex_lock(&fd->metrics_lock); if (metrics->trial > 0 || --metrics->next_trial <= 0) { - int m; + int m, unpackable = metrics->unpackable; size_t sz_best = b->uncomp_size; size_t sz[CRAM_MAX_METHOD] = {0}; int method_best = 0; // RAW @@ -1965,10 +1965,11 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, metrics->trial = NTRIALS; for (m = 0; m < CRAM_MAX_METHOD; m++) metrics->sz[m] /= 2; + metrics->unpackable = 0; } // Compress this block using the best method - if (metrics->stats && metrics->stats->nvals > 16) { + if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) { // No point trying bit-pack if 17+ symbols. 
if (method & (1<method = RAW; m->strat = 0; m->revised_method = 0; + m->unpackable = 0; return m; } @@ -4138,6 +4140,7 @@ void reset_metrics(cram_fd *fd) { m->trial = NTRIALS; m->next_trial = TRIAL_SPAN; m->revised_method = 0; + m->unpackable = 0; memset(m->sz, 0, sizeof(m->sz)); } diff --git a/cram/cram_structs.h b/cram/cram_structs.h index b327a06cf..0ede383f5 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -249,7 +249,8 @@ struct cram_metrics { double extra[CRAM_MAX_METHOD]; - cram_stats *stats; + // Not amenable to rANS bit-packing techniques; cardinality > 16 + int unpackable; }; // Hash aux key (XX:i) to cram_metrics From 85240bae84f49c05656b5ffd5b23485c2559f25b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 1 Dec 2020 13:50:54 +0000 Subject: [PATCH 049/114] Further optimisations to probaln_glocal. The changes are: - Replace a lot of the repeated set_u macro usages that go from k to {u, v01, v10, v11} with pointer increments instead. This rather obfuscates things as the difference between v01/u and v11/v10 (+/-3) is now hidden (but see PROBALN_ORIG comment below). - An attempt at manual ordering of code to avoid as many instruction latency issues as possible. Mostly compilers do this already, but some combinations benefit here. - Cache and reuse of variables computed during the previous loop iterations. This can have a big impact, but again whether or not it was already done for us is compiler specific. - Removal of some loop invariants (ideally already done for us, but safer to do it ourselves). - Perform the bi (backwards) rescaling during the backwards calculation loop instead of in a second pass. - Reduce conditionals in the map stage. - Plus some code formatting tidyups, in particular removal of many;statements;on;same;long;line. The original code wasn't exactly easy to understand, but the new code is possibly even worse if that's possible. 
Hence I've kept the old code in place guarded by a #ifdef PROBALN_ORIG (technically not original as bdf85e4 already did some optimisations) to act as a bit of a translation guide. If we prefer to simply keep this in git history then feel free to nuke the ifdefed bits. Benchmarks vs PROBALN_ORIG (time in this function only) 35% quicker with gcc 7 -O2 (NEW: 1168742 usec, ORIG: 1575843) 14% quicker with gcc 7 -O3 (NEW: 1165683 ORIG: 1333712) 33% quicker with gcc 9 -O2 (NEW: 1169097 ORIG: 1556956) 29% quicker with gcc 9 -O3 (NEW: 1157547 ORIG: 1491839) 24% quicker with clang 7.0 -O2 (NEW: 1207968 ORIG: 1501549) 24% quicker with clang 7.0 -O3 (NEW: 1211660 ORIG: 1504905) On samtools mpileup of 10 million reads this reduced the time from 6m58 to 5m27 (28% faster throughput). This is because 86% of all CPU time was spent in this one function! It's still 81.5% even after optimisation. Note I initially attempted a SIMD implementation, which I think would still be possible, but it cannot be faster without majorly changing the data layout and order of evaluation, plus realistically it'd need AVX or above as SSE4 can only do SIMD on 2 doubles so the overhead is significant. --- probaln.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 152 insertions(+), 16 deletions(-) diff --git a/probaln.c b/probaln.c index 9b9442c5b..73df1a8f1 100644 --- a/probaln.c +++ b/probaln.c @@ -74,8 +74,9 @@ static float g_qual2prob[256]; Returns phred-scaled likelihood score, or INT_MIN on failure. 
*/ -int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_query, - const uint8_t *iqual, const probaln_par_t *c, int *state, uint8_t *q) +int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, + int l_query, const uint8_t *iqual, const probaln_par_t *c, + int *state, uint8_t *q) { double *f = NULL, *b = NULL, *s = NULL, m[9], sI, sM, bI, bM; float *qual = NULL; @@ -96,8 +97,10 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu bw2 = bw * 2 + 1; size_t i_dim = bw2 < l_ref ? (size_t) bw2*3+6 : (size_t) l_ref*3+6; - // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] - // Ideally these callocs would be mallocs + initialisation of the few bits needed. + // allocate the forward and backward matrices f[][] and b[][] + // and the scaling array s[] + // Ideally these callocs would be mallocs + initialisation of + // the few bits needed. if (SIZE_MAX / (l_query+1) / i_dim < sizeof(double)) { errno = ENOMEM; // Allocation would fail return INT_MIN; @@ -108,7 +111,9 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu b = calloc((l_query+1)*i_dim, sizeof(double)); if (!b) goto fail; } - s = malloc((l_query+2) * sizeof(double)); // s[] is the scaling factor to avoid underflow + + // s[] is the scaling factor to avoid underflow + s = malloc((l_query+2) * sizeof(double)); if (!s) goto fail; // initialize qual @@ -122,11 +127,19 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu qual[i] = g_qual2prob[iqual? iqual[i] : 30]; // initialize transition probability - sM = sI = 1. 
/ (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof - m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM); - m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.; - m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e; - bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1 + // the value here seems not to affect results; FIXME: need proof + sM = sI = 1. / (2 * l_query + 2); + m[0*3+0] = (1 - c->d - c->d) * (1 - sM); + m[0*3+1] = m[0*3+2] = c->d * (1 - sM); + m[1*3+0] = (1 - c->e) * (1 - sI); + m[1*3+1] = c->e * (1 - sI); + m[1*3+2] = 0.; + m[2*3+0] = 1 - c->e; + m[2*3+1] = 0.; + m[2*3+2] = c->e; + bM = (1 - c->d) / l_ref; // (bM+bI)*l_ref==1 + bI = c->d / l_ref; + /*** forward ***/ // f[0] set_u(k, bw, 0, 0); @@ -150,6 +163,15 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu uint8_t qyi = query[i - 1]; x = i - bw; beg = beg > x? beg : x; // band start x = i + bw; end = end < x? end : x; // band end + + // NB end-beg is almost always 14 (99.9% of the time) + // Hence not a large volume to parallelise. + // + // Maybe stripe in diagonal doing 14 lines together? + // + // Consider rotation? 150x14 vs 14x150 so inner loop + // takes longer. + double E[] = { qli * EM, // 00 1. - qli, // 01 @@ -157,19 +179,69 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu 1., // 11 }; double M = 1./s[i-1]; + + // Note this code has the original version listed here (albeit + // with improved formatting), but we do not compile using + // -DPROBALN_ORIG. The purpose of this code is to act as an + // easier(?) to understand version of the heavily optimised + // version following it and as an easy validation path in case + // of any differences in results. 
+#ifdef PROBALN_ORIG for (k = beg, sum = 0.; k <= end; ++k) { int u, v11, v01, v10; double e; e = E[(ref[k - 1] > 3 || qyi > 3)*2 + (ref[k - 1] == qyi)]; - set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1); + set_u(u, bw, i, k); + set_u(v11, bw, i-1, k-1); + set_u(v10, bw, i-1, k); + set_u(v01, bw, i, k-1); fi[u+0] = e * (m[0] * M*fi1[v11+0] + m[3] * M*fi1[v11+1] + m[6] * M*fi1[v11+2]); fi[u+1] = EI * (m[1] * M*fi1[v10+0] + m[4] * M*fi1[v10+1]); fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; sum += fi[u] + fi[u+1] + fi[u+2]; -// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG } +#else + // We use EI*(M*m[1]*? + M*m[4]*?) a lot. So factor it out here. + double xm[5]; + xm[0] = M*m[0]; + xm[1] = M*m[3]; + xm[2] = M*m[6]; + xm[3] = EI*M*m[1]; + xm[4] = EI*M*m[4]; + + { + int u, v11; + set_u(u, bw, i, beg); + set_u(v11, bw, i-1, beg-1); + // Rather than recompute k->{u,v01,v10,v11} each loop + // we just increment the pointers. + double *xi = &fi[u]; + double *yi = &fi1[v11]; + // Derived from xi[0,2] in previous loop iter. 
+ double l_x0 = m[2]*xi[0]; + double l_x2 = m[8]*xi[2]; + for (k = beg, sum = 0.; k <= end; ++k, xi+=3, yi+=3) { + int cond = (ref[k-1] > 3 || qyi > 3)*2 + (ref[k-1] == qyi); + + double z0 = xm[0]*yi[0]; + double z1 = xm[1]*yi[1]; + double z2 = xm[2]*yi[2]; + double z3 = xm[3]*yi[3]; + double z4 = xm[4]*yi[4]; + + xi[0] = E[cond] * (z0+z1+z2); + xi[1] = z3 + z4; + xi[2] = l_x0 + l_x2; + sum += xi[0] + xi[1] + xi[2]; + + l_x0 = m[2]*xi[0]; + l_x2 = m[8]*xi[2]; + } + } +#endif s[i] = sum; } + { // f[l_query+1] double sum; double M = 1./s[l_query]; @@ -205,7 +277,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu } // b[l_query-1..1] for (i = l_query - 1; i >= 1; --i) { - int beg = 1, end = l_ref, x, _beg, _end; + int beg = 1, end = l_ref, x; double *bi = &b[i*i_dim], *bi1 = &b[(i+1)*i_dim], y = (i > 1), qli1 = qual[i]; uint8_t qyi1 = query[i]; x = i - bw; beg = beg > x? beg : x; @@ -217,10 +289,15 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu 1., //011 //0,0,0,0 //1xx }; + +#ifdef PROBALN_ORIG for (k = end; k >= beg; --k) { int u, v11, v01, v10; double e; - set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1); + set_u(u, bw, i, k); + set_u(v11, bw, i+1, k+1); + set_u(v10, bw, i+1, k); + set_u(v01, bw, i, k+1); e = (k>=l_ref)?0 :E[(ref[k] > 3 || qyi1 > 3)*2 + (ref[k] == qyi1)] * bi1[v11]; bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. 
bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; @@ -228,8 +305,43 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG } // rescale + int _beg, _end; set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; +#else + { + int u, v10; + set_u(u, bw, i, end); + set_u(v10, bw, i+1, end); + // Rather than recompute k->{u,v01,v10,v11} each loop + // we just increment the pointers. + double *xi = &bi[u]; + double *yi = &bi1[v10]; + // NB xi[5] is equiv to v01+2. + double xi_5 = xi[5]; + // Manual loop invariant removal + double e1 = EI*m[1]; + double e4 = EI*m[4]; + // Do renorm too in the same pass. + double n = 1./s[i]; + for (k = end; k >= beg; --k, xi -= 3, yi -= 3) { + double e = (k>=l_ref) + ? 0 + : E[(ref[k]>3 || qyi1>3)*2 + (ref[k] == qyi1)] * yi[3]; + + xi[1] = e * m[3] + e4 * yi[1]; + xi[0] = e * m[0] + e1 * yi[1] + m[2] * xi_5; + xi[2] = (e * m[6] + m[8] * xi_5) * y; + // bi[u+2] from this iter becomes bi[v01+2] in next iter + xi_5 = xi[2]; + + // rescale + xi[1] *= n; + xi[0] *= n; + xi[2] *= n; + } + } +#endif } { // b[0] int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; @@ -251,13 +363,36 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu x = i - bw; beg = beg > x? beg : x; x = i + bw; end = end < x? 
end : x; double M = 1./s[i]; +#ifdef PROBALN_ORIG for (k = beg; k <= end; ++k) { int u; double z; set_u(u, bw, i, k); - z = M*fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; - z = M*fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; + z = M*fi[u+0] * bi[u+0]; + if (z > max) max = z, max_k = (k-1)<<2 | 0; + sum += z; + z = M*fi[u+1] * bi[u+1]; + if (z > max) max = z, max_k = (k-1)<<2 | 1; + sum += z; + } +#else + { + int u; + set_u(u, bw, i, beg); + for (k = beg; k <= end; ++k, u+=3) { + double z1, z2; + z1 = M*fi[u+0] * bi[u+0]; + z2 = M*fi[u+1] * bi[u+1]; + int which = z2 > z1; // strictly z2 >= z1 matches old code + double zm = which ? z2 : z1; + if (zm > max) { + max = zm; + max_k = (k-1)<<2 | which; + } + sum += z1 + z2; + } } +#endif max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 if (state) state[i-1] = max_k; if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; @@ -268,6 +403,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu "ACGT"[query[i - 1]], "ACGT"[ref[(max_k>>2)]], max_k&3, max); // DEBUG #endif } + /*** free ***/ free(f); free(b); free(s); free(qual); return Pr; From c1245117caa14a7fb05b385e9d981c2ea950edfc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 15 Dec 2020 17:15:24 +0000 Subject: [PATCH 050/114] Split cram_block_method enum into public and private. Also the public version has had the enum members renamed to include an HTS_ prefix. At present they appear to be unused, although it's hard to know for sure with some of them due to the common hits you get all over github. Leaving them as they are though is just waiting for a hard problem to arrive. This commit introduces the notion of HTS_COMPAT. The ABI hasn't changed, but the API has (albeit in a way we believe shouldn't affect people). It's a 3+ digit number with the last two being minor version and earlier being major. So htslib version 1.12 would be 112.
The intention is simply to permit packages with dependencies on specific earlier API names to still compile with the minimal of changes (even if it's just by overriding CFLAGS). --- cram/cram_io.c | 11 ++++---- cram/cram_io.h | 2 +- cram/cram_structs.h | 60 ++++++++++++++++++++++++++++++++++++++---- htslib/cram.h | 63 ++++++++++++++------------------------------- 4 files changed, 82 insertions(+), 54 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 65ff7980e..4f95f940a 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1730,7 +1730,7 @@ int cram_uncompress_block(cram_block *b) { static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, int content_id, size_t *out_size, - enum cram_block_method method, + enum cram_block_method_int method, int level, int strat) { switch (method) { case GZIP: @@ -1899,7 +1899,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, // Internally we have parameterised methods that externally map // to the same CRAM method value. - // See enum_cram_block_method. + // See enum_cram_block_method_int in cram_structs.h. int methmap[] = { // Externally defined values RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, @@ -2056,7 +2056,8 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, int best_method = RAW; int best_sz = INT_MAX; - // Relative costs of methods. See enum_cram_block_method and methmap + // Relative costs of methods. 
See enum_cram_block_method_int + // and methmap double meth_cost[32] = { // Externally defined methods 1, // 0 raw @@ -2090,7 +2091,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, 1.04, // arith_pr1 1.04, // arith_pr64 - 1.04, // arith_pr65 + 1.04, // arith_pr9 1.03, // arith_pr128 1.04, // arith_pr129 1.04, // arith_pr192 @@ -2242,7 +2243,7 @@ cram_metrics *cram_new_metrics(void) { return m; } -char *cram_block_method2str(enum cram_block_method m) { +char *cram_block_method2str(enum cram_block_method_int m) { switch(m) { case RAW: return "RAW"; case GZIP: return "GZIP"; diff --git a/cram/cram_io.h b/cram/cram_io.h index 7d787e44c..8cc59be51 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -173,7 +173,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, int method, int level); cram_metrics *cram_new_metrics(void); -char *cram_block_method2str(enum cram_block_method m); +char *cram_block_method2str(enum cram_block_method_int m); char *cram_content_type2str(enum cram_content_type t); /* diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 0ede383f5..0c3ae8542 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -209,11 +209,61 @@ struct cram_file_def { struct cram_slice; -/* Now in htslib/cram.h -enum cram_block_method { - ... +// Internal version of htslib/cram.h enum. +// Note these have to match the laout of methmap and methcost in +// cram_io.c:cram_compress_block2 +enum cram_block_method_int { + // Public methods as defined in the CRAM spec. + BM_ERROR = -1, + + // CRAM 2.x and 3.0 + RAW = 0, + GZIP = 1, + BZIP2 = 2, + LZMA = 3, + RANS = 4, RANS0 = RANS, + + // CRAM 3.1 onwards + RANSPR = 5, RANS_PR0 = RANSPR, + ARITH = 6, ARITH_PR0 = ARITH, + FQZ = 7, + TOK3 = 8, + // BSC = 9, ZSTD = 10 + + // Methods not externalised, but used in metrics. + // Externally they become one of the above methods. 
+ GZIP_RLE = 11, + GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM + + FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods + + //RANS0, // Order 0 + RANS1, + + //RANS_PR0, // Order 0 + RANS_PR1, // Order 1 + RANS_PR64, // O0 + RLE + RANS_PR9, // O1 + X4 + RANS_PR128, // O0 + Pack + RANS_PR129, // O1 + Pack + RANS_PR192, // O0 + RLE + pack + RANS_PR193, // O1 + RLE + pack + + //TOK3, // tok+rans + TOKA, // tok+arith + + //ARITH_PR0, // Order 0 + ARITH_PR1, // Order 1 + ARITH_PR64, // O0 + RLE + ARITH_PR9, // O1 + X4 + ARITH_PR128, // O0 + Pack + ARITH_PR129, // O1 + Pack + ARITH_PR192, // O0 + RLE + pack + ARITH_PR193, // O1 + RLE + pack + + // NB: must end on no more than 31 unless we change to a + // 64-bit method type. }; -*/ /* Now in htslib/cram.h enum cram_content_type { @@ -259,7 +309,7 @@ KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) /* Block */ struct cram_block { - enum cram_block_method method, orig_method; + enum cram_block_method_int method, orig_method; enum cram_content_type content_type; int32_t content_id; int32_t comp_size; diff --git a/htslib/cram.h b/htslib/cram.h index 5446945e4..607cf8bcf 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -47,6 +47,17 @@ DEALINGS IN THE SOFTWARE. */ extern "C" { #endif +// see cram/cram_structs.h for an internal more complete copy of this enum + +// Htslib 1.11 had these listed without any hts prefix, and included +// some internal values such as RANS1 and GZIP_RLE (which shouldn't have ever +// been public). +// +// We can't find evidence of these being used and the data type occurs +// nowhere in functions or structures meaning using it would be pointless. +// However for safety, if you absolute need the API to not change then +// define HTS_COMPAT to 101100 (XYYYZZ for X.Y[.Z], meaning 1.11). +#if defined(HTS_COMPAT) && HTS_COMPAT <= 101100 enum cram_block_method { // Public methods as defined in the CRAM spec. 
BM_ERROR = -1, @@ -56,50 +67,16 @@ enum cram_block_method { GZIP = 1, BZIP2 = 2, LZMA = 3, - RANS = 4, RANS0 = RANS, - - // CRAM 3.1 onwards - RANSPR = 5, RANS_PR0 = RANSPR, - ARITH = 6, ARITH_PR0 = ARITH, - FQZ = 7, - TOK3 = 8, - // BSC = 9, ZSTD = 10 - - // Methods not externalised, but used in metrics. - // Externally they become one of the above methods. - GZIP_RLE = 11, // NB: not externalised in CRAM - GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM - - FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods - - //RANS0, // Order 0 - RANS1, - - //RANS_PR0, // Order 0 - RANS_PR1, // Order 1 - RANS_PR64, // O0 + RLE - RANS_PR9, // O1 + X4 - RANS_PR128, // O0 + Pack - RANS_PR129, // O1 + Pack - RANS_PR192, // O0 + RLE + pack - RANS_PR193, // O1 + RLE + pack - - //TOK3, // tok+rans - TOKA, // tok+arith - - //ARITH_PR0, // Order 0 - ARITH_PR1, // Order 1 - ARITH_PR64, // O0 + RLE - ARITH_PR9, // O1 + X4 - ARITH_PR128, // O0 + Pack - ARITH_PR129, // O1 + Pack - ARITH_PR192, // O0 + RLE + pack - ARITH_PR193, // O1 + RLE + pack - - // NB: must end on no more than 31 unless we change to a - // 64-bit method type. - + RANS = 4, + + // NB: the subsequent numbers may change. They're simply here for + // compatibility with the old API, but may have no bearing on the + // internal way htslib works. DO NOT USE + RANS0 = 4, + RANS1 = 10, + GZIP_RLE = 11, }; +#endif enum cram_content_type { CT_ERROR = -1, From 878c71bf0ec52ed66666a37e84241a62c474fecd Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 16 Dec 2020 20:40:20 +0000 Subject: [PATCH 051/114] Get htslib's "make test" to run the htscodecs tests They are included by adding extra dependencies to the "test" and "check" targets when htscodecs is bundled via the submodule. 
--- Makefile | 57 ++++++++++++++++++++++++++++++++++++++++++- htscodecs_bundled.mk | 6 +++++ htscodecs_external.mk | 1 + 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4168c0463..5b447cbce 100644 --- a/Makefile +++ b/Makefile @@ -426,7 +426,7 @@ maintainer-check: # # If using MSYS, avoid poor shell expansion via: # MSYS2_ARG_CONV_EXCL="*" make check -check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) +check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODECS_TEST_TARGETS) test/hts_endian test/test_expr test/test_kfunc @@ -508,6 +508,56 @@ test/test-bcf-sr: test/test-bcf-sr.o libhts.a test/test-bcf-translate: test/test-bcf-translate.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread +# Extra tests for bundled htscodecs +test_htscodecs_rans4x8: htscodecs/tests/rans4x8 + cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x8.test + +test_htscodecs_rans4x16: htscodecs/tests/rans4x16pr + cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x16.test + +test_htscodecs_arith: htscodecs/tests/arith_dynamic + cd htscodecs/tests && srcdir=. && export srcdir && ./arith.test + +test_htscodecs_tok3: htscodecs/tests/tokenise_name3 + cd htscodecs/tests && srcdir=. && export srcdir && ./tok3.test + +test_htscodecs_fqzcomp: htscodecs/tests/fqzcomp_qual + cd htscodecs/tests && srcdir=. 
&& export srcdir && ./fqzcomp.test + +test_htscodecs_varint: htscodecs/tests/varint + cd htscodecs/tests && ./varint + +htscodecs/tests/arith_dynamic: htscodecs/tests/arith_dynamic_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/fqzcomp_qual: htscodecs/tests/fqzcomp_qual_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/rans4x16pr: htscodecs/tests/rANS_static4x16pr_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/rans4x8: htscodecs/tests/rANS_static_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/tokenise_name3: htscodecs/tests/tokenise_name3_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/varint: htscodecs/tests/varint_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/arith_dynamic_test.o: htscodecs/tests/arith_dynamic_test.c $(htscodecs_arith_dynamic_h) +htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) +htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static4x16pr_test.o: htscodecs/tests/rANS_static4x16pr_test.c $(htscodecs_rANS_static4x16_h) +htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c $(htscodecs_rANS_static_h) +htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c $(htscodecs_tokenise_name3_h) +htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs 
-D_POSIX_C_SOURCE=200112L +htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c $(htscodecs_varint_h) + test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h) @@ -629,14 +679,17 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt + -rm -rf htscodecs/tests/test.out mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico -rm -f hts-object-files + -rm -f htscodecs/tests/*.o clean: mostlyclean clean-$(SHLIB_FLAVOUR) -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(BUILT_TEST_PROGRAMS) $(BUILT_THRASH_PROGRAMS) + -rm -f htscodecs/tests/rans4x8 htscodecs/tests/rans4x16pr htscodecs/tests/arith_dynamic htscodecs/tests/tokenise_name3 htscodecs/tests/fqzcomp_qual htscodecs/tests/varint distclean maintainer-clean: clean -rm -f config.cache config.h config.log config.mk config.status @@ -683,3 +736,5 @@ force: .PHONY: clean-cygdll install-cygdll .PHONY: clean-dll install-dll .PHONY: clean-dylib install-dylib +.PHONY: test_htscodecs_rans4x8 test_htscodecs_rans4x16 test_htscodecs_arith +.PHONY: test_htscodecs_tok3 test_htscodecs_fqzcomp test_htscodecs_varint diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index d40bbd095..5a8328e0e 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -7,3 +7,9 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) + +# Add htscodecs tests into the HTSlib test framework + +HTSCODECS_TEST_TARGETS = test_htscodecs_rans4x8 \ + test_htscodecs_rans4x16 test_htscodecs_arith 
test_htscodecs_tok3 \ + test_htscodecs_fqzcomp test_htscodecs_varint diff --git a/htscodecs_external.mk b/htscodecs_external.mk index f1d82faf8..7ee47402c 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -1,5 +1,6 @@ HTSCODECS_SOURCES = HTSCODECS_OBJS = +HTSCODECS_TEST_TARGETS = htscodecs_arith_dynamic_h = htscodecs_fqzcomp_qual_h = From 1fc55428dfffbc36609798bb823be51f733bb55a Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 5 Jan 2021 15:52:12 +0000 Subject: [PATCH 052/114] Add configure check for the htscodecs submodule So that anyone who tries to build the library without setting up the submodule first will get a hint on how to make the build work. The check isn't really needed for anyone building from a release tarball, but it will be harmless as it should always succeed (as long as the tarball was built correctly and includes the htscodecs files). --- configure.ac | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 890d7aa31..f990c370b 100644 --- a/configure.ac +++ b/configure.ac @@ -306,7 +306,26 @@ if necessary set CPPFLAGS and LDFLAGS so the compiler can find them; or configure using --without-external-htscodecs to build the required functions from the htscodecs submodule. ])])], - [selected_htscodecs_mk="htscodecs_bundled.mk"]) + [AC_MSG_CHECKING([whether htscodecs files are present]) + AS_IF([test -e "$srcdir/htscodecs/htscodecs/rANS_static4x16.h"], + [AC_MSG_RESULT([yes]) + selected_htscodecs_mk="htscodecs_bundled.mk"], + [AC_MSG_RESULT([no]) + AS_IF([test -e "$srcdir/.git"], + [MSG_ERROR([htscodecs submodule files not present. + +HTSlib uses some functions from the htscodecs project, which is normally +included as a submodule. Try running: + + git submodule update --init --recursive + +to update it, and then re-run configure. +])], + [MSG_ERROR([htscodecs submodule files not present. + +You have an incomplete distribution. 
Please try downloading one of the +official releases from https://www.htslib.org +])])])]) AC_SUBST([selected_htscodecs_mk]) AS_IF([test "x$with_libdeflate" != "xno"], From f895c674f5f946f291d6c9f8d3fe956bab485800 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 5 Jan 2021 18:54:25 +0000 Subject: [PATCH 053/114] Add Makefile check for the htscodecs submodule Currently if the submodule files are missing it asks the user to run "git submodule update" to install them. It's possible that the Makefile could run this itself and continue, although I'm not sure if that would be completely safe in parallel builds. The variables listing dependencies for htscodecs header files are moved to htscodecs_bundled.mk to ensure they are only set for submodule builds. --- Makefile | 34 ++++++++++++++++++++-------------- htscodecs_bundled.mk | 14 ++++++++++++++ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 5b447cbce..5558799cb 100644 --- a/Makefile +++ b/Makefile @@ -212,20 +212,6 @@ sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) -htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h -htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h -htscodecs_pack_h = htscodecs/htscodecs/pack.h -htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h -htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h -htscodecs_rle_h = htscodecs/htscodecs/rle.h -htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h -htscodecs_varint_h = htscodecs/htscodecs/varint.h - -htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h -htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h -htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) -htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h - # To be effective, config.mk needs to
appear after most Makefile variables are # set but before most rules appear, so that it can both use previously-set # variables in its own rules' prerequisites and also update variables for use @@ -414,6 +400,26 @@ bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) +# Runes to check that the htscodecs submodule is present +ifdef HTSCODECS_SOURCES +htscodecs/htscodecs/%.c: | htscodecs/htscodecs + ; +htscodecs/htscodecs/%.h: | htscodecs/htscodecs + ; +htscodecs/htscodecs: + @if test -e .git ; then \ + printf "\\n\\nError: htscodecs submodule files not present.\\n\ + Try running: \\n\ + git submodule update --init --recursive\\n\ + and then re-run make.\\n\\n\\n" ; \ + else \ + printf "\\n\\nError: htscodecs submodule files not present and this is not a git checkout.\\n\ + You have an incomplete distribution. 
Please try downloading one of the\\n\ + official releases from https://www.htslib.org/\\n" ; \ + fi + @false +endif + # Maintainer source code checks # - copyright boilerplate presence # - tab and trailing space detection diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 5a8328e0e..d270773f7 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -8,6 +8,20 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) +htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h +htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h +htscodecs_pack_h = htscodecs/htscodecs/pack.h +htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h +htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h +htscodecs_rle_h = htscodecs/htscodecs/rle.h +htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h +htscodecs_varint_h = htscodecs/htscodecs/varint.h + +htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h +htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) +htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h + # Add htscodecs tests into the HTSlib test framework HTSCODECS_TEST_TARGETS = test_htscodecs_rans4x8 \ From 67805c7811a758da6a1e3ceef06004378de23460 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 2 Nov 2020 12:36:56 +0000 Subject: [PATCH 054/114] Added some simple introspection functions. These permit testing of explicit features, e.g. if (htslib_test_feature(HTS_FEATURE_PLUGINS)) { ... } or capturing the entire feature bit-field via htslib_features(). (This function may be redundant so perhaps is a candidate for culling?). This also permits querying compilation details: CC, CFLAGS, LDFLAGS and CPPFLAGS. 
Finally there is a htslib_feature_string() function which is mainly for verbose feature printing rather than programmatic parsing, but it conveniently also permits us to get features from a library simply by running "strings" on the binary. Note this doesn't actually do any testing so it's not in the Makefile. It's hard to know quite what testing it could do given by design the output would differ based on how the user built it. We could maybe call it to make sure it doesn't crash, but that's not likely to be something we need regression testing on. --- Makefile | 9 ++- hts_internal.h | 1 + hts_os.c | 147 ++++++++++++++++++++++++++++++++++++++ htslib/hts_os.h | 39 ++++++++++ plugin.c | 27 +++++++ test/test_introspection.c | 62 ++++++++++++++++ 6 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 test/test_introspection.c diff --git a/Makefile b/Makefile index eac4b809d..f598f66cc 100644 --- a/Makefile +++ b/Makefile @@ -90,7 +90,8 @@ BUILT_TEST_PROGRAMS = \ test/test-bcf-sr \ test/fuzz/hts_open_fuzzer.o \ test/test-bcf-translate \ - test/test-parse-reg + test/test-parse-reg \ + test/test_introspection BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -341,6 +342,8 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c +hts_os.o hts_os.pico: TMP_CPPFLAGS := $(CPPFLAGS) +hts_os.o hts_os.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) 
$(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) @@ -481,6 +484,9 @@ test/test-bcf-sr: test/test-bcf-sr.o libhts.a test/test-bcf-translate: test/test-bcf-translate.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread +test/test_introspection: test/test_introspection.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread + test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h) @@ -502,6 +508,7 @@ test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) +test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hts_os_h) test/thrash_threads1: test/thrash_threads1.o libhts.a diff --git a/hts_internal.h b/hts_internal.h index 4e5e20151..5ee88b7d0 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -104,6 +104,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * void *plugin_sym(void *plugin, const char *name, const char **errmsg); plugin_void_func *plugin_func(void 
*plugin, const char *name, const char **errmsg); void close_plugin(void *plugin); +const char *htslib_plugin_path(void); /* * Buffers up arguments to hts_idx_push for later use, once we've written all bar diff --git a/hts_os.c b/hts_os.c index c26700975..c533c8cd9 100644 --- a/hts_os.c +++ b/hts_os.c @@ -57,3 +57,150 @@ long hts_lrand48(void) { return lrand48(); } // #define USE_FILEEXTD // #include "os/iscygpty.c" // #endif + + +#include +#include +#include "hts_internal.h" +#include "htslib/hts.h" +#include "htslib/hts_os.h" +#include "htslib/kstring.h" + +unsigned int htslib_features(void) { + unsigned int feat = 0; + +#ifdef PACKAGE_URL + feat |= HTS_FEATURE_CONFIGURE; +#endif + +#ifdef ENABLE_PLUGINS + feat |= HTS_FEATURE_PLUGINS; +#endif + +#ifdef HAVE_LIBCURL + feat |= HTS_FEATURE_LIBCURL; +#endif + +#ifdef ENABLE_S3 + feat |= HTS_FEATURE_S3; +#endif + +#ifdef ENABLE_GCS + feat |= HTS_FEATURE_GCS; +#endif + +#ifdef HAVE_LIBDEFLATE + feat |= HTS_FEATURE_LIBDEFLATE; +#endif + +#ifdef HAVE_LIBLZMA + feat |= HTS_FEATURE_LZMA; +#endif + +#ifdef HAVE_LIBBZ2 + feat |= HTS_FEATURE_BZIP2; +#endif + + return feat; +} + +const char *htslib_test_feature(int id) { + int feat = htslib_features(); + + switch (id) { + case HTS_FEATURE_CONFIGURE: + return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; + case HTS_FEATURE_PLUGINS: + return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; + case HTS_FEATURE_LIBCURL: + return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; + case HTS_FEATURE_S3: + return feat & HTS_FEATURE_S3 ? "yes" : NULL; + case HTS_FEATURE_GCS: + return feat & HTS_FEATURE_GCS ? "yes" : NULL; + case HTS_FEATURE_LIBDEFLATE: + return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; + case HTS_FEATURE_BZIP2: + return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; + case HTS_FEATURE_LZMA: + return feat & HTS_FEATURE_LZMA ? 
"yes" : NULL; + + case HTS_FEATURE_CC: + return HTS_CC; + case HTS_FEATURE_CFLAGS: + return HTS_CFLAGS; + case HTS_FEATURE_LDFLAGS: + return HTS_LDFLAGS; + case HTS_FEATURE_CPPFLAGS: + return HTS_CPPFLAGS; + + default: + fprintf(stderr, "Unknown feature code: %d\n", id); + } + + return NULL; +} + +// Note this implementation also means we can just "strings" the library +// to find the configuration parameters. +const char *htslib_feature_string(void) { + const char *fmt= + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef ENABLE_PLUGINS + "plugins=yes, plugin-path=%.1000s " +#else + "plugins=no " +#endif + +#ifdef HAVE_LIBCURL + "libcurl=yes " +#else + "libcurl=no " +#endif + +#ifdef ENABLE_S3 + "S3=yes " +#else + "S3=no " +#endif + +#ifdef ENABLE_GCS + "GCS=yes " +#else + "GCS=no " +#endif + +#ifdef HAVE_LIBDEFLATE + "libdeflate=yes " +#else + "libdeflate=no " +#endif + +#ifdef HAVE_LIBLZMA + "lzma=yes " +#else + "lzma=no " +#endif + +#ifdef HAVE_LIBBZ2 + "bzip2=yes "; +#else + "bzip2=no "; +#endif + +#ifdef ENABLE_PLUGINS + static char config[1200]; + sprintf(config, fmt, htslib_plugin_path()); + return config; +#else + return fmt; +#endif +} + +// Plus hts_version here? diff --git a/htslib/hts_os.h b/htslib/hts_os.h index 2f988c3c9..eab438b54 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -77,4 +77,43 @@ extern int is_cygpty(int fd); #define random rand #endif +/*! @abstract Introspection on the features enabled in htslib + * + * @return a bitfield of HTS_FEATURE_* macros. + */ +HTSLIB_EXPORT +unsigned int htslib_features(void); + +HTSLIB_EXPORT +const char *htslib_test_feature(int id); + +/*! 
@abstract Introspection on the features enabled in htslib, string form + * + * @return a string describing htslib build features + */ +HTSLIB_EXPORT +const char *htslib_feature_string(void); + +// Whether ./configure was used or vanilla Makefile +#define HTS_FEATURE_CONFIGURE 1 + +// Also see htslib_plugin_path function +#define HTS_FEATURE_PLUGINS 2 + +// Transport specific +#define HTS_FEATURE_LIBCURL 4 +#define HTS_FEATURE_S3 8 +#define HTS_FEATURE_GCS 16 + +// Compression options +#define HTS_FEATURE_LIBDEFLATE 32 +#define HTS_FEATURE_LZMA 64 +#define HTS_FEATURE_BZIP2 128 + +// Build params +#define HTS_FEATURE_CC (1<<28) +#define HTS_FEATURE_CFLAGS (1<<29) +#define HTS_FEATURE_LDFLAGS (1<<30) +#define HTS_FEATURE_CPPFLAGS (1<<31) + #endif diff --git a/plugin.c b/plugin.c index d5c1981ca..fccc83793 100644 --- a/plugin.c +++ b/plugin.c @@ -191,3 +191,30 @@ void close_plugin(void *plugin) __func__, dlerror()); } } + +const char *htslib_plugin_path(void) { +#ifdef ENABLE_PLUGINS + char *path = getenv("HTS_PATH"); + if (!path) path = ""; + + kstring_t ks = {0}; + while(1) { + size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); + if (len == 0) kputs(PLUGINPATH, &ks); + else kputsn(path, len, &ks); + kputc(HTS_PATH_SEPARATOR_CHAR, &ks); + + path += len; + if (*path == HTS_PATH_SEPARATOR_CHAR) path++; + else break; + } + + static char s_path[1024]; + sprintf(s_path, "%.1023s", ks.s ? ks.s : ""); + free(ks.s); + + return s_path; +#else + return NULL; +#endif +} diff --git a/test/test_introspection.c b/test/test_introspection.c new file mode 100644 index 000000000..6bce199dc --- /dev/null +++ b/test/test_introspection.c @@ -0,0 +1,62 @@ +/* test/test_introspection.c -- demonstration of introspection function usage + + Copyright (C) 2020 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include + +#include "../htslib/hts.h" +#include "../htslib/hts_os.h" + +int main(void) { + printf("Version string: %s\n", hts_version()); + printf("Version number: %d\n", HTS_VERSION); + + printf("\nCC: %s\n", htslib_test_feature(HTS_FEATURE_CC)); + printf("CPPFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CPPFLAGS)); + printf("CFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CFLAGS)); + printf("LDFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_LDFLAGS)); + + unsigned int feat = htslib_features(); + printf("\nFeature number: 0x%x\n", feat); + if (feat & HTS_FEATURE_CONFIGURE) + printf(" HTS_FEATURE_CONFIGURE\n"); + if (feat & HTS_FEATURE_PLUGINS) + printf(" HTS_FEATURE_PLUGINS\n"); + if (feat & HTS_FEATURE_LIBCURL) + printf(" HTS_FEATURE_LIBCURL\n"); + if (feat & HTS_FEATURE_S3) + printf(" HTS_FEATURE_S3\n"); + if (feat & HTS_FEATURE_GCS) + printf(" HTS_FEATURE_GCS\n"); + if (feat & HTS_FEATURE_LIBDEFLATE) + printf(" HTS_FEATURE_LIBDEFLATE\n"); + if (feat & HTS_FEATURE_LZMA) + printf(" HTS_FEATURE_LZMA\n"); + if (feat & HTS_FEATURE_BZIP2) + printf(" HTS_FEATURE_BZIP2\n"); + + printf("\nFeature string: %s\n", htslib_feature_string()); + + return 0; +} From cd48384237cd63d5b1e1a42e1946e6e59143f790 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Nov 2020 15:24:06 +0000 Subject: [PATCH 055/114] Added a plugin and scheme query API --- hfile.c | 105 ++++++++++++++++++++++++++++++++++++++ hts_os.c | 2 +- htslib/hfile.h | 48 +++++++++++++++++ test/test_introspection.c | 24 ++++++++- 4 files changed, 177 insertions(+), 2 deletions(-) diff --git a/hfile.c b/hfile.c index 218520a7c..35b4adac7 100644 --- a/hfile.c +++ b/hfile.c @@ -1142,6 +1142,111 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) return (k != kh_end(schemes))? kh_value(schemes, k) : &unknown_scheme; } + +/* + * Fills out sc_list[] with the list of known schemes. + * This can be restricted to just ones from a specific plugin, + * or all (plugin == NULL). 
+ * + * Returns number of schemes found on success; + * -1 on failure. + */ +HTSLIB_EXPORT +int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + khiter_t k; + int ns = 0; + + for (k = kh_begin(schemes); k != kh_end(schemes); k++) { + if (!kh_exist(schemes, k)) + continue; + + const struct hFILE_scheme_handler *s = kh_value(schemes, k); + if (plugin && strcmp(s->provider, plugin) != 0) + continue; + + if (ns < *nschemes) + sc_list[ns] = kh_key(schemes, k); + ns++; + } + + if (*nschemes > ns) + *nschemes = ns; + + return ns; +} + + +/* + * Fills out plist[] with the list of known plugins. + * + * Returns number of schemes found on success; + * -1 on failure + */ +HTSLIB_EXPORT +int hts_list_plugins(const char *plist[], int *nplugins) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + int np = 0; + if (*nplugins) + plist[np++] = "built-in"; + + struct hFILE_plugin_list *p = plugins; + while (p) { + if (np < *nplugins) + plist[np] = p->plugin.name; + + p = p->next; + np++; + } + + if (*nplugins > np) + *nplugins = np; + + return np; +} + + +/* + * Tests for the presence of a specific plugin. + * + * Returns 1 if true + * 0 otherwise + */ +HTSLIB_EXPORT +int htslib_has_plugin(const char *name) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + struct hFILE_plugin_list *p = plugins; + while (p) { + if (strcmp(p->plugin.name, name) == 0) + return 1; + p = p->next; + } + + return 0; +} + + hFILE *hopen(const char *fname, const char *mode, ...) 
{ const struct hFILE_scheme_handler *handler = find_scheme_handler(fname); diff --git a/hts_os.c b/hts_os.c index c533c8cd9..adcf222c9 100644 --- a/hts_os.c +++ b/hts_os.c @@ -1,7 +1,7 @@ /// @file hts_os.c /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019, 2020 Genome Research Ltd. Author: James Bonfield diff --git a/htslib/hfile.h b/htslib/hfile.h index 08d3edf7e..9ca489d2d 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -316,6 +316,54 @@ purpose other than closing. HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); +/// Fills out sc_list[] with the list of known schemes. +/** + * @param plugin [in] Restricts schemes to only those from 'plugin. + * @param sc_list [out] Filled out with the scheme names + * @param nschemes [in/out] Size of sc_list (in) and number returned (out) + * + * Plugin may be passed in as NULL in which case all schemes are returned. + * Use plugin "built-in" to list the built in schemes. + * The size of sc_list is determined by the input value of *nschemes. + * This is updated to return the output size. It is up to the caller to + * determine whether to call again with a larger number if this is too small. + * + * The return value represents the total number found matching plugin, which + * may be larger than *nschemes if too small a value was specified. + * + * @return the number of schemes found on success. + * -1 on failure + */ +HTSLIB_EXPORT +int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); + +/// Fills out plist[] with the list of known plugins. +/* + * @param plist [out] Filled out with the plugin names + * @param nplugins [in/out] Size of plist (in) and number returned (out) + * + * The size of plist is determined by the input value of *nplugins. + * This is updated to return the output size. 
It is up to the caller to + * determine whether to call again with a larger number if this is too small. + * + * The return value represents the total number found, which may be + * larger than *nplugins if too small a value was specified. + * + * @return the number of plugins found on success. + * -1 on failure + */ +HTSLIB_EXPORT +int hts_list_plugins(const char *plist[], int *nplugins); + +/// Tests for the presence of a specific plugin. +/* + * @param name The name of the plugin to query. + * + * @return 1 if found, 0 otherwise. + */ +HTSLIB_EXPORT +int htslib_has_plugin(const char *name); + #ifdef __cplusplus } #endif diff --git a/test/test_introspection.c b/test/test_introspection.c index 6bce199dc..ea90430c4 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -26,6 +26,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "../htslib/hts.h" +#include "../htslib/hfile.h" #include "../htslib/hts_os.h" int main(void) { @@ -36,7 +37,7 @@ int main(void) { printf("CPPFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CPPFLAGS)); printf("CFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CFLAGS)); printf("LDFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_LDFLAGS)); - + unsigned int feat = htslib_features(); printf("\nFeature number: 0x%x\n", feat); if (feat & HTS_FEATURE_CONFIGURE) @@ -58,5 +59,26 @@ int main(void) { printf("\nFeature string: %s\n", htslib_feature_string()); + + // Plugins and schemes + printf("\nPlugins present:\n"); + const char *plugins[100]; + int np = 100, i, j; + + if (hts_list_plugins(plugins, &np) < 0) + return 1; + + for (i = 0; i < np; i++) { + const char *sc_list[100]; + int nschemes = 100; + if (hts_list_schemes(plugins[i], sc_list, &nschemes) < 0) + return 1; + + printf(" %s:\n", plugins[i]); + for (j = 0; j < nschemes; j++) + printf("\t%s\n", sc_list[j]); + puts(""); + } + return 0; } From b5ed0101b6d4d8345cc0211ea880537c003c461b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Nov 2020 16:18:44 +0000 Subject: 
[PATCH 056/114] Moved the introspection code from hts_os.[ch] to hfile.[ch]. Some of it needs to be there anyway for the scheme and plugin APIs. --- Makefile | 6 +- hfile.c | 147 +++++++++++++++++++++++++++++++++++++ hts_os.c | 149 +------------------------------------- htslib/hfile.h | 39 ++++++++++ htslib/hts_os.h | 41 +---------- test/test_introspection.c | 1 - 6 files changed, 191 insertions(+), 192 deletions(-) diff --git a/Makefile b/Makefile index f598f66cc..b02cbf0a9 100644 --- a/Makefile +++ b/Makefile @@ -335,6 +335,8 @@ errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htsl kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) +hfile.o hfile.pico: TMP_CPPFLAGS := $(CPPFLAGS) +hfile.o hfile.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) @@ -342,8 +344,6 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h 
$(htslib_hts_defs_h) os/rand.c -hts_os.o hts_os.pico: TMP_CPPFLAGS := $(CPPFLAGS) -hts_os.o hts_os.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) @@ -508,7 +508,7 @@ test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) -test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hts_os_h) +test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) test/thrash_threads1: test/thrash_threads1.o libhts.a diff --git a/hfile.c b/hfile.c index 35b4adac7..e5861c25c 100644 --- a/hfile.c +++ b/hfile.c @@ -1143,6 +1143,10 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) } +/*************************** + * Library introspection functions + ***************************/ + /* * Fills out sc_list[] with the list of known schemes. 
* This can be restricted to just ones from a specific plugin, @@ -1246,6 +1250,149 @@ int htslib_has_plugin(const char *name) return 0; } +HTSLIB_EXPORT +unsigned int htslib_features(void) { + unsigned int feat = 0; + +#ifdef PACKAGE_URL + feat |= HTS_FEATURE_CONFIGURE; +#endif + +#ifdef ENABLE_PLUGINS + feat |= HTS_FEATURE_PLUGINS; +#endif + +#ifdef HAVE_LIBCURL + feat |= HTS_FEATURE_LIBCURL; +#endif + +#ifdef ENABLE_S3 + feat |= HTS_FEATURE_S3; +#endif + +#ifdef ENABLE_GCS + feat |= HTS_FEATURE_GCS; +#endif + +#ifdef HAVE_LIBDEFLATE + feat |= HTS_FEATURE_LIBDEFLATE; +#endif + +#ifdef HAVE_LIBLZMA + feat |= HTS_FEATURE_LZMA; +#endif + +#ifdef HAVE_LIBBZ2 + feat |= HTS_FEATURE_BZIP2; +#endif + + return feat; +} + +HTSLIB_EXPORT +const char *htslib_test_feature(int id) { + int feat = htslib_features(); + + switch (id) { + case HTS_FEATURE_CONFIGURE: + return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; + case HTS_FEATURE_PLUGINS: + return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; + case HTS_FEATURE_LIBCURL: + return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; + case HTS_FEATURE_S3: + return feat & HTS_FEATURE_S3 ? "yes" : NULL; + case HTS_FEATURE_GCS: + return feat & HTS_FEATURE_GCS ? "yes" : NULL; + case HTS_FEATURE_LIBDEFLATE: + return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; + case HTS_FEATURE_BZIP2: + return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; + case HTS_FEATURE_LZMA: + return feat & HTS_FEATURE_LZMA ? "yes" : NULL; + + case HTS_FEATURE_CC: + return HTS_CC; + case HTS_FEATURE_CFLAGS: + return HTS_CFLAGS; + case HTS_FEATURE_LDFLAGS: + return HTS_LDFLAGS; + case HTS_FEATURE_CPPFLAGS: + return HTS_CPPFLAGS; + + default: + fprintf(stderr, "Unknown feature code: %d\n", id); + } + + return NULL; +} + +// Note this implementation also means we can just "strings" the library +// to find the configuration parameters. 
+HTSLIB_EXPORT +const char *htslib_feature_string(void) { + const char *fmt= + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef ENABLE_PLUGINS + "plugins=yes, plugin-path=%.1000s " +#else + "plugins=no " +#endif + +#ifdef HAVE_LIBCURL + "libcurl=yes " +#else + "libcurl=no " +#endif + +#ifdef ENABLE_S3 + "S3=yes " +#else + "S3=no " +#endif + +#ifdef ENABLE_GCS + "GCS=yes " +#else + "GCS=no " +#endif + +#ifdef HAVE_LIBDEFLATE + "libdeflate=yes " +#else + "libdeflate=no " +#endif + +#ifdef HAVE_LIBLZMA + "lzma=yes " +#else + "lzma=no " +#endif + +#ifdef HAVE_LIBBZ2 + "bzip2=yes "; +#else + "bzip2=no "; +#endif + +#ifdef ENABLE_PLUGINS + static char config[1200]; + sprintf(config, fmt, htslib_plugin_path()); + return config; +#else + return fmt; +#endif +} + +/*************************** + * hFILE interface proper + ***************************/ hFILE *hopen(const char *fname, const char *mode, ...) { diff --git a/hts_os.c b/hts_os.c index adcf222c9..c26700975 100644 --- a/hts_os.c +++ b/hts_os.c @@ -1,7 +1,7 @@ /// @file hts_os.c /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019, 2020 Genome Research Ltd. + Copyright (C) 2017, 2019 Genome Research Ltd. 
Author: James Bonfield @@ -57,150 +57,3 @@ long hts_lrand48(void) { return lrand48(); } // #define USE_FILEEXTD // #include "os/iscygpty.c" // #endif - - -#include -#include -#include "hts_internal.h" -#include "htslib/hts.h" -#include "htslib/hts_os.h" -#include "htslib/kstring.h" - -unsigned int htslib_features(void) { - unsigned int feat = 0; - -#ifdef PACKAGE_URL - feat |= HTS_FEATURE_CONFIGURE; -#endif - -#ifdef ENABLE_PLUGINS - feat |= HTS_FEATURE_PLUGINS; -#endif - -#ifdef HAVE_LIBCURL - feat |= HTS_FEATURE_LIBCURL; -#endif - -#ifdef ENABLE_S3 - feat |= HTS_FEATURE_S3; -#endif - -#ifdef ENABLE_GCS - feat |= HTS_FEATURE_GCS; -#endif - -#ifdef HAVE_LIBDEFLATE - feat |= HTS_FEATURE_LIBDEFLATE; -#endif - -#ifdef HAVE_LIBLZMA - feat |= HTS_FEATURE_LZMA; -#endif - -#ifdef HAVE_LIBBZ2 - feat |= HTS_FEATURE_BZIP2; -#endif - - return feat; -} - -const char *htslib_test_feature(int id) { - int feat = htslib_features(); - - switch (id) { - case HTS_FEATURE_CONFIGURE: - return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; - case HTS_FEATURE_PLUGINS: - return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; - case HTS_FEATURE_LIBCURL: - return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; - case HTS_FEATURE_S3: - return feat & HTS_FEATURE_S3 ? "yes" : NULL; - case HTS_FEATURE_GCS: - return feat & HTS_FEATURE_GCS ? "yes" : NULL; - case HTS_FEATURE_LIBDEFLATE: - return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; - case HTS_FEATURE_BZIP2: - return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; - case HTS_FEATURE_LZMA: - return feat & HTS_FEATURE_LZMA ? "yes" : NULL; - - case HTS_FEATURE_CC: - return HTS_CC; - case HTS_FEATURE_CFLAGS: - return HTS_CFLAGS; - case HTS_FEATURE_LDFLAGS: - return HTS_LDFLAGS; - case HTS_FEATURE_CPPFLAGS: - return HTS_CPPFLAGS; - - default: - fprintf(stderr, "Unknown feature code: %d\n", id); - } - - return NULL; -} - -// Note this implementation also means we can just "strings" the library -// to find the configuration parameters. 
-const char *htslib_feature_string(void) { - const char *fmt= - -#ifdef PACKAGE_URL - "build=configure " -#else - "build=Makefile " -#endif - -#ifdef ENABLE_PLUGINS - "plugins=yes, plugin-path=%.1000s " -#else - "plugins=no " -#endif - -#ifdef HAVE_LIBCURL - "libcurl=yes " -#else - "libcurl=no " -#endif - -#ifdef ENABLE_S3 - "S3=yes " -#else - "S3=no " -#endif - -#ifdef ENABLE_GCS - "GCS=yes " -#else - "GCS=no " -#endif - -#ifdef HAVE_LIBDEFLATE - "libdeflate=yes " -#else - "libdeflate=no " -#endif - -#ifdef HAVE_LIBLZMA - "lzma=yes " -#else - "lzma=no " -#endif - -#ifdef HAVE_LIBBZ2 - "bzip2=yes "; -#else - "bzip2=no "; -#endif - -#ifdef ENABLE_PLUGINS - static char config[1200]; - sprintf(config, fmt, htslib_plugin_path()); - return config; -#else - return fmt; -#endif -} - -// Plus hts_version here? diff --git a/htslib/hfile.h b/htslib/hfile.h index 9ca489d2d..d960ab3db 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -364,6 +364,45 @@ int hts_list_plugins(const char *plist[], int *nplugins); HTSLIB_EXPORT int htslib_has_plugin(const char *name); +/*! @abstract Introspection on the features enabled in htslib + * + * @return a bitfield of HTS_FEATURE_* macros. + */ +HTSLIB_EXPORT +unsigned int htslib_features(void); + +HTSLIB_EXPORT +const char *htslib_test_feature(int id); + +/*! 
@abstract Introspection on the features enabled in htslib, string form + * + * @return a string describing htslib build features + */ +HTSLIB_EXPORT +const char *htslib_feature_string(void); + +// Whether ./configure was used or vanilla Makefile +#define HTS_FEATURE_CONFIGURE 1 + +// Also see htslib_plugin_path function +#define HTS_FEATURE_PLUGINS 2 + +// Transport specific +#define HTS_FEATURE_LIBCURL 4 +#define HTS_FEATURE_S3 8 +#define HTS_FEATURE_GCS 16 + +// Compression options +#define HTS_FEATURE_LIBDEFLATE 32 +#define HTS_FEATURE_LZMA 64 +#define HTS_FEATURE_BZIP2 128 + +// Build params +#define HTS_FEATURE_CC (1<<28) +#define HTS_FEATURE_CFLAGS (1<<29) +#define HTS_FEATURE_LDFLAGS (1<<30) +#define HTS_FEATURE_CPPFLAGS (1<<31) + #ifdef __cplusplus } #endif diff --git a/htslib/hts_os.h b/htslib/hts_os.h index eab438b54..3461df512 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -77,43 +77,4 @@ extern int is_cygpty(int fd); #define random rand #endif -/*! @abstract Introspection on the features enabled in htslib - * - * @return a bitfield of HTS_FEATURE_* macros. - */ -HTSLIB_EXPORT -unsigned int htslib_features(void); - -HTSLIB_EXPORT -const char *htslib_test_feature(int id); - -/*! 
@abstract Introspection on the features enabled in htslib, string form - * - * @return a string describing htslib build features - */ -HTSLIB_EXPORT -const char *htslib_feature_string(void); - -// Whether ./configure was used or vanilla Makefile -#define HTS_FEATURE_CONFIGURE 1 - -// Also see htslib_plugin_path function -#define HTS_FEATURE_PLUGINS 2 - -// Transport specific -#define HTS_FEATURE_LIBCURL 4 -#define HTS_FEATURE_S3 8 -#define HTS_FEATURE_GCS 16 - -// Compression options -#define HTS_FEATURE_LIBDEFLATE 32 -#define HTS_FEATURE_LZMA 64 -#define HTS_FEATURE_BZIP2 128 - -// Build params -#define HTS_FEATURE_CC (1<<28) -#define HTS_FEATURE_CFLAGS (1<<29) -#define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1<<31) - -#endif +#endif // HTSLIB_HTS_OS_H diff --git a/test/test_introspection.c b/test/test_introspection.c index ea90430c4..527a22aa7 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -27,7 +27,6 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/hts.h" #include "../htslib/hfile.h" -#include "../htslib/hts_os.h" int main(void) { printf("Version string: %s\n", hts_version()); From 410f2b2e4174772d504f3d288c085359878a38e9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Nov 2020 16:26:11 +0000 Subject: [PATCH 057/114] Untabify and remove gcc -fpedantic pedantry. --- hfile.c | 2 +- htslib/hfile.h | 4 ++-- test/test_introspection.c | 16 ++++++++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/hfile.c b/hfile.c index e5861c25c..01bace09d 100644 --- a/hfile.c +++ b/hfile.c @@ -1219,7 +1219,7 @@ int hts_list_plugins(const char *plist[], int *nplugins) if (*nplugins > np) *nplugins = np; - + return np; } diff --git a/htslib/hfile.h b/htslib/hfile.h index d960ab3db..e0aea9008 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -316,7 +316,7 @@ purpose other than closing. 
HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); -/// Fills out sc_list[] with the list of known schemes. +/// Fills out sc_list[] with the list of known schemes. /** * @param plugin [in] Restricts schemes to only those from 'plugin. * @param sc_list [out] Filled out with the scheme names @@ -401,7 +401,7 @@ const char *htslib_feature_string(void); #define HTS_FEATURE_CC (1<<28) #define HTS_FEATURE_CFLAGS (1<<29) #define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1<<31) +#define HTS_FEATURE_CPPFLAGS (1u<<31) #ifdef __cplusplus } diff --git a/test/test_introspection.c b/test/test_introspection.c index 527a22aa7..9ee7d1aa3 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -40,21 +40,21 @@ int main(void) { unsigned int feat = htslib_features(); printf("\nFeature number: 0x%x\n", feat); if (feat & HTS_FEATURE_CONFIGURE) - printf(" HTS_FEATURE_CONFIGURE\n"); + printf(" HTS_FEATURE_CONFIGURE\n"); if (feat & HTS_FEATURE_PLUGINS) - printf(" HTS_FEATURE_PLUGINS\n"); + printf(" HTS_FEATURE_PLUGINS\n"); if (feat & HTS_FEATURE_LIBCURL) - printf(" HTS_FEATURE_LIBCURL\n"); + printf(" HTS_FEATURE_LIBCURL\n"); if (feat & HTS_FEATURE_S3) - printf(" HTS_FEATURE_S3\n"); + printf(" HTS_FEATURE_S3\n"); if (feat & HTS_FEATURE_GCS) - printf(" HTS_FEATURE_GCS\n"); + printf(" HTS_FEATURE_GCS\n"); if (feat & HTS_FEATURE_LIBDEFLATE) - printf(" HTS_FEATURE_LIBDEFLATE\n"); + printf(" HTS_FEATURE_LIBDEFLATE\n"); if (feat & HTS_FEATURE_LZMA) - printf(" HTS_FEATURE_LZMA\n"); + printf(" HTS_FEATURE_LZMA\n"); if (feat & HTS_FEATURE_BZIP2) - printf(" HTS_FEATURE_BZIP2\n"); + printf(" HTS_FEATURE_BZIP2\n"); printf("\nFeature string: %s\n", htslib_feature_string()); From 481bd2226195d3d6f9bc79f9b80d9750b2b43b76 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Fri, 15 Jan 2021 21:15:42 +0200 Subject: [PATCH 058/114] Fix iteration over CIGARs with indels in tweak_overlap_quality (PR #1202) Reset icig to -1 at the end of a CIGAR matching 
segment or in case of indels, clips and skips, so that the next MATCH starts from 0 index. Co-authored-by: Valeriu Ohan Co-authored-by: wulj2 --- sam.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sam.c b/sam.c index dbfc42e56..ea5b968f1 100644 --- a/sam.c +++ b/sam.c @@ -4816,14 +4816,14 @@ static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, ht if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) { - if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; } + if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; } (*iseq)++; (*icig)++; (*iref)++; return BAM_CMATCH; } - if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; } - if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } - if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } - if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } + if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; } + if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } + if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } + if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; } hts_log_error("Unexpected cigar %d", cig); return -2; } From 13fe974f75458edd490ed841f60bbcd0cef58ee1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 14 Jan 2021 11:47:37 +0000 Subject: [PATCH 059/114] Re-enable changing CRAM compression level via hts_set_opt. This bug was introduced during #1181 where the additional HTS_OPT_FILTER meant the automatic fall-through from HTS_OPT_COMPRESSION_LEVEL no longer applied. I decided against continuing the previous obscurity of moving HTS_OPT_COMPRESSION_LEVEL to the end of the switch and having no return (thus falling into the cram_set_voption call), instead favouring an explicit call.
An example command that demonstrated the problem was: samtools view -O cram,level=1 in.bam -o out.cram --- hts.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hts.c b/hts.c index 9e50290cf..07c9d604a 100644 --- a/hts.c +++ b/hts.c @@ -1340,6 +1340,8 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_end(args); if (fp->is_bgzf) fp->fp.bgzf->compress_level = level; + else if (fp->format.format == cram) + return cram_set_option(fp->fp.cram, opt, level); return 0; } From f39dae751dae8c1abf8477074c5fbaea66d229df Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 13 Jan 2021 17:34:40 +0000 Subject: [PATCH 060/114] Fix incorrect setting of end position in overlap_push() Commit a2fdf3b changed bam_endpos() so that it would consider alignments which consume no bases (e.g. fully soft clipped) as one base long. This had the side effect of making overlap_push() change the position stored in a->end, causing reads that would previously have been ignored to be added to the pileup. As resolve_cigar2() was not designed to handle such reads it resulted in nonsensical values being set in the pileup structures for these reads. The update to a->end in overlap_push() is not necessary anyway as it will already have been set correctly in bam_plp_push(), so the simplest solution is to remove the call to bam_endpos() in overlap_push(). Pointers to the cigar data in tweak_overlap_quality() have been changed to const to ensure it makes no changes that would change the alignment end position. Fixes samtools/bcftools#1362 (bcftools mpileup seg fault for reads with only 1 match). --- sam.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index ea5b968f1..884eecd6a 100644 --- a/sam.c +++ b/sam.c @@ -4772,7 +4772,11 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error.
*/ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_set(const uint32_t **cigar, + const uint32_t *cigar_max, + hts_pos_t *icig, + hts_pos_t *iseq, + hts_pos_t *iref) { hts_pos_t pos = *iref; if ( pos < 0 ) return -1; @@ -4807,7 +4811,11 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_next(const uint32_t **cigar, + const uint32_t *cigar_max, + hts_pos_t *icig, + hts_pos_t *iseq, + hts_pos_t *iref) { while ( *cigar < cigar_max ) { @@ -4834,8 +4842,8 @@ static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, ht static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { - uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; - uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; + const uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; + const uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; hts_pos_t a_icig = 0, a_iseq = 0; hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); @@ -4945,8 +4953,6 @@ static int overlap_push(bam_plp_t iter, lbnode_t *node) int err = tweak_overlap_quality(&a->b, &node->b); kh_del(olap_hash, iter->overlaps, kitr); assert(a->end-1 == a->s.end); - a->end = bam_endpos(&a->b); - a->s.end = a->end - 1; return err; } return 0; From 1c930612f2049639cf832526752f80415125930a Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 23 Jan 2021 05:05:50 +0000 Subject: [PATCH 061/114] Update links to curl project website See https://daniel.haxx.se/blog/2020/11/04/the-journey-to-a-curl-domain/ --- INSTALL | 2 +- configure.ac | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/INSTALL b/INSTALL index 92a358a73..582f9b69e 100644 --- a/INSTALL +++ b/INSTALL @@ -145,7 +145,7 @@ various features and specify further optional external requirements: on to cause additional directories to be searched. --enable-libcurl - Use libcurl () to implement network access to + Use libcurl () to implement network access to remote files via FTP, HTTP, HTTPS, etc. --enable-gcs diff --git a/configure.ac b/configure.ac index f990c370b..6b76c00a1 100644 --- a/configure.ac +++ b/configure.ac @@ -362,7 +362,7 @@ if test "$enable_libcurl" != no; then *) MSG_ERROR([libcurl $message Support for HTTPS and other SSL-based URLs requires routines from the libcurl -library . Building HTSlib with libcurl enabled +library . Building HTSlib with libcurl enabled requires libcurl development files to be installed on the build machine; you may need to ensure a package such as libcurl4-{gnutls,nss,openssl}-dev (on Debian or Ubuntu Linux) or libcurl-devel (on RPM-based Linux distributions From 79d8c948e6aaec343e6f07d9bab9f7edcf1fa368 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 21 Jan 2021 16:42:34 +0000 Subject: [PATCH 062/114] Fix some fuzz issues in CRAM 3.1. Credit to OSS-Fuzz - Check for return value from RANS4x16 and ARITH codecs so a valid stream that is the wrong size still frees up the memory. 
Fixes oss-fuzz 29786 Fixes oss-fuzz 29787 Fixes oss-fuzz 29796 - Check for return value from TOK3 codec so error is bubbled up Fixes oss-fuzz 29789 --- cram/cram_io.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 4f95f940a..30f109d6e 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1680,8 +1680,12 @@ int cram_uncompress_block(cram_block *b) { case RANS_PR0: { unsigned int usize = b->uncomp_size, usize2; uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); - if (!uncomp || usize != usize2) + if (!uncomp) + return -1; + if (usize != usize2) { + free(uncomp); return -1; + } b->orig_method = RANS_PR0 + (b->data[0]&1) + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); free(b->data); @@ -1696,8 +1700,12 @@ int cram_uncompress_block(cram_block *b) { case ARITH_PR0: { unsigned int usize = b->uncomp_size, usize2; uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); - if (!uncomp || usize != usize2) + if (!uncomp) return -1; + if (usize != usize2) { + free(uncomp); + return -1; + } b->orig_method = ARITH_PR0 + (b->data[0]&1) + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); free(b->data); @@ -1712,6 +1720,8 @@ int cram_uncompress_block(cram_block *b) { case TOK3: { uint32_t out_len; uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); + if (!cp) + return -1; b->orig_method = TOK3; b->method = RAW; free(b->data); From f4235784d7feb75ed0a15afa3f624029172c39bb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 21 Jan 2021 17:00:03 +0000 Subject: [PATCH 063/114] Fix memory leak with malformed huffman data. 
Credit to OSS-Fuzz Fixes oss-fuzz 29780 --- cram/cram_codecs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 783fc393d..2fcce4ed1 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -2758,8 +2758,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, for (i = 0; i < ncodes; i++) codes[i].symbol = vv->varint_get32(&cp, data_end, &err); } else { - free(h); - return NULL; + goto malformed; } if (err) @@ -2794,9 +2793,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, if (max_len > max_code_bits) { hts_log_error("Huffman code length (%d) is greater " "than maximum supported (%d)", max_len, max_code_bits); - free(h); - free(codes); - return NULL; + goto malformed; } /* Sort by bit length and then by symbol value */ From 23a67495c5372bacca90a6e1f5ab6b79afc314fc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 21 Jan 2021 17:10:37 +0000 Subject: [PATCH 064/114] Fix missing rANS_word.h dependency. --- htscodecs_bundled.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index d270773f7..328a86b0c 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -18,6 +18,7 @@ htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h htscodecs_varint_h = htscodecs/htscodecs/varint.h htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h From 40afaaa5bf7e3f9cd6d6d9d9d8ae26496df82aea Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 22 Jan 2021 14:37:37 +0000 Subject: [PATCH 065/114] Fixed some more malformed data bugs, this time XPACK. This is an experimental encoding for CRAM 4.0 draft. 
We protect against nbits and nval being out of range. Fixes a buffer overrun. Credit to OSS-Fuzz Fixes oss-fuzz 29827 --- cram/cram_codecs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 2fcce4ed1..067319fc6 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1400,7 +1400,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, c->decode = cram_xpack_decode_char; else { fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); - return NULL; + goto malformed; } c->free = cram_xpack_decode_free; c->size = cram_xpack_decode_size; @@ -1408,6 +1408,9 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL); c->u.xpack.nval = vv->varint_get32(&cp, endp, NULL); + if (c->u.xpack.nbits >= 8 || c->u.xpack.nbits < 0 || + c->u.xpack.nval > 256 || c->u.xpack.nval < 0) + goto malformed; int i; for (i = 0; i < c->u.xpack.nval; i++) { uint32_t v = vv->varint_get32(&cp, endp, NULL); From a01a139ac1fa365a688ec8734d62ef859a8bc416 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 22 Jan 2021 18:49:40 +0000 Subject: [PATCH 066/114] Pull in htscodecs bug fixes * Fix 1-byte buffer overruns in arith_uncompress_to() and rans_uncompress_to_4x16() * Fixes various cases of undefined behaviour. * Adds a limit on max_names in the name tokeniser to prevent excess memory use. 
Credit to OSS-Fuzz Fixes oss-fuzz 29785 Fixes oss-fuzz 29789 Fixes oss-fuzz 29792 Fixes oss-fuzz 29795 Fixes oss-fuzz 29802 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index dca826bbc..e8d8597e7 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit dca826bbc598d6dcae450e92e60dcf789454999d +Subproject commit e8d8597e7fa7fd27a4cd5c9ff00ff97c37a8b456 From 90fef7300c75ca6b8834da0dcb710e27d599324e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 25 Jan 2021 09:32:50 +0000 Subject: [PATCH 067/114] More OSS-Fuzz detected fixes Credit to OSS-Fuzz - Clean up properly when XPACK output values are out of the accepted range. Fixes oss-fuzz 29827 - Add "cp_end" arg to varint_get* funcs in cram_varint_decode_init so reading beyond the buffer can be spotted. Fixes oss-fuzz 29837 - Set b->uncomp_size for FQZ decoder. Already done in other codecs, just accidentally missed here. Without it, it invalidates the boundary check in int32_get_blk. Fixes oss-fuzz 29855 Fixes oss-fuzz 29908 - Fixed memory leak in XDELTA when recovering from malformed data stream. 
Fixes oss-fuzz 29858 --- cram/cram_codecs.c | 13 +++++++------ cram/cram_io.c | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 067319fc6..63c6ea730 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -745,7 +745,7 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; - char *cp = data; + char *cp = data, *cp_end = data+size; if (!(c = malloc(sizeof(*c)))) return NULL; @@ -775,8 +775,8 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, c->size = cram_varint_decode_size; c->get_block = cram_varint_get_block; - c->u.varint.content_id = vv->varint_get32 (&cp, NULL, NULL); - c->u.varint.offset = vv->varint_get64s(&cp, NULL, NULL); + c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL); + c->u.varint.offset = vv->varint_get64s(&cp, cp_end, NULL); if (cp - data != size) { fprintf(stderr, "Malformed varint header stream\n"); @@ -1414,7 +1414,8 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, int i; for (i = 0; i < c->u.xpack.nval; i++) { uint32_t v = vv->varint_get32(&cp, endp, NULL); - if (v >= 256) return NULL; + if (v >= 256) + goto malformed; c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K } @@ -1714,7 +1715,7 @@ cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, char *cp = data; char *endp = data+size; - if (!(c = malloc(sizeof(*c)))) + if (!(c = calloc(1, sizeof(*c)))) return NULL; c->codec = E_XDELTA; @@ -1749,7 +1750,7 @@ cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, if (cp - data != size) { malformed: fprintf(stderr, "Malformed xdelta header stream\n"); - free(c); + cram_xdelta_decode_free(c); return NULL; } diff --git a/cram/cram_io.c b/cram/cram_io.c index 30f109d6e..8d257accc 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1674,6 +1674,7 @@ int cram_uncompress_block(cram_block *b) { 
b->data = (unsigned char *)uncomp; b->alloc = uncomp_size; b->method = RAW; + b->uncomp_size = uncomp_size; break; } From 10815ba355e0fe386cbe63d72405a40d3675893d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 25 Jan 2021 17:36:10 +0000 Subject: [PATCH 068/114] Spell out `git clone --recurse-submodules` option [minor] Use the current canonical option rather than the --recursive alias; there doesn't appear to have ever been a -r short form for this. --- INSTALL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL b/INSTALL index 582f9b69e..42bffb85b 100644 --- a/INSTALL +++ b/INSTALL @@ -66,7 +66,7 @@ from a release tar file, you can skip this section. Some parts of HTSlib are provided by the external "htscodecs" project. This is included as a submodule. When building from the git repository, -either clone the project using "git clone -r", or run: +either clone the project using "git clone --recurse-submodules", or run: git submodule update --init --recursive From a64c6c8e4f4056c1f7dca3335d239fa392e2d1b3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 19 Jan 2021 17:17:52 +0000 Subject: [PATCH 069/114] Reject VCF/BCF records with no REF allele The specification says REF must be present, so reject reads that do not have one. Fixes a crash reported in bcf_sr_sort_set() which expects REF to be present. 
Fixes samtools/bcftools#1361 (bcftools merge segfault when REF and ALT are unset) --- vcf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vcf.c b/vcf.c index a1aa55525..e9868cadf 100644 --- a/vcf.c +++ b/vcf.c @@ -1368,6 +1368,12 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { ptr += bytes; // Check REF and ALT + if (rec->n_allele < 1) { + hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele", + bcf_seqname_safe(hdr,rec), rec->pos+1); + err |= BCF_ERR_TAG_UNDEF; + } + reports = 0; for (i = 0; i < rec->n_allele; i++) { if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared; From 42b365fb5ed155b963c07eb16b4538b7bd8daf55 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 26 Jan 2021 17:26:47 +0000 Subject: [PATCH 070/114] remove a strange gcc10 warning. Gcc 10 when using -O and -fsanitize=address produces a warning about mallocing some insanely large amount of memory. This is an impossibility given the limits on the sizes of some data values, but gcc doesn't know this and is assuming variables may wrap around and go negative. Using an unsigned type stops it from exploring the inaccessible data ranges.
--- cram/cram_codecs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 63c6ea730..e581ba3cf 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3067,9 +3067,10 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, enum cram_external_type option, void *dat, int version, varint_vec *vv) { - int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens = NULL, code, len; + int *vals = NULL, *freqs = NULL, *lens = NULL, code, len; int *new_vals, *new_freqs; - int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; + int i, ntot = 0, max_val = 0, min_val = INT_MAX, k; + size_t nvals, vals_alloc = 0; cram_codec *c; cram_huffman_code *codes; From 9d9e60c83487a8ac4644079e9636e34a9dd4bb08 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 27 Jan 2021 11:32:04 +0000 Subject: [PATCH 071/114] Update htscodecs to pull in more bug fixes Fixes various issues found by fuzzing. Credit to OSS-Fuzz - Removed undefined shift (malformed input data) Fixes oss-fuzz 29817 - Fix overflow in filling out sfb[] array Fixes oss-fuzz 29931 - Fix RLE with zero-byte meta-data block Fixes oss-fuzz 29939 - Fix rle_decode memory tidyup Fixes oss-fuzz 29935 - Fix undefined shift in tokenise_name3.c Fixes oss-fuzz 29956 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index e8d8597e7..144bda6ac 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit e8d8597e7fa7fd27a4cd5c9ff00ff97c37a8b456 +Subproject commit 144bda6ac96ea2d0535e7e305077b3fcaea7b5f0 From b8dcbd12b2c535fb792a5dfb275e875b626e929d Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 27 Jan 2021 13:50:04 +0000 Subject: [PATCH 072/114] Added missing quotes. Also converted to using ALL_CPPFLAGS from #1187. 
--- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 21f510579..ad1353f10 100644 --- a/Makefile +++ b/Makefile @@ -341,8 +341,7 @@ errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htsl kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) -hfile.o hfile.pico: TMP_CPPFLAGS := $(CPPFLAGS) -hfile.o hfile.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" +hfile.o hfile.pico: ALL_CPPFLAGS += -DHTS_CPPFLAGS="\"$(CPPFLAGS)\"" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) From 64808c2f898782cc50b690969265f01adeeb978d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 27 Jan 2021 17:21:10 +0000 Subject: [PATCH 073/114] Disable travis tests travis.org is closing down, and cirrus-ci works as a replacement. Unfortunately this removes the only big-endian platform (s390x). It will be substituted in the future if an alternative can be found, but meanwhile big-endian tests will need to be run manually. 
--- .travis.yml | 109 ---------------------------------------------------- README.md | 2 +- 2 files changed, 1 insertion(+), 110 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d85224133..000000000 --- a/.travis.yml +++ /dev/null @@ -1,109 +0,0 @@ -# Control file for continuous integration testing at http://travis-ci.org/ - -language: c - -matrix: - include: - - compiler: gcc - os: linux - env: DO_MAINTAINER_CHECKS=yes USE_CONFIG=no - - - compiler: gcc-8 - os: linux - env: USE_CONFIG=yes CC=gcc-8 AR=gcc-ar-8 - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-8 - - # An optimised build with address and leak checking, also using libdeflate - - compiler: gcc-8 - os: linux - dist: xenial - env: USE_CONFIG=yes USE_LIBDEFLATE=yes CC=gcc-8 AR=gcc-ar-8 CFLAGS="-g -Wall -O3 -fsanitize=address" LDFLAGS="-fsanitize=address" - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-8 - - - compiler: clang - os: osx - env: USE_CONFIG=no - - - compiler: clang - os: osx - env: USE_CONFIG=yes - - - compiler: clang - os: osx - env: USE_CONFIG=yes USE_LIBDEFLATE=yes - - - compiler: gcc - os: linux - env: USE_CONFIG=yes - - - compiler: clang - os: linux - env: USE_CONFIG=yes - - - compiler: gcc - os: linux - env: CFLAGS="-std=c99 -pedantic" USE_CONFIG=yes - - # Big-endian - - compiler: gcc - arch: s390x - os: linux - env: USE_CONFIG=yes - addons: - apt: - packages: - - libbz2-dev - - liblzma-dev - - -# For MacOSX systems -before_install: - - | - if [[ "$TRAVIS_OS_NAME" == "osx" && "$USE_CONFIG" == "no" ]]; then - HOMEBREW_NO_AUTO_UPDATE=1 brew install xz || ( brew update && brew install xz ) - fi - -before_script: - - | - if test "x$USE_LIBDEFLATE" == "xyes"; then - pushd "$HOME" && \ - git clone --depth 1 https://github.com/ebiggers/libdeflate.git && \ - pushd libdeflate && \ - make -j 2 CFLAGS='-fPIC -O3' libdeflate.a && \ - popd && \ - popd - fi - git submodule update 
--init --recursive - -script: - - | - if test "x$USE_LIBDEFLATE" = "xyes"; then - CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' - else - CONFIG_OPTS='--without-libdeflate' - fi - - | - if test "$USE_CONFIG" = "yes"; then - MAKE_OPTS= ; - autoreconf && \ - eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ - ( cat config.log; false ) - else - MAKE_OPTS=-e - fi && \ - if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then - make maintainer-check - fi && \ - make -j 2 $MAKE_OPTS && \ - make test-shlib-exports && \ - make test diff --git a/README.md b/README.md index 1b01d7271..e28f4a415 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Build Status](https://travis-ci.org/samtools/htslib.svg?branch=develop)](https://travis-ci.org/samtools/htslib) +[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/htslib) [![Build status](https://ci.appveyor.com/api/projects/status/v46hkwyfjp3l8nd3/branch/develop?svg=true)](https://ci.appveyor.com/project/samtools/htslib/branch/develop) [![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib) From 8510a0d2a5ac878f1a4b9a6683580bcdfa9196d3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 28 Jan 2021 17:06:25 +0000 Subject: [PATCH 074/114] Fix more OSS fuzz reported issues. Credit to OSS-Fuzz - Add bounds check in cram_const_decode_init. I've checked and this is the only use of the varint decoders that didn't have a bounds check. Suitably sniffed out by the fuzzer. Good fuzzer! Fixes oss-fuzz 30012 - Free the sub-encoding when initialising XRLE encoding fails due to malformed streams. This removes a tiny memory leak. 
Fixes oss-fuzz 30014 --- cram/cram_codecs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index e581ba3cf..ab46fc293 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -964,7 +964,7 @@ cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, c->size = cram_const_decode_size; c->get_block = NULL; - c->u.xconst.val = vv->varint_get64s(&cp, NULL, NULL); + c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL); if (cp - data != size) { fprintf(stderr, "Malformed const header stream\n"); @@ -2118,7 +2118,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, char *endp = data+size; int err = 0; - if (!(c = malloc(sizeof(*c)))) + if (!(c = calloc(1, sizeof(*c)))) return NULL; c->codec = E_XRLE; @@ -2175,7 +2175,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, malformed: fprintf(stderr, "Malformed xrle header stream\n"); - free(c); + cram_xrle_decode_free(c); return NULL; } From a6e89c59e976023afb66536e5162a697d298308c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 09:07:25 +0000 Subject: [PATCH 075/114] Pull in htscodecs fuzzing bug fixes Credit to OSS-Fuzz - Fix undefined shifts Fixes oss-fuzz 29995 Fixes oss-fuzz 30017 - Protect against no tokens present in tok_name3 codec Fixes oss-fuzz 30008 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 144bda6ac..cf0e9611d 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 144bda6ac96ea2d0535e7e305077b3fcaea7b5f0 +Subproject commit cf0e9611dc88e351148a19568c858101ed31d0fa From c56cfaa31ba6a15a226ec6e3e0b25eea53b255f8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 1 Feb 2021 09:40:49 +0000 Subject: [PATCH 076/114] Remove small memory leak when decoding malformed slice header. This is an old bug, but only recently picked up by the fuzzer.
Credit to OSS-Fuzz Fixes oss-fuzz 30105 --- cram/cram_decode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 41203ec7f..26c9c592b 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1032,6 +1032,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { if (!err) return hdr; + free(hdr->block_content_ids); free(hdr); return NULL; } From 53d7277ad2feb71e2e65f6c71107d854e89c7440 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 2 Feb 2021 09:27:50 +0000 Subject: [PATCH 077/114] Remove small memory leak when decoding malformed XPACK encoding. As with recent changes to XPACK and XRLE, it now ensures the sub-codec is also properly freed too. Credit to OSS-Fuzz Fixes oss-fuzz 30164 --- cram/cram_codecs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index ab46fc293..1a357fdbe 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1388,7 +1388,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, char *cp = data; char *endp = data+size; - if (!(c = malloc(sizeof(*c)))) + if (!(c = calloc(1, sizeof(*c)))) return NULL; c->codec = E_XPACK; @@ -1433,7 +1433,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) { malformed: fprintf(stderr, "Malformed xpack header stream\n"); - free(c); + cram_xpack_decode_free(c); return NULL; } From 6568f13bf7b36f280a8f0e4777fe05c85a9dcf06 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 2 Feb 2021 09:56:55 +0000 Subject: [PATCH 078/114] Pull in htscodecs changes and update Makefile - Fix an overflow bug in the unstripe function Credit to OSS-Fuzz Fixes oss-fuzz 30087 - Move some duplicated code to a new htscodecs/utils.h header - Updates htslib Makefile infrastructure for the new htscodecs header --- Makefile | 6 +++--- htscodecs | 2 +- htscodecs_bundled.mk | 8 ++++++-- 
htscodecs_external.mk | 5 ++++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index ad1353f10..ef65eb68d 100644 --- a/Makefile +++ b/Makefile @@ -380,11 +380,11 @@ cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/po cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) -htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htsodecs_utils_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) -htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) -htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static.o 
htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h) htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) diff --git a/htscodecs b/htscodecs index cf0e9611d..0ffb50be0 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit cf0e9611dc88e351148a19568c858101ed31d0fa +Subproject commit 0ffb50be07eab4ff40c8e6b7dc346266c6a23421 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 328a86b0c..8b41f76b4 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -8,6 +8,7 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) +# htscodecs public headers htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h htscodecs_pack_h = htscodecs/htscodecs/pack.h @@ -17,11 +18,14 @@ htscodecs_rle_h = htscodecs/htscodecs/rle.h htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h htscodecs_varint_h = htscodecs/htscodecs/varint.h -htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h -htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h +# htscodecs internal headers +htscodecs_htscodecs_endian_h = htscodecs/htscodecs/htscodecs_endian.h htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h +htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h 
$(htscodecs_htscodecs_endian_h) +htscodecs_utils_h = htscodecs/htscodecs/utils.h # Add htscodecs tests into the HTSlib test framework diff --git a/htscodecs_external.mk b/htscodecs_external.mk index 7ee47402c..7ac6944e9 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -11,7 +11,10 @@ htscodecs_rle_h = htscodecs_tokenise_name3_h = htscodecs_varint_h = -htscodecs_rANS_byte_h = +htscodecs_htscodecs_endian_h = htscodecs_c_range_coder_h = htscodecs_c_simple_model_h = htscodecs_pooled_alloc_h = +htscodecs_rANS_byte_h = +htscodecs_rANS_word_h = +htscodecs_utils_h = From 6ad5626c8e01e4eac71360dd9303d33f4c6267d2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 2 Feb 2021 11:40:46 +0000 Subject: [PATCH 079/114] Add / update copyright boilerplate --- Makefile | 2 +- htscodecs_bundled.mk | 25 +++++++++++++++++++++++++ htscodecs_external.mk | 24 ++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ef65eb68d..0f2fddb47 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2020 Genome Research Ltd. +# Copyright (C) 2013-2021 Genome Research Ltd. # # Author: John Marshall # diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 8b41f76b4..de4d5db8d 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -1,3 +1,28 @@ +# Makefile fragment to add settings needed when bundling htscodecs functions +# +# Copyright (C) 2021 Genome Research Ltd. 
+# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/fqzcomp_qual.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ diff --git a/htscodecs_external.mk b/htscodecs_external.mk index 7ac6944e9..ce24dd6f5 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -1,3 +1,27 @@ +# Makefile fragment for use when linking to an external libhtscodecs +# +# Copyright (C) 2021 Genome Research Ltd. 
+# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + HTSCODECS_SOURCES = HTSCODECS_OBJS = HTSCODECS_TEST_TARGETS = From 2d1e9549f45881a014a387dd2c3287508a863ff9 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 3 Feb 2021 10:22:26 +0000 Subject: [PATCH 080/114] Updated copyright messages prior to release.
--- bgzip.c | 2 +- configure.ac | 4 ++-- cram/cram_codecs.c | 2 +- cram/cram_external.c | 2 +- cram/cram_io.c | 2 +- cram/cram_samtools.h | 2 +- hfile.c | 2 +- hts_os.c | 2 +- htsfile.c | 2 +- htslib/cram.h | 2 +- htslib/hfile.h | 2 +- htslib/hts_os.h | 2 +- htslib/knetfile.h | 2 +- htslib/synced_bcf_reader.h | 2 +- htslib_vars.mk | 2 +- kfunc.c | 2 +- probaln.c | 2 +- sam.c | 2 +- tabix.c | 2 +- test/fuzz/hts_open_fuzzer.c | 2 +- test/test.pl | 2 +- test/test_view.c | 2 +- 22 files changed, 23 insertions(+), 23 deletions(-) diff --git a/bgzip.c b/bgzip.c index 097e3f80c..22eb0404a 100644 --- a/bgzip.c +++ b/bgzip.c @@ -171,7 +171,7 @@ int main(int argc, char **argv) case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); diff --git a/configure.ac b/configure.ac index 6b76c00a1..d79947ae4 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2020 Genome Research Ltd. +# Copyright (C) 2015-2021 Genome Research Ltd. # # Author: John Marshall # @@ -34,7 +34,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2020 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020-2021 Genome Research Ltd. This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 1a357fdbe..55030c814 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2021 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_external.c b/cram/cram_external.c index 88175103d..314826932 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2019 Genome Research Ltd. +Copyright (c) 2015, 2018-2020 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_io.c b/cram/cram_io.c index 8d257accc..3a6b04c42 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2021 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index 34c1db40e..a4c9bf5cc 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2010-2013, 2018 Genome Research Ltd. +Copyright (c) 2010-2013, 2018, 2020 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/hfile.c b/hfile.c index 01bace09d..dba8b6f6c 100644 --- a/hfile.c +++ b/hfile.c @@ -1,6 +1,6 @@ /* hfile.c -- buffered low-level input/output streams. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: John Marshall diff --git a/hts_os.c b/hts_os.c index c26700975..b391a41e0 100644 --- a/hts_os.c +++ b/hts_os.c @@ -1,7 +1,7 @@ /// @file hts_os.c /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019-2020 Genome Research Ltd. 
Author: James Bonfield diff --git a/htsfile.c b/htsfile.c index e37eb7215..d6d6b4e69 100644 --- a/htsfile.c +++ b/htsfile.c @@ -258,7 +258,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", +"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; diff --git a/htslib/cram.h b/htslib/cram.h index 607cf8bcf..dab666345 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2019 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020 Genome Research Ltd. Author: James Bonfield diff --git a/htslib/hfile.h b/htslib/hfile.h index e0aea9008..829e3b12b 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -1,7 +1,7 @@ /// @file htslib/hfile.h /// Buffered low-level input/output streams. /* - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2020 Genome Research Ltd. Author: John Marshall diff --git a/htslib/hts_os.h b/htslib/hts_os.h index 3461df512..b71cb89e7 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -1,7 +1,7 @@ /// @file hts_os.h /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019-2020 Genome Research Ltd. Author: James Bonfield diff --git a/htslib/knetfile.h b/htslib/knetfile.h index 598ed379a..da9cdc5e8 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2012, 2014 Genome Research Ltd (GRL). + Copyright (c) 2008, 2012, 2014, 2021 Genome Research Ltd (GRL). 
2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 8d3554445..76f889b30 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2020 Genome Research Ltd. Author: Petr Danecek diff --git a/htslib_vars.mk b/htslib_vars.mk index 85835f872..1f4c0905a 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -1,6 +1,6 @@ # Makefile variables useful for third-party code using htslib's public API. # -# Copyright (C) 2013-2017, 2019 Genome Research Ltd. +# Copyright (C) 2013-2017, 2019-2020 Genome Research Ltd. # # Author: John Marshall # diff --git a/kfunc.c b/kfunc.c index b7d7d521a..bf15cdf33 100644 --- a/kfunc.c +++ b/kfunc.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2010, 2013-2014 Genome Research Ltd. + Copyright (C) 2010, 2013-2014, 2020 Genome Research Ltd. Copyright (C) 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining diff --git a/probaln.c b/probaln.c index 73df1a8f1..192f4b751 100644 --- a/probaln.c +++ b/probaln.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2003-2006, 2008-2010 by Heng Li - Copyright (C) 2016-2017 Genome Research Ltd. + Copyright (C) 2016-2017, 2020 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/sam.c b/sam.c index 884eecd6a..8bda92384 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2020 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2021 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. 
Author: Heng Li diff --git a/tabix.c b/tabix.c index 36c414798..1471b09c5 100644 --- a/tabix.c +++ b/tabix.c @@ -581,7 +581,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: return usage(stdout, EXIT_SUCCESS); diff --git a/test/fuzz/hts_open_fuzzer.c b/test/fuzz/hts_open_fuzzer.c index 5c239d4f1..355f790a5 100644 --- a/test/fuzz/hts_open_fuzzer.c +++ b/test/fuzz/hts_open_fuzzer.c @@ -1,7 +1,7 @@ /* test/fuzz/hts_open_fuzzer.c -- Fuzz driver for hts_open. Copyright (C) 2018 Google LLC. - Copyright (C) 2019 Genome Research Ltd. + Copyright (C) 2019-2020 Genome Research Ltd. Author: Markus Kusano diff --git a/test/test.pl b/test/test.pl index 8def186d9..a33ead381 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2019 Genome Research Ltd. +# Copyright (C) 2012-2020 Genome Research Ltd. # # Author: Petr Danecek # diff --git a/test/test_view.c b/test/test_view.c index 30e604610..dc644610b 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -1,7 +1,7 @@ /* test/test_view.c -- simple view tool, purely for use in a test harness. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2020 Genome Research Ltd. Author: Heng Li From ab75b9ed97e397ec351a769a2a6d303fce357386 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 21 Jan 2021 18:10:43 +0000 Subject: [PATCH 081/114] Move non-plugin introspection API to htslib/hts.h, hts.c It'll be convenient for API users to find the HTS_FEATURE_* functionality in htslib/hts.h alongside hts_version() etc. Fix the missing quote marks for -DHTS_CPPFLAGS by recoding to implement this via a new generated config_vars.h header instead. 
Similarly to the version.h and config.h rules, this avoids the fragile quoting needed to add possibly-whitespace-containing string values via -D options. Ignore config_vars.h and some recently-added test executables. --- .gitignore | 3 ++ Makefile | 12 +++-- hfile.c | 140 ------------------------------------------------- hts.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++ htslib/hfile.h | 39 -------------- htslib/hts.h | 40 ++++++++++++++ 6 files changed, 191 insertions(+), 182 deletions(-) diff --git a/.gitignore b/.gitignore index 0855b6a88..16a6b288e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.dll.a *.pc.tmp *-uninstalled.pc +config_vars.h /version.h autom4te.cache @@ -55,7 +56,9 @@ shlib-exports-*.txt /test/test-bcf-sr /test/test-bcf-translate /test/test_bgzf +/test/test_expr /test/test_index +/test/test_introspection /test/test_kfunc /test/test_kstring /test/test-parse-reg diff --git a/Makefile b/Makefile index 0f2fddb47..ce256a629 100644 --- a/Makefile +++ b/Makefile @@ -141,6 +141,13 @@ show-version: @echo PACKAGE_VERSION = $(PACKAGE_VERSION) @echo NUMERIC_VERSION = $(NUMERIC_VERSION) +config_vars.h: + echo '#define HTS_CC "$(CC)"' > $@ + echo '#define HTS_CPPFLAGS "$(CPPFLAGS)"' >> $@ + echo '#define HTS_CFLAGS "$(CFLAGS)"' >> $@ + echo '#define HTS_LDFLAGS "$(LDFLAGS)"' >> $@ + echo '#define HTS_LIBS "$(LIBS)"' >> $@ + .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: @@ -341,12 +348,11 @@ errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htsl kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) -hfile.o hfile.pico: ALL_CPPFLAGS += -DHTS_CPPFLAGS="\"$(CPPFLAGS)\"" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" hfile_gcs.o hfile_gcs.pico: hfile_gcs.c 
config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) @@ -694,7 +700,7 @@ testclean: -rm -rf htscodecs/tests/test.out mostlyclean: testclean - -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h + -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM config_vars.h version.h -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico -rm -f hts-object-files -rm -f htscodecs/tests/*.o diff --git a/hfile.c b/hfile.c index dba8b6f6c..2799c89dc 100644 --- a/hfile.c +++ b/hfile.c @@ -1250,146 +1250,6 @@ int htslib_has_plugin(const char *name) return 0; } -HTSLIB_EXPORT -unsigned int 
htslib_features(void) { - unsigned int feat = 0; - -#ifdef PACKAGE_URL - feat |= HTS_FEATURE_CONFIGURE; -#endif - -#ifdef ENABLE_PLUGINS - feat |= HTS_FEATURE_PLUGINS; -#endif - -#ifdef HAVE_LIBCURL - feat |= HTS_FEATURE_LIBCURL; -#endif - -#ifdef ENABLE_S3 - feat |= HTS_FEATURE_S3; -#endif - -#ifdef ENABLE_GCS - feat |= HTS_FEATURE_GCS; -#endif - -#ifdef HAVE_LIBDEFLATE - feat |= HTS_FEATURE_LIBDEFLATE; -#endif - -#ifdef HAVE_LIBLZMA - feat |= HTS_FEATURE_LZMA; -#endif - -#ifdef HAVE_LIBBZ2 - feat |= HTS_FEATURE_BZIP2; -#endif - - return feat; -} - -HTSLIB_EXPORT -const char *htslib_test_feature(int id) { - int feat = htslib_features(); - - switch (id) { - case HTS_FEATURE_CONFIGURE: - return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; - case HTS_FEATURE_PLUGINS: - return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; - case HTS_FEATURE_LIBCURL: - return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; - case HTS_FEATURE_S3: - return feat & HTS_FEATURE_S3 ? "yes" : NULL; - case HTS_FEATURE_GCS: - return feat & HTS_FEATURE_GCS ? "yes" : NULL; - case HTS_FEATURE_LIBDEFLATE: - return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; - case HTS_FEATURE_BZIP2: - return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; - case HTS_FEATURE_LZMA: - return feat & HTS_FEATURE_LZMA ? "yes" : NULL; - - case HTS_FEATURE_CC: - return HTS_CC; - case HTS_FEATURE_CFLAGS: - return HTS_CFLAGS; - case HTS_FEATURE_LDFLAGS: - return HTS_LDFLAGS; - case HTS_FEATURE_CPPFLAGS: - return HTS_CPPFLAGS; - - default: - fprintf(stderr, "Unknown feature code: %d\n", id); - } - - return NULL; -} - -// Note this implementation also means we can just "strings" the library -// to find the configuration parameters. 
-HTSLIB_EXPORT -const char *htslib_feature_string(void) { - const char *fmt= - -#ifdef PACKAGE_URL - "build=configure " -#else - "build=Makefile " -#endif - -#ifdef ENABLE_PLUGINS - "plugins=yes, plugin-path=%.1000s " -#else - "plugins=no " -#endif - -#ifdef HAVE_LIBCURL - "libcurl=yes " -#else - "libcurl=no " -#endif - -#ifdef ENABLE_S3 - "S3=yes " -#else - "S3=no " -#endif - -#ifdef ENABLE_GCS - "GCS=yes " -#else - "GCS=no " -#endif - -#ifdef HAVE_LIBDEFLATE - "libdeflate=yes " -#else - "libdeflate=no " -#endif - -#ifdef HAVE_LIBLZMA - "lzma=yes " -#else - "lzma=no " -#endif - -#ifdef HAVE_LIBBZ2 - "bzip2=yes "; -#else - "bzip2=no "; -#endif - -#ifdef ENABLE_PLUGINS - static char config[1200]; - sprintf(config, fmt, htslib_plugin_path()); - return config; -#else - return fmt; -#endif -} - /*************************** * hFILE interface proper ***************************/ diff --git a/hts.c b/hts.c index a59431f56..cb0a07cfb 100644 --- a/hts.c +++ b/hts.c @@ -47,6 +47,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hfile.h" #include "htslib/hts_endian.h" #include "version.h" +#include "config_vars.h" #include "hts_internal.h" #include "hfile_internal.h" #include "sam_internal.h" @@ -72,6 +73,144 @@ const char *hts_version() return HTS_VERSION_TEXT; } +unsigned int htslib_features(void) { + unsigned int feat = 0; + +#ifdef PACKAGE_URL + feat |= HTS_FEATURE_CONFIGURE; +#endif + +#ifdef ENABLE_PLUGINS + feat |= HTS_FEATURE_PLUGINS; +#endif + +#ifdef HAVE_LIBCURL + feat |= HTS_FEATURE_LIBCURL; +#endif + +#ifdef ENABLE_S3 + feat |= HTS_FEATURE_S3; +#endif + +#ifdef ENABLE_GCS + feat |= HTS_FEATURE_GCS; +#endif + +#ifdef HAVE_LIBDEFLATE + feat |= HTS_FEATURE_LIBDEFLATE; +#endif + +#ifdef HAVE_LIBLZMA + feat |= HTS_FEATURE_LZMA; +#endif + +#ifdef HAVE_LIBBZ2 + feat |= HTS_FEATURE_BZIP2; +#endif + + return feat; +} + +const char *htslib_test_feature(int id) { + int feat = htslib_features(); + + switch (id) { + case HTS_FEATURE_CONFIGURE: + return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; + case HTS_FEATURE_PLUGINS: + return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; + case HTS_FEATURE_LIBCURL: + return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; + case HTS_FEATURE_S3: + return feat & HTS_FEATURE_S3 ? "yes" : NULL; + case HTS_FEATURE_GCS: + return feat & HTS_FEATURE_GCS ? "yes" : NULL; + case HTS_FEATURE_LIBDEFLATE: + return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; + case HTS_FEATURE_BZIP2: + return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; + case HTS_FEATURE_LZMA: + return feat & HTS_FEATURE_LZMA ? "yes" : NULL; + + case HTS_FEATURE_CC: + return HTS_CC; + case HTS_FEATURE_CFLAGS: + return HTS_CFLAGS; + case HTS_FEATURE_LDFLAGS: + return HTS_LDFLAGS; + case HTS_FEATURE_CPPFLAGS: + return HTS_CPPFLAGS; + + default: + fprintf(stderr, "Unknown feature code: %d\n", id); + } + + return NULL; +} + +// Note this implementation also means we can just "strings" the library +// to find the configuration parameters. 
+const char *htslib_feature_string(void) { + const char *fmt= + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef ENABLE_PLUGINS + "plugins=yes, plugin-path=%.1000s " +#else + "plugins=no " +#endif + +#ifdef HAVE_LIBCURL + "libcurl=yes " +#else + "libcurl=no " +#endif + +#ifdef ENABLE_S3 + "S3=yes " +#else + "S3=no " +#endif + +#ifdef ENABLE_GCS + "GCS=yes " +#else + "GCS=no " +#endif + +#ifdef HAVE_LIBDEFLATE + "libdeflate=yes " +#else + "libdeflate=no " +#endif + +#ifdef HAVE_LIBLZMA + "lzma=yes " +#else + "lzma=no " +#endif + +#ifdef HAVE_LIBBZ2 + "bzip2=yes "; +#else + "bzip2=no "; +#endif + +#ifdef ENABLE_PLUGINS + static char config[1200]; + sprintf(config, fmt, htslib_plugin_path()); + return config; +#else + return fmt; +#endif +} + + HTSLIB_EXPORT const unsigned char seq_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, diff --git a/htslib/hfile.h b/htslib/hfile.h index 829e3b12b..0a54a7d8f 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -364,45 +364,6 @@ int hts_list_plugins(const char *plist[], int *nplugins); HTSLIB_EXPORT int htslib_has_plugin(const char *name); -/*! @abstract Introspection on the features enabled in htslib - * - * @return a bitfield of HTS_FEATURE_* macros. - */ -HTSLIB_EXPORT -unsigned int htslib_features(void); - -HTSLIB_EXPORT -const char *htslib_test_feature(int id); - -/*! 
@abstract Introspection on the features enabled in htslib, string form - * - * @return a string describing htslib build features - */ -HTSLIB_EXPORT -const char *htslib_feature_string(void); - -// Whether ./configure was used or vanilla Makefile -#define HTS_FEATURE_CONFIGURE 1 - -// Also see htslib_plugin_path function -#define HTS_FEATURE_PLUGINS 2 - -// Transport specific -#define HTS_FEATURE_LIBCURL 4 -#define HTS_FEATURE_S3 8 -#define HTS_FEATURE_GCS 16 - -// Compression options -#define HTS_FEATURE_LIBDEFLATE 32 -#define HTS_FEATURE_LZMA 64 -#define HTS_FEATURE_BZIP2 128 - -// Build params -#define HTS_FEATURE_CC (1<<28) -#define HTS_FEATURE_CFLAGS (1<<29) -#define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1u<<31) - #ifdef __cplusplus } #endif diff --git a/htslib/hts.h b/htslib/hts.h index 0f2fdd021..65315d16d 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -451,6 +451,46 @@ const char *hts_version(void); // further when significant features are merged. #define HTS_VERSION 101190 +/*! @abstract Introspection on the features enabled in htslib + * + * @return a bitfield of HTS_FEATURE_* macros. + */ +HTSLIB_EXPORT +unsigned int htslib_features(void); + +HTSLIB_EXPORT +const char *htslib_test_feature(int id); + +/*! 
@abstract Introspection on the features enabled in htslib, string form + * + * @return a string describing htslib build features + */ +HTSLIB_EXPORT +const char *htslib_feature_string(void); + +// Whether ./configure was used or vanilla Makefile +#define HTS_FEATURE_CONFIGURE 1 + +// Also see htslib_plugin_path function +#define HTS_FEATURE_PLUGINS 2 + +// Transport specific +#define HTS_FEATURE_LIBCURL 4 +#define HTS_FEATURE_S3 8 +#define HTS_FEATURE_GCS 16 + +// Compression options +#define HTS_FEATURE_LIBDEFLATE 32 +#define HTS_FEATURE_LZMA 64 +#define HTS_FEATURE_BZIP2 128 + +// Build params +#define HTS_FEATURE_CC (1<<28) +#define HTS_FEATURE_CFLAGS (1<<29) +#define HTS_FEATURE_LDFLAGS (1<<30) +#define HTS_FEATURE_CPPFLAGS (1u<<31) + + /*! @abstract Determine format by peeking at the start of a file @param fp File opened for reading, positioned at the beginning From 89db11228c6ae6503ff75f930d9b3fda86fc34d1 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 21 Jan 2021 18:34:12 +0000 Subject: [PATCH 082/114] Renumber introspection feature codes (and they are unsigned) Leave gaps in the allocated bits between the different feature categories. Leave a gap at the end for HTS_FEATURE_LIBS if it is added in future. Don't mention htslib_plugin_path() in the public header as it is an internal function. Change htslib_test_feature() to use unsigned int, as per htslib_features(). 
--- hts.c | 6 +++--- htslib/hts.h | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/hts.c b/hts.c index cb0a07cfb..68b166892 100644 --- a/hts.c +++ b/hts.c @@ -111,8 +111,8 @@ unsigned int htslib_features(void) { return feat; } -const char *htslib_test_feature(int id) { - int feat = htslib_features(); +const char *htslib_test_feature(unsigned int id) { + unsigned int feat = htslib_features(); switch (id) { case HTS_FEATURE_CONFIGURE: @@ -142,7 +142,7 @@ const char *htslib_test_feature(int id) { return HTS_CPPFLAGS; default: - fprintf(stderr, "Unknown feature code: %d\n", id); + fprintf(stderr, "Unknown feature code: %u\n", id); } return NULL; diff --git a/htslib/hts.h b/htslib/hts.h index 65315d16d..6337306b8 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -459,7 +459,7 @@ HTSLIB_EXPORT unsigned int htslib_features(void); HTSLIB_EXPORT -const char *htslib_test_feature(int id); +const char *htslib_test_feature(unsigned int id); /*! @abstract Introspection on the features enabled in htslib, string form * @@ -471,24 +471,24 @@ const char *htslib_feature_string(void); // Whether ./configure was used or vanilla Makefile #define HTS_FEATURE_CONFIGURE 1 -// Also see htslib_plugin_path function +// Whether --enable-plugins was used #define HTS_FEATURE_PLUGINS 2 // Transport specific -#define HTS_FEATURE_LIBCURL 4 -#define HTS_FEATURE_S3 8 -#define HTS_FEATURE_GCS 16 +#define HTS_FEATURE_LIBCURL (1u<<10) +#define HTS_FEATURE_S3 (1u<<11) +#define HTS_FEATURE_GCS (1u<<12) // Compression options -#define HTS_FEATURE_LIBDEFLATE 32 -#define HTS_FEATURE_LZMA 64 -#define HTS_FEATURE_BZIP2 128 +#define HTS_FEATURE_LIBDEFLATE (1u<<20) +#define HTS_FEATURE_LZMA (1u<<21) +#define HTS_FEATURE_BZIP2 (1u<<22) // Build params -#define HTS_FEATURE_CC (1<<28) -#define HTS_FEATURE_CFLAGS (1<<29) -#define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1u<<31) +#define HTS_FEATURE_CC (1u<<27) +#define HTS_FEATURE_CFLAGS (1u<<28) +#define 
HTS_FEATURE_CPPFLAGS (1u<<29) +#define HTS_FEATURE_LDFLAGS (1u<<30) /*! From 1683c53aece32424248f2a23074aee90ab66f911 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 3 Feb 2021 11:32:44 +0000 Subject: [PATCH 083/114] Rename hFILE plugin introspection functions to hfile_* The hFILE plugin mechanism is the only current plugin endpoint in HTSlib, but the facility is general so that there could be other plugin endpoints added in future, e.g., for other compression methods. Rename these introspection functions as they pertain to hFILE plugins only. --- hfile.c | 12 ++++++------ htslib/hfile.h | 12 ++++++------ test/test_introspection.c | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/hfile.c b/hfile.c index 2799c89dc..708957863 100644 --- a/hfile.c +++ b/hfile.c @@ -1148,7 +1148,7 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) ***************************/ /* - * Fills out sc_list[] with the list of known schemes. + * Fills out sc_list[] with the list of known URL schemes. * This can be restricted to just ones from a specific plugin, * or all (plugin == NULL). * @@ -1156,7 +1156,7 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) * -1 on failure. */ HTSLIB_EXPORT -int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) +int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) { pthread_mutex_lock(&plugins_lock); if (!schemes && load_hfile_plugins() < 0) { @@ -1189,13 +1189,13 @@ int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) /* - * Fills out plist[] with the list of known plugins. + * Fills out plist[] with the list of known hFILE plugins. 
* * Returns number of schemes found on success; * -1 on failure */ HTSLIB_EXPORT -int hts_list_plugins(const char *plist[], int *nplugins) +int hfile_list_plugins(const char *plist[], int *nplugins) { pthread_mutex_lock(&plugins_lock); if (!schemes && load_hfile_plugins() < 0) { @@ -1225,13 +1225,13 @@ int hts_list_plugins(const char *plist[], int *nplugins) /* - * Tests for the presence of a specific plugin. + * Tests for the presence of a specific hFILE plugin. * * Returns 1 if true * 0 otherwise */ HTSLIB_EXPORT -int htslib_has_plugin(const char *name) +int hfile_has_plugin(const char *name) { pthread_mutex_lock(&plugins_lock); if (!schemes && load_hfile_plugins() < 0) { diff --git a/htslib/hfile.h b/htslib/hfile.h index 0a54a7d8f..55cae244e 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -316,7 +316,7 @@ purpose other than closing. HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); -/// Fills out sc_list[] with the list of known schemes. +/// Fills out sc_list[] with the list of known URL schemes. /** * @param plugin [in] Restricts schemes to only those from 'plugin. * @param sc_list [out] Filled out with the scheme names @@ -335,9 +335,9 @@ char *hfile_mem_steal_buffer(hFILE *file, size_t *length); * -1 on failure */ HTSLIB_EXPORT -int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); +int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); -/// Fills out plist[] with the list of known plugins. +/// Fills out plist[] with the list of known hFILE plugins. /* * @param plist [out] Filled out with the plugin names * @param nplugins [in/out] Size of plist (in) and number returned (out) @@ -353,16 +353,16 @@ int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); * -1 on failure */ HTSLIB_EXPORT -int hts_list_plugins(const char *plist[], int *nplugins); +int hfile_list_plugins(const char *plist[], int *nplugins); -/// Tests for the presence of a specific plugin. 
+/// Tests for the presence of a specific hFILE plugin. /* * @param name The name of the plugin to query. * * @return 1 if found, 0 otherwise. */ HTSLIB_EXPORT -int htslib_has_plugin(const char *name); +int hfile_has_plugin(const char *name); #ifdef __cplusplus } diff --git a/test/test_introspection.c b/test/test_introspection.c index 9ee7d1aa3..0958a962f 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -64,13 +64,13 @@ int main(void) { const char *plugins[100]; int np = 100, i, j; - if (hts_list_plugins(plugins, &np) < 0) + if (hfile_list_plugins(plugins, &np) < 0) return 1; for (i = 0; i < np; i++) { const char *sc_list[100]; int nschemes = 100; - if (hts_list_schemes(plugins[i], sc_list, &nschemes) < 0) + if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0) return 1; printf(" %s:\n", plugins[i]); From d9890a964368e2c300ae3d4258603df3fe534256 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 Feb 2021 16:33:40 +0000 Subject: [PATCH 084/114] Rename htslib_* introspection functions to hts_*. This maintains consistency with the existing hts_version() function. Note this also renames the internal htslib_plugin_path function too, again simply for consistency. Note it queries the HTS_PATH and not HTSLIB_PATH environment variable, so the new name is better.
--- hts.c | 10 +++++----- hts_internal.h | 2 +- htslib/hts.h | 6 +++--- plugin.c | 2 +- test/test_introspection.c | 12 ++++++------ 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/hts.c b/hts.c index 68b166892..57ee4d1d6 100644 --- a/hts.c +++ b/hts.c @@ -73,7 +73,7 @@ const char *hts_version() return HTS_VERSION_TEXT; } -unsigned int htslib_features(void) { +unsigned int hts_features(void) { unsigned int feat = 0; #ifdef PACKAGE_URL @@ -111,8 +111,8 @@ unsigned int htslib_features(void) { return feat; } -const char *htslib_test_feature(unsigned int id) { - unsigned int feat = htslib_features(); +const char *hts_test_feature(unsigned int id) { + unsigned int feat = hts_features(); switch (id) { case HTS_FEATURE_CONFIGURE: @@ -150,7 +150,7 @@ const char *htslib_test_feature(unsigned int id) { // Note this implementation also means we can just "strings" the library // to find the configuration parameters. -const char *htslib_feature_string(void) { +const char *hts_feature_string(void) { const char *fmt= #ifdef PACKAGE_URL @@ -203,7 +203,7 @@ const char *htslib_feature_string(void) { #ifdef ENABLE_PLUGINS static char config[1200]; - sprintf(config, fmt, htslib_plugin_path()); + sprintf(config, fmt, hts_plugin_path()); return config; #else return fmt; diff --git a/hts_internal.h b/hts_internal.h index 5ee88b7d0..602348618 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -104,7 +104,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * void *plugin_sym(void *plugin, const char *name, const char **errmsg); plugin_void_func *plugin_func(void *plugin, const char *name, const char **errmsg); void close_plugin(void *plugin); -const char *htslib_plugin_path(void); +const char *hts_plugin_path(void); /* * Buffers up arguments to hts_idx_push for later use, once we've written all bar diff --git a/htslib/hts.h b/htslib/hts.h index 6337306b8..9488a9f9b 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -456,17 +456,17 @@ const char 
*hts_version(void); * @return a bitfield of HTS_FEATURE_* macros. */ HTSLIB_EXPORT -unsigned int htslib_features(void); +unsigned int hts_features(void); HTSLIB_EXPORT -const char *htslib_test_feature(unsigned int id); +const char *hts_test_feature(unsigned int id); /*! @abstract Introspection on the features enabled in htslib, string form * * @return a string describing htslib build features */ HTSLIB_EXPORT -const char *htslib_feature_string(void); +const char *hts_feature_string(void); // Whether ./configure was used or vanilla Makefile #define HTS_FEATURE_CONFIGURE 1 diff --git a/plugin.c b/plugin.c index fccc83793..cec5beefd 100644 --- a/plugin.c +++ b/plugin.c @@ -192,7 +192,7 @@ void close_plugin(void *plugin) } } -const char *htslib_plugin_path(void) { +const char *hts_plugin_path(void) { #ifdef ENABLE_PLUGINS char *path = getenv("HTS_PATH"); if (!path) path = ""; diff --git a/test/test_introspection.c b/test/test_introspection.c index 0958a962f..cc8ceb6e7 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -32,12 +32,12 @@ int main(void) { printf("Version string: %s\n", hts_version()); printf("Version number: %d\n", HTS_VERSION); - printf("\nCC: %s\n", htslib_test_feature(HTS_FEATURE_CC)); - printf("CPPFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CPPFLAGS)); - printf("CFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CFLAGS)); - printf("LDFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_LDFLAGS)); + printf("\nCC: %s\n", hts_test_feature(HTS_FEATURE_CC)); + printf("CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); + printf("CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS)); + printf("LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS)); - unsigned int feat = htslib_features(); + unsigned int feat = hts_features(); printf("\nFeature number: 0x%x\n", feat); if (feat & HTS_FEATURE_CONFIGURE) printf(" HTS_FEATURE_CONFIGURE\n"); @@ -56,7 +56,7 @@ int main(void) { if (feat & HTS_FEATURE_BZIP2) printf(" HTS_FEATURE_BZIP2\n"); - 
printf("\nFeature string: %s\n", htslib_feature_string()); + printf("\nFeature string: %s\n", hts_feature_string()); // Plugins and schemes From 8aa9bcd10c782c8a03adde978f9ef169ebaf4573 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 4 Feb 2021 11:58:43 +0000 Subject: [PATCH 085/114] Fix a minor memory leak in XRLE encoding. Credit to OSS-Fuzz Fixes oss-fuzz 30260 --- cram/cram_codecs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 55030c814..6a4ee7821 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -2130,6 +2130,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, c->decode = cram_xrle_decode_char; else { fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); + free(c); return NULL; } c->free = cram_xrle_decode_free; From dbac2d17ca4d2c8dc0df770e4b168c2c768548b7 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 9 Feb 2021 13:12:37 +0000 Subject: [PATCH 086/114] Fix htscodecs dependency typos --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ce256a629..7b162f196 100644 --- a/Makefile +++ b/Makefile @@ -386,8 +386,8 @@ cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/po cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) -htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htsodecs_utils_h) $(htscodecs_c_simple_model.h) -htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: 
htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model_h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) From 222387d9d4ed776cf16d50f32a1170ec659c2f29 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 17:58:06 +0000 Subject: [PATCH 087/114] Fix undefined behaviour warning in kstring Fixes a "runtime error: applying zero offset to null pointer" warning from clang 10.0.1 undefined behaviour sanitizer. This happened when the storage for the string had not been allocated before calling kvsprintf(). Fix by making it allocate a buffer if this is the case, so vsnprintf() has something to write into. --- kstring.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kstring.c b/kstring.c index 55626745d..9b2d60c1f 100644 --- a/kstring.c +++ b/kstring.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2018, 2020 Genome Research Ltd. + Copyright (C) 2013-2018, 2020-2021 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -153,6 +153,15 @@ int kvsprintf(kstring_t *s, const char *fmt, va_list ap) return l; } + if (!s->s) { + const size_t sz = 64; + s->s = malloc(sz); + if (!s->s) + return -1; + s->m = sz; + s->l = 0; + } + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. va_end(args); if (l + 1 > s->m - s->l) { From d40cad32b85186e7d760707c61984d3b6fc422b5 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 18:11:31 +0000 Subject: [PATCH 088/114] Fix CRAM undefined behaviour warnings Fixes various warnings from clang 10.0.1 undefined behaviour sanitizer: - A "runtime error: applying zero offset to null pointer" warning in cram_to_bam() on CRAM slices with no CIGAR data. Fix by always allocating memory when creating a slice data structure. For aligned data this simply moves the allocation forward in time. For unaligned data, the allocated memory is not really needed, but it's assumed that it is quicker to make one allocation per slice than to check for a NULL pointer on every call to cram_to_bam(). - A "runtime error: applying zero offset to null pointer" warning in cram_byte_array_stop_decode_block() when it is called before the `out` block has been allocated. Fix by moving the calculation of `out_cp`. Also changes the type of some variables to `unsigned char` enabling removal of a number of (char *) casts. - An integer overflow in the cram_metrics::sz array. Entries 9 and 10 had a high multiplier to ensure they would never win, as they correspond to reserved method codes. Under certain conditions the value in them could build up to the point where they overflowed. The methods are masked out elsewhere, so the numbers would not have been used, but it did cause an overflow warning.
Fix by changing the multiplier to 1, and (in a belt-and-braces approach) setting the values for these entries to INT_MAX to ensure they won't be chosen. --- cram/cram_codecs.c | 18 +++++++++--------- cram/cram_io.c | 13 ++++++++----- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 6a4ee7821..0c6cb654e 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3494,8 +3494,8 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, int *out_size) { cram_block *b; cram_block *out = (cram_block *)out_; - char *cp, *out_cp, *cp_end; - char stop; + unsigned char *cp, *cp_end; + unsigned char stop; b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id); if (!b) @@ -3503,25 +3503,25 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, if (b->idx >= b->uncomp_size) return -1; - cp = (char *)b->data + b->idx; - cp_end = (char *)b->data + b->uncomp_size; - out_cp = (char *)BLOCK_END(out); + cp = b->data + b->idx; + cp_end = b->data + b->uncomp_size; stop = c->u.byte_array_stop.stop; if (cp_end - cp < out->alloc - out->byte) { + unsigned char *out_cp = BLOCK_END(out); while (cp != cp_end && *cp != stop) *out_cp++ = *cp++; - BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); + BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out); } else { - char *cp_start; + unsigned char *cp_start; for (cp_start = cp; cp != cp_end && *cp != stop; cp++) ; BLOCK_APPEND(out, cp_start, cp - cp_start); BLOCK_GROW(out, cp - cp_start); } - *out_size = cp - (char *)(b->data + b->idx); - b->idx = cp - (char *)b->data + 1; + *out_size = cp - (b->data + b->idx); + b->idx = cp - b->data + 1; return 0; diff --git a/cram/cram_io.c b/cram/cram_io.c index 3a6b04c42..8099a9287 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2080,7 +2080,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, 1.04, // 6 arithpr (O0) 1.05, // 7 fqz 1.05, // 8 tok3 (rans) - 9, 9, // 9,10 reserved + 1.00, 1.00, // 9,10 
reserved // Paramterised versions of above 1.01, // gzip rle @@ -2124,6 +2124,9 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, metrics->sz[m] *= 1+(meth_cost[m]-1)/3; } // else cost is ignored + // Ensure these are never used + metrics->sz[9] = metrics->sz[10] = INT_MAX; + for (m = 0; m < CRAM_MAX_METHOD; m++) { if ((!metrics->sz[m]) || (!(method & (1u<block_by_id = NULL; s->last_apos = 0; if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; - s->cigar = NULL; - s->cigar_alloc = 0; + s->cigar_alloc = 1024; + if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; s->ncigar = 0; if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; @@ -4499,8 +4502,8 @@ cram_slice *cram_read_slice(cram_fd *fd) { } /* Initialise encoding/decoding tables */ - s->cigar = NULL; - s->cigar_alloc = 0; + s->cigar_alloc = 1024; + if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; s->ncigar = 0; if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; From 1b66f6e2d10504856a13b5cc2f0d250a410298fd Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 18:34:36 +0000 Subject: [PATCH 089/114] Fix undefined behaviour warning in test-vcf-sweep If the input file had no PL entry in the header, bcf_get_format_int32() returns -1. Not checking for return values < 0 led to the loop over the samples running, and an attempt to do arithmetic on a NULL pointer (ptr). Fix by changing the condition on the nPLs test. 
--- test/test-vcf-sweep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test-vcf-sweep.c b/test/test-vcf-sweep.c index 4b8e3d70b..40ee4e417 100644 --- a/test/test-vcf-sweep.c +++ b/test/test-vcf-sweep.c @@ -63,7 +63,7 @@ int main(int argc, char **argv) { // get copy of the PL vectors nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); - if ( !nPLs ) continue; // PL not present + if ( nPLs <= 0 ) continue; // PL not present // how many values are there per sample int nvals = nPLs / bcf_hdr_nsamples(hdr); @@ -91,7 +91,7 @@ int main(int argc, char **argv) while ( (rec = bcf_sweep_bwd(sw)) ) { nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); - if ( !nPLs ) continue; + if ( nPLs <= 0 ) continue; int nvals = nPLs / bcf_hdr_nsamples(hdr); int32_t *ptr = PLs; for (i=0; i Date: Tue, 2 Feb 2021 16:35:15 +0000 Subject: [PATCH 090/114] Fix undefined behaviour warning in bcf_record_check() Fixes a "runtime error: applying zero offset to null pointer" warning from clang 10.0.1 undefined behaviour sanitizer on lines with no FORMAT values. This happened when bcf_record_check() tried to calculate the end of the (NULL) rec->indiv.s buffer. Instead of trying to handle this in bcf_record_check(), fix by ensuring bcf_read1_core() allocates at least one byte for `indiv`, and for good measure do the same for `shared` as well. As well as fixing the warning in bcf_record_check(), this prevents NULL pointers from being passed to other functions, for example bgzf_read(). --- vcf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index e9868cadf..a0c440202 100644 --- a/vcf.c +++ b/vcf.c @@ -1236,9 +1236,9 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) shared_len = le_to_u32(x); if (shared_len < 24) return -2; shared_len -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, shared_len) != 0) return -2; + if (ks_resize(&v->shared, shared_len ? 
shared_len : 1) != 0) return -2; indiv_len = le_to_u32(x + 4); - if (ks_resize(&v->indiv, indiv_len) != 0) return -2; + if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2; v->rid = le_to_i32(x + 8); v->pos = le_to_u32(x + 12); v->rlen = le_to_i32(x + 16); From 9f8ac0365e455676af096ec81f83cf01dfb309da Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 2 Feb 2021 17:29:11 +0000 Subject: [PATCH 091/114] Fix truncation bug in filtering bitwise expressions When setting `res->is_true` on the result of bitwise expressions, only the lowest eight bits were used which meant it could be set incorrectly depending on the result. Fix by explicitly checking the result against zero. --- hts_expr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hts_expr.c b/hts_expr.c index 1a1b9c52d..599d7a54a 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -1,6 +1,6 @@ /* hts_expr.c -- filter expression parsing and processing. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. 
Author: James Bonfield @@ -413,7 +413,7 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_free(&val); return -1; } - res->is_true = res->d = (int64_t)res->d & (int64_t)val.d; + res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0; } else { break; } @@ -441,7 +441,7 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_free(&val); return -1; } - res->is_true = res->d = (int64_t)res->d ^ (int64_t)val.d; + res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0; } else { break; } @@ -469,7 +469,7 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_free(&val); return -1; } - res->is_true = res->d = (int64_t)res->d | (int64_t)val.d; + res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0; } else { break; } From 7152cb40f1b5404ff9a0523aa3d4b6c060b1ac51 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 18:41:11 +0000 Subject: [PATCH 092/114] Fix undefined behaviour in bcf_update_alleles() bcf_update_alleles() checked to see if any of the allele pointers passed in were pointing to the existing allele data, so it could avoid clobbering them during the update. This caused a "runtime error: applying zero offset to null pointer" from undefined behaviour sanitizer when it was called with a new BCF structure where line->d.als had not yet been allocated. However, even when it had been allocated, the test relied on undefined behaviour as it compared pointers to different memory regions. Fix by rewriting the function so that it always copies the input data to a new location. Where the new alleles are short enough (which should be the usual case) and can fit in the existing allocation, they are first copied into a stack buffer, and then back to line->d.als; otherwise they go into a newly allocated buffer which is switched for the original. 
This avoids the undefined behaviour issues, and also allows the line->d.als memory to be reused even when some of the input alleles point into it. Includes unit tests. --- test/test-vcf-api.c | 63 ++++++++++++++++++++++++++++++++++++++-- vcf.c | 70 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 114 insertions(+), 19 deletions(-) diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 87bce4aab..22886e7af 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -46,6 +46,65 @@ void error(const char *format, ...) #define STRINGIFY(x) #x #define check0(x) ((x) == 0 ? (void) 0 : error("Failed: %s", STRINGIFY(x))) +static int check_alleles(bcf1_t *rec, const char **alleles, int num) { + int i; + if (rec->n_allele != num) { + fprintf(stderr, "Wrong number of alleles - expected %d, got %d\n", + num, rec->n_allele); + return -1; + } + if (bcf_unpack(rec, BCF_UN_STR) != 0) + return -1; + for (i = 0; i < num; i++) { + if (0 != strcmp(alleles[i], rec->d.allele[i])) { + fprintf(stderr, + "Mismatch for allele %d : expected '%s' got '%s'\n", + i, alleles[i], rec->d.allele[i]); + return -1; + } + } + return 0; +} + +static void test_update_alleles(bcf_hdr_t *hdr, bcf1_t *rec) +{ + // Exercise bcf_update_alleles() a bit + const char *alleles1[2] = { "G", "A" }; + const char *alleles2[3] = { "C", "TGCA", "CATG" }; +#define rep10(x) x x x x x x x x x x + const char *alleles3[3] = { rep10("ATTCTAGATC"), "TGCA", + rep10("CTATTATCTCTAATGACATG") }; +#undef rep10 + const char *alleles4[3] = { alleles3[2], NULL, alleles3[0] }; + // Add some alleles + check0(bcf_update_alleles(hdr, rec, alleles1, 2)); + check0(check_alleles(rec, alleles1, 2)); + // Erase them + check0(bcf_update_alleles(hdr, rec, NULL, 0)); + check0(check_alleles(rec, NULL, 0)); + // Expand to three + check0(bcf_update_alleles(hdr, rec, alleles2, 3)); + check0(check_alleles(rec, alleles2, 3)); + // Now try some bigger ones (should force a realloc) + check0(bcf_update_alleles(hdr, rec, alleles3, 
3)); + check0(check_alleles(rec, alleles3, 3)); + // Ensure it works even if one of the alleles points into the + // existing structure + alleles4[1] = rec->d.allele[1]; + check0(bcf_update_alleles(hdr, rec, alleles4, 3)); + alleles4[1] = alleles3[1]; // Will have been clobbered by the update + check0(check_alleles(rec, alleles4, 3)); + // Ensure it works when the alleles point into the existing data, + // rec->d.allele is used to define the input array and the + // order of the entries is changed. The result of this should + // be the same as alleles2. + char *tmp = rec->d.allele[0] + strlen(rec->d.allele[0]) - 4; + rec->d.allele[0] = rec->d.allele[2] + strlen(rec->d.allele[2]) - 1; + rec->d.allele[2] = tmp; + check0(bcf_update_alleles(hdr, rec, (const char **) rec->d.allele, 3)); + check0(check_alleles(rec, alleles2, 3)); +} + void write_bcf(char *fname) { // Init @@ -114,10 +173,10 @@ void write_bcf(char *fname) // .. ID check0(bcf_update_id(hdr, rec, "rs6054257")); // .. REF and ALT + test_update_alleles(hdr, rec); const char *alleles[2] = { "G", "A" }; - check0(bcf_update_alleles(hdr, rec, alleles, 2)); - check0(bcf_update_alleles(hdr, rec, NULL, 0)); check0(bcf_update_alleles_str(hdr, rec, "G,A")); + check0(check_alleles(rec, alleles, 2)); // .. QUAL rec->qual = 29; // .. FILTER diff --git a/vcf.c b/vcf.c index a0c440202..980f8bff8 100644 --- a/vcf.c +++ b/vcf.c @@ -4443,28 +4443,64 @@ static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nal int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) { if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR); - kstring_t tmp = {0,0,0}; char *free_old = NULL; - - // If the supplied alleles are not pointers to line->d.als, the existing block can be reused. + char buffer[256]; + size_t used = 0; + + // The pointers in alleles may point into the existing line->d.als memory, + // so care needs to be taken not to clobber them while updating. 
Usually + // they will be short so we can copy through an intermediate buffer. + // If they're longer, or won't fit in the existing allocation we + // can allocate a new buffer to write into. Note that in either case + // pointers to line->d.als memory in alleles may not be valid when we've + // finished. int i; - for (i=0; i=line->d.als && alleles[i]d.als+line->d.m_als ) break; - if ( i==nals ) - { - // all alleles point elsewhere, reuse the existing block - tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; - } - else + size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer); + for (i=0; id.m_als) // Don't shrink the buffer + needed = line->d.m_als; + if (needed > INT_MAX) { + hts_log_error("REF + alleles too long to fit in a BCF record"); + return -1; + } + new_als = malloc(needed); + if (!new_als) + return -1; free_old = line->d.als; + line->d.als = new_als; + line->d.m_als = needed; + } - for (i=0; id.m_als); + memcpy(line->d.als, buffer, used); } - line->d.als = tmp.s; line->d.m_als = tmp.m; - free(free_old); + + // Add in any remaining entries - if this happens we will always be + // writing to a newly-allocated buffer. + for (; i < nals; i++) { + size_t sz = strlen(alleles[i]) + 1; + memcpy(line->d.als + used, alleles[i], sz); + used += sz; + } + + if (free_old) + free(free_old); return _bcf1_sync_alleles(hdr,line,nals); } From 473f8d86411548b24819597a75d8f7f6cdb27629 Mon Sep 17 00:00:00 2001 From: Robert Davies Date: Mon, 8 Feb 2021 17:36:46 +0000 Subject: [PATCH 093/114] Change MACH_O compatibility and current versions When building libhts.dylib, the compatibility_version and current_version were not being set in the way documented by Apple - notably they expect current_version to be from the same sequence as, and greater than compatibility_version. 
See: https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html Change the values so current_version is related to the current HTSlib version, and compatibility_version the last one that introduced a backwards compatible change. As earlier HTSlib releases set compatibility_version to 3, we prepend '3.' to these numbers so programs linked to earlier versions will still work with libraries built after this change. This will be removed on the next ABI-changing update to the library, which will allow the compatibility_version and current_version sequences to be reset. Thanks to John Marshall for help with this issue. --- Makefile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7b162f196..d2bdaf07f 100644 --- a/Makefile +++ b/Makefile @@ -122,7 +122,13 @@ include htscodecs.mk PACKAGE_VERSION := $(shell ./version.sh) LIBHTS_SOVERSION = 3 -MACH_O_COMPATIBILITY_VERSION = $(LIBHTS_SOVERSION) + +# Version numbers for the Mac dynamic library. Note that the leading 3 +# is not strictly necessary and should be removed the next time +# LIBHTS_SOVERSION is bumped (see #1144 and +# https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) +MACH_O_COMPATIBILITY_VERSION = 3.1.11 +MACH_O_CURRENT_VERSION = 3.1.11 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. @@ -308,7 +314,7 @@ libhts.so: $(LIBHTS_OBJS:.o=.pico) # includes this project's build directory). 
libhts.dylib: $(LIBHTS_OBJS) - $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(MACH_O_COMPATIBILITY_VERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) + $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(MACH_O_CURRENT_VERSION) -compatibility_version $(MACH_O_COMPATIBILITY_VERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib cyghts-$(LIBHTS_SOVERSION).dll libhts.dll.a: $(LIBHTS_OBJS) From 26c9da239bc50bba2de0ad571ad3279c4dd8dd99 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 10 Feb 2021 11:27:22 -0700 Subject: [PATCH 094/114] Update vcf.h Fixed typo in example of bcf_get_format_*() functions. --- htslib/vcf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 6e476187a..732f25607 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1023,7 +1023,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). * int max_ploidy = ngt/nsmpl; * for (i=0; i Date: Wed, 10 Feb 2021 10:13:37 +0000 Subject: [PATCH 095/114] Update htscodecs submodule - Fix off-by-one error in array bounds checking. 
Credit to OSS-Fuzz Fixes oss-fuzz 30381 Fixes oss-fuzz 30395 - Use printf in shell scripts instead of echo -n --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 0ffb50be0..306b3ebf9 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 0ffb50be07eab4ff40c8e6b7dc346266c6a23421 +Subproject commit 306b3ebf96890e71ca6e376f09ae6a1a0765ed99 From 449cfe1cdabef00a9533a44f7bda921038395924 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 11 Feb 2021 16:44:43 +0000 Subject: [PATCH 096/114] Change travis to cirrus in .gitattributes --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index e46cc5cb4..a14bb82b1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,7 +9,7 @@ # Omit these files from release tarballs. /.appveyor.yml export-ignore .git* export-ignore -/.travis.yml export-ignore +/.cirrus.yml export-ignore README.md export-ignore # Remove the text attribute from reference files, so that git doesn't convert From 086881b291c827ad67e3d95ff605ed8094190bb3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 11 Feb 2021 17:45:45 +0000 Subject: [PATCH 097/114] Add {check,distclean,test}-htslib phony targets to htslib.mk So distclean-all, check-all and test-all targets can be added to samtools/bcftools build systems. 
--- htslib.mk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/htslib.mk b/htslib.mk index 4d0fa0205..0c000a1a3 100644 --- a/htslib.mk +++ b/htslib.mk @@ -188,7 +188,8 @@ $(HTSDIR)/htslib.pc.tmp: # # clean: clean-htslib -all-htslib clean-htslib install-htslib plugins-htslib: +all-htslib check-htslib clean-htslib distclean-htslib install-htslib plugins-htslib test-htslib: +cd $(HTSDIR) && $(MAKE) $(@:-htslib=) -.PHONY: all-htslib clean-htslib install-htslib plugins-htslib +.PHONY: all-htslib check-htslib clean-htslib distclean-htslib install-htslib +.PHONY: plugins-htslib test-htslib From 550c6b1b98414ef60eeb665cbfda9f6350d8907c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 11 Feb 2021 17:53:55 +0000 Subject: [PATCH 098/114] Clarify that the htscodecs submodule is part of htslib The original message printed when the submodule files are not present could be confusing if HTSlib was being built as an embedded part of SAMtools. Alter the message slightly to make it clearer that the submodule needs to be updated in the htslib checkout. Fixes samtools/samtools#1364 --- Makefile | 4 ++-- configure.ac | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index d2bdaf07f..c61a18491 100644 --- a/Makefile +++ b/Makefile @@ -422,10 +422,10 @@ htscodecs/htscodecs/%.h: | htscodecs/htscodecs ; htscodecs/htscodecs: @if test -e .git ; then \ - printf "\\n\\nError: htscodecs submodule files not present.\\n\ + printf "\\n\\nError: htscodecs submodule files not present for htslib.\\n\ Try running: \\n\ git submodule update --init --recursive\\n\ - and then re-run make.\\n\\n\\n" ; \ + in the top-level htslib directory and then re-run make.\\n\\n\\n" ; \ else \ printf "\\n\\nError: htscodecs submodule files not present and this is not a git checkout.\\n\ You have an incomplete distribution. 
Please try downloading one of the\\n\ diff --git a/configure.ac b/configure.ac index d79947ae4..4d777e137 100644 --- a/configure.ac +++ b/configure.ac @@ -319,7 +319,7 @@ included as a submodule. Try running: git submodule update --init --recursive -to update it, and then re-run configure. +in the top-level htslib directory to update it, and then re-run configure. ])], [MSG_ERROR([htscodecs submodule files not present. From 680c0b8ef0ff133d3b572abc80fe66fc2ea965f0 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 16 Feb 2021 14:01:58 +0000 Subject: [PATCH 099/114] Change CI scripts to use "autoreconf -i" and update docs autoconf 2.70 requires use of `autoreconf -i` for the AC_CANONICAL_HOST macro, which is pulled in by AC_FUNC_MMAP. Update CI configuration to use this, and change the instructions in the INSTALL and README.md files to match. --- .appveyor.yml | 2 +- .cirrus.yml | 2 +- INSTALL | 8 ++------ README.md | 3 +-- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index f8944daef..45550bade 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -34,7 +34,7 @@ build_script: - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - git submodule update --init --recursive - - "sh -lc \"autoheader && autoconf && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" + - "sh -lc \"autoreconf -i && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" #build_script: # - make diff --git a/.cirrus.yml b/.cirrus.yml index 6b9bcd8cf..4df3dfd08 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -33,7 +33,7 @@ compile_template: &COMPILE fi if test "$USE_CONFIG" = "yes"; then MAKE_OPTS= - autoreconf + autoreconf -i eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ ( cat config.log; false ) else diff --git a/INSTALL b/INSTALL index 42bffb85b..fbe591d5a 100644 --- a/INSTALL +++ b/INSTALL @@ -17,6 +17,7 @@ In addition, building the configure script requires: 
autoheader autoconf + autoreconf Running the configure script uses awk, along with a number of standard UNIX tools (cat, cp, grep, mv, rm, sed, among others). Almost @@ -79,12 +80,7 @@ This step is only needed if configure.ac has been changed, or if configure does not exist (for example, when building from a git clone). The configure script and config.h.in can be built by running: - autoheader - autoconf - -If you have a full GNU autotools install, you can alternatively run: - - autoreconf + autoreconf -i Basic Installation ================== diff --git a/README.md b/README.md index e28f4a415..fccfa7775 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,7 @@ committed to this repository, so building the code from a Git repository requires extra steps: ```sh -autoheader # If using configure, generate the header template... -autoconf # ...and configure script (or use autoreconf to do both) +autoreconf -i # Build the configure script and install files it uses ./configure # Optional but recommended, for choosing extra functionality make make install From 10a6a8b869d7d58aa658fc5ca01f613321928944 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 16 Feb 2021 13:32:55 +0000 Subject: [PATCH 100/114] Allow a "remove tag" operation followed by "add the same tag" Fixes https://github.com/samtools/bcftools/issues/1414 (bcftools annotate can segfault when --remove and --annotations used at once) --- vcf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index 980f8bff8..565c28edd 100644 --- a/vcf.c +++ b/vcf.c @@ -4165,7 +4165,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v if ( inf ) { // Is it big enough to accommodate new block? 
- if ( str.l <= inf->vptr_len + inf->vptr_off ) + if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off ) { if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF; uint8_t *ptr = inf->vptr - inf->vptr_off; @@ -4322,7 +4322,7 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const else { // The tag is already present, check if it is big enough to accommodate the new block - if ( str.l <= fmt->p_len + fmt->p_off ) + if ( fmt->p && str.l <= fmt->p_len + fmt->p_off ) { // good, the block is big enough if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1; From 8127bfc98e9b4361dca2423fd42a59ad7c25dda7 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 16 Feb 2021 16:49:59 +0000 Subject: [PATCH 101/114] Fix reading UTF-8 encoded sample names when char is signed The trick used in bcf_hdr_parse_sample_line() to rapidly find tabs and newlines could be defeated by UTF-8 characters outside the Basic Latin range on platforms where "char" is signed (like x86). It's currently not clear if VCF intends to allow these, but the 4.3 specification does allow UTF-8 and it's easy enough to support. Fix by casting to unsigned when making the comparison. Modifies formatcols.vcf to include a UTF-8 character for a round-trip test. Fixes samtools/bcftools#1408 --- test/formatcols.vcf | 2 +- vcf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/formatcols.vcf b/test/formatcols.vcf index c46cf46fe..6bbdb3af6 100644 --- a/test/formatcols.vcf +++ b/test/formatcols.vcf @@ -2,5 +2,5 @@ ##FILTER= ##contig= ##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S² S3 1 100 a A T . . . 
S a bbbbbbb ccccccccc diff --git a/vcf.c b/vcf.c index 565c28edd..81fc70c66 100644 --- a/vcf.c +++ b/vcf.c @@ -150,7 +150,7 @@ int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) const char *p, *q; // add samples for (p = q = str;; ++q) { - if (*q > '\n') continue; + if ((unsigned char) *q > '\n') continue; if (++i > 9) { if ( bcf_hdr_add_sample_len(h, p, q - p) < 0 ) ret = -1; } From a117153ee56eb6fd0a4f8933320591451c3395c9 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 17 Feb 2021 17:04:19 +0000 Subject: [PATCH 102/114] Add mostlyclean-htslib and testclean-htslib targets to htslib.mk --- htslib.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htslib.mk b/htslib.mk index 0c000a1a3..6b199e382 100644 --- a/htslib.mk +++ b/htslib.mk @@ -188,8 +188,8 @@ $(HTSDIR)/htslib.pc.tmp: # # clean: clean-htslib -all-htslib check-htslib clean-htslib distclean-htslib install-htslib plugins-htslib test-htslib: +all-htslib check-htslib clean-htslib distclean-htslib install-htslib mostlyclean-htslib plugins-htslib test-htslib testclean-htslib: +cd $(HTSDIR) && $(MAKE) $(@:-htslib=) .PHONY: all-htslib check-htslib clean-htslib distclean-htslib install-htslib -.PHONY: plugins-htslib test-htslib +.PHONY: mostlyclean-htslib plugins-htslib test-htslib testclean-htslib From c6713293758bd8a852203bf08df0396a33d9a23c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 22 Feb 2021 15:23:30 +0000 Subject: [PATCH 103/114] Pull in htscodecs name tokeniser uninitialised memory access fix Credit to OSS-Fuzz Fixes oss-fuzz 31210 Fixes oss-fuzz 31215 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 306b3ebf9..2f481c822 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 306b3ebf96890e71ca6e376f09ae6a1a0765ed99 +Subproject commit 2f481c8227724c408fa7e7b2087457f74c041de9 From ee32bfb430dd15adf1291f2d46c26d2462c129ee Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 
1 Mar 2021 12:34:56 +0000 Subject: [PATCH 104/114] Recognise legacy RAZF compression RAZF is an obsolete predecessor to BGZF, and is similarly a variant of GZIP using an extra header field. It also adds a trailing index table. Adding this htsCompression value does not affect bgzf_read_init()'s detection of BGZF vs plain-GZIP; RAZF remains treated as is_gzip and the trailing index table is not handled well, leading to problems if you try to decompress such a legacy file with e.g. bgzip -d. --- hts.c | 10 ++++++++-- htslib/hts.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index 57ee4d1d6..fe5d971ad 100644 --- a/hts.c +++ b/hts.c @@ -468,8 +468,13 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { // The stream is either gzip-compressed or BGZF-compressed. // Determine which, and decompress the first few records or lines. - fmt->compression = (len >= 18 && (s[3] & 4) && - memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip; + fmt->compression = gzip; + if (len >= 18 && (s[3] & 4)) { + if (memcmp(&s[12], "BC\2\0", 4) == 0) + fmt->compression = bgzf; + else if (memcmp(&s[12], "RAZF", 4) == 0) + fmt->compression = razf_compression; + } if (len >= 9 && s[2] == 8) fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 
1 : -1; @@ -664,6 +669,7 @@ char *hts_format_description(const htsFormat *format) switch (format->compression) { case bzip2_compression: kputs(" bzip2-compressed", &str); break; + case razf_compression: kputs(" legacy-RAZF-compressed", &str); break; case custom: kputs(" compressed", &str); break; case gzip: kputs(" gzip-compressed", &str); break; case bgzf: diff --git a/htslib/hts.h b/htslib/hts.h index 9488a9f9b..a0bb7e472 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -209,7 +209,7 @@ enum htsExactFormat { }; enum htsCompression { - no_compression, gzip, bgzf, custom, bzip2_compression, + no_compression, gzip, bgzf, custom, bzip2_compression, razf_compression, compression_maximum = 32767 }; From 2abfea362eab837500b2c492dcbc47f5688d318c Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 2 Mar 2021 12:15:47 +0000 Subject: [PATCH 105/114] Refuse to read the legacy RAZF compression format Instead emit an error message recommending the use of gunzip to decompress the file, in the unlikely event a RAZF file is encountered. If seeking is available, attempt to read the sizes stored at the end of the RAZF trailing index table so that the message can show a truncate command to remove the index table before gunzipping the file. 
--- bgzf.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/bgzf.c b/bgzf.c index ded3884e4..6fa6105fe 100644 --- a/bgzf.c +++ b/bgzf.c @@ -49,6 +49,10 @@ #include "cram/pooled_alloc.h" #include "hts_internal.h" +#ifndef EFTYPE +#define EFTYPE ENOEXEC +#endif + #define BGZF_CACHE #define BGZF_MT @@ -315,6 +319,37 @@ static inline void packInt32(uint8_t *buffer, uint32_t value) buffer[3] = value >> 24; } +static void razf_info(hFILE *hfp, const char *filename) +{ + uint64_t usize, csize; + off_t sizes_pos; + + if (filename == NULL || strcmp(filename, "-") == 0) filename = "FILE"; + + // RAZF files end with USIZE,CSIZE stored as big-endian uint64_t + if ((sizes_pos = hseek(hfp, -16, SEEK_END)) < 0) goto no_sizes; + if (hread(hfp, &usize, 8) != 8 || hread(hfp, &csize, 8) != 8) goto no_sizes; + if (!ed_is_big()) ed_swap_8p(&usize), ed_swap_8p(&csize); + if (csize >= sizes_pos) goto no_sizes; // Very basic validity check + + hts_log_error( +"To decompress this file, use the following commands:\n" +" truncate -s %" PRIu64 " %s\n" +" gunzip %s\n" +"The resulting uncompressed file should be %" PRIu64 " bytes in length.\n" +"If you do not have a truncate command, skip that step (though gunzip will\n" +"likely produce a \"trailing garbage ignored\" message, which can be ignored).", + csize, filename, filename, usize); + return; + +no_sizes: + hts_log_error( +"To decompress this file, use the following command:\n" +" gunzip %s\n" +"This will likely produce a \"trailing garbage ignored\" message, which can\n" +"usually be safely ignored.", filename); +} + static const char *bgzf_zerr(int errnum, z_stream *zs) { static char buffer[32]; @@ -352,7 +387,7 @@ static const char *bgzf_zerr(int errnum, z_stream *zs) } } -static BGZF *bgzf_read_init(hFILE *hfpr) +static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) { BGZF *fp; uint8_t magic[18]; @@ -368,6 +403,13 @@ static BGZF *bgzf_read_init(hFILE *hfpr) 
fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE; fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b); fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1; + if (fp->is_compressed && (magic[3]&4) && memcmp(&magic[12], "RAZF", 4)==0) { + hts_log_error("Cannot decompress legacy RAZF format"); + razf_info(hfpr, filename); + free(fp); + errno = EFTYPE; + return NULL; + } #ifdef BGZF_CACHE if (!(fp->cache = malloc(sizeof(*fp->cache)))) { free(fp); @@ -450,7 +492,7 @@ BGZF *bgzf_open(const char *path, const char *mode) if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = hopen(path, mode)) == 0) return 0; - fp = bgzf_read_init(fpr); + fp = bgzf_read_init(fpr, path); if (fp == 0) { hclose_abruptly(fpr); return NULL; } fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { @@ -473,7 +515,7 @@ BGZF *bgzf_dopen(int fd, const char *mode) if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = hdopen(fd, mode)) == 0) return 0; - fp = bgzf_read_init(fpr); + fp = bgzf_read_init(fpr, NULL); if (fp == 0) { hclose_abruptly(fpr); return NULL; } // FIXME this closes fd fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { @@ -494,7 +536,7 @@ BGZF *bgzf_hopen(hFILE *hfp, const char *mode) BGZF *fp = NULL; assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { - fp = bgzf_read_init(hfp); + fp = bgzf_read_init(hfp, NULL); if (fp == NULL) return NULL; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { fp = bgzf_write_init(mode); From b6ccfa641431523a4ee583b34f8a9817f44535a9 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 25 Feb 2021 16:36:05 +0000 Subject: [PATCH 106/114] Update to htscodecs release 1.0 Updates the htscodecs submodule and updates dependencies for the new header files. Adds Makefile infrastructure to build and clean up the htscodecs/htscodecs/version.h file, for git checkouts only. 
Release tarballs will ship with a pre-built copy of this file. Fixes issue where the Makefile rules to test if the submodule is missing would cause missing source files to be silently ignored. They now run a recipe to check for the target, and suggest updating the submodule if it's missing. Adds documentation for the '--with-external-htscodecs' configure option to INSTALL. The hts_version() function is unchanged reporting pure htslib only. Use hts_test_feature(HTS_FEATURE_HTSCODECS) to query htscodecs version. --- INSTALL | 8 ++++++++ Makefile | 37 +++++++++++++++++++++++++++++++++---- hts.c | 25 ++++++++++++++++++------- htscodecs | 2 +- htscodecs_bundled.mk | 3 +++ htscodecs_external.mk | 2 ++ htslib/hts.h | 1 + test/test_introspection.c | 4 ++++ 8 files changed, 70 insertions(+), 12 deletions(-) diff --git a/INSTALL b/INSTALL index fbe591d5a..277be56d2 100644 --- a/INSTALL +++ b/INSTALL @@ -73,6 +73,10 @@ either clone the project using "git clone --recurse-submodules", or run: to ensure the correct version of the submodule is present. +It is also possible to link against an external libhtscodecs library +by using the '--with-external-htscodecs' configure option. When +this is used, the submodule files will be ignored. + Building Configure ================== @@ -140,6 +144,10 @@ various features and specify further optional external requirements: searched; you can use --with-plugin-path='DIR:$(plugindir):DIR' and so on to cause additional directories to be searched. +--with-external-htscodecs + Build and link against an external copy of the htscodecs library + instead of using the source files in the htscodecs directory. + --enable-libcurl Use libcurl () to implement network access to remote files via FTP, HTTP, HTTPS, etc. 
diff --git a/Makefile b/Makefile index c61a18491..24bb2295d 100644 --- a/Makefile +++ b/Makefile @@ -358,7 +358,7 @@ hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstrin hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) @@ -394,6 +394,7 @@ thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) htscodecs/htscodecs/fqzcomp_qual.o 
htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: htscodecs/htscodecs/htscodecs.c $(htscodecs_htscodecs_h) $(htscodecs_version_h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) @@ -417,9 +418,19 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl # Runes to check that the htscodecs submodule is present ifdef HTSCODECS_SOURCES htscodecs/htscodecs/%.c: | htscodecs/htscodecs - ; + @if test -e htscodecs/.git && test ! -e "$@" ; then \ + echo "Missing file '$@'" ; \ + echo " - Do you need to update the htscodecs submodule?" ; \ + false ; \ + fi + htscodecs/htscodecs/%.h: | htscodecs/htscodecs - ; + @if test -e htscodecs/.git && test ! -e "$@" ; then \ + echo "Missing file '$@'" ; \ + echo " - Do you need to update the htscodecs submodule?" 
; \ + false ; \ + fi + htscodecs/htscodecs: @if test -e .git ; then \ printf "\\n\\nError: htscodecs submodule files not present for htslib.\\n\ @@ -432,6 +443,21 @@ htscodecs/htscodecs: official releases from https://www.htslib.org/\\n" ; \ fi @false + +# Build the htscodecs/htscodecs/version.h file if necessary +htscodecs/htscodecs/version.h: force + @if test -e htscodecs/.git && test -e htscodecs/configure.ac ; then \ + cd htscodecs && \ + vers=`git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \ + case "$$vers" in \ + v*) vers=$${vers#v} ;; \ + *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \ + esac ; \ + if ! grep -s -q '"'"$$vers"'"' htscodecs/version.h ; then \ + echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \ + echo '#define HTSCODECS_VERSION_TEXT "'"$$vers"'"' > htscodecs/version.h ; \ + fi ; \ + fi endif # Maintainer source code checks @@ -705,9 +731,12 @@ testclean: -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt -rm -rf htscodecs/tests/test.out +# Only remove this in git checkouts +DEL_HTSCODECS_VERSION := $(if $(wildcard htscodecs/.git),htscodecs/htscodecs/version.h) + mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM config_vars.h version.h - -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico + -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico $(DEL_HTSCODECS_VERSION) -rm -f hts-object-files -rm -f htscodecs/tests/*.o diff --git a/hts.c b/hts.c index fe5d971ad..7842d92f3 100644 --- a/hts.c +++ b/hts.c @@ -58,6 +58,11 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/kseq.h" #include "htslib/ksort.h" #include "htslib/tbx.h" +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#else +#include "htscodecs/htscodecs/htscodecs.h" +#endif #ifndef EFTYPE #define EFTYPE ENOEXEC @@ -74,7 +79,7 @@ const char *hts_version() } unsigned int hts_features(void) { - unsigned int feat = 0; + unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present #ifdef PACKAGE_URL feat |= HTS_FEATURE_CONFIGURE; @@ -132,6 +137,9 @@ const char *hts_test_feature(unsigned int id) { case HTS_FEATURE_LZMA: return feat & HTS_FEATURE_LZMA ? "yes" : NULL; + case HTS_FEATURE_HTSCODECS: + return htscodecs_version(); + case HTS_FEATURE_CC: return HTS_CC; case HTS_FEATURE_CFLAGS: @@ -151,6 +159,7 @@ const char *hts_test_feature(unsigned int id) { // Note this implementation also means we can just "strings" the library // to find the configuration parameters. const char *hts_feature_string(void) { + static char config[1200]; const char *fmt= #ifdef PACKAGE_URL @@ -196,18 +205,20 @@ const char *hts_feature_string(void) { #endif #ifdef HAVE_LIBBZ2 - "bzip2=yes "; + "bzip2=yes " #else - "bzip2=no "; + "bzip2=no " #endif + "htscodecs=%.40s"; + #ifdef ENABLE_PLUGINS - static char config[1200]; - sprintf(config, fmt, hts_plugin_path()); - return config; + snprintf(config, sizeof(config), fmt, + hts_plugin_path(), htscodecs_version()); #else - return fmt; + snprintf(config, sizeof(config), fmt, htscodecs_version()); #endif + return config; } diff --git a/htscodecs b/htscodecs index 2f481c822..30bc9fdca 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 2f481c8227724c408fa7e7b2087457f74c041de9 +Subproject commit 30bc9fdca45e144bd975eb2a2563c1cac43c2ec5 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index de4d5db8d..7242e210b 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -25,6 +25,7 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/fqzcomp_qual.c \ + 
$(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ @@ -36,6 +37,7 @@ HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) # htscodecs public headers htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h +htscodecs_htscodecs_h = htscodecs/htscodecs/htscodecs.h $(htscodecs_version_h) htscodecs_pack_h = htscodecs/htscodecs/pack.h htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h @@ -51,6 +53,7 @@ htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h $(htscodecs_htscodecs_endian_h) htscodecs_utils_h = htscodecs/htscodecs/utils.h +htscodecs_version_h = htscodecs/htscodecs/version.h # Add htscodecs tests into the HTSlib test framework diff --git a/htscodecs_external.mk b/htscodecs_external.mk index ce24dd6f5..3f86811f5 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -28,6 +28,7 @@ HTSCODECS_TEST_TARGETS = htscodecs_arith_dynamic_h = htscodecs_fqzcomp_qual_h = +htscodecs_htscodecs_h = htscodecs_pack_h = htscodecs_rANS_static_h = htscodecs_rANS_static4x16_h = @@ -42,3 +43,4 @@ htscodecs_pooled_alloc_h = htscodecs_rANS_byte_h = htscodecs_rANS_word_h = htscodecs_utils_h = +htscodecs_version_h = diff --git a/htslib/hts.h b/htslib/hts.h index a0bb7e472..d4365e6e3 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -483,6 +483,7 @@ const char *hts_feature_string(void); #define HTS_FEATURE_LIBDEFLATE (1u<<20) #define HTS_FEATURE_LZMA (1u<<21) #define HTS_FEATURE_BZIP2 (1u<<22) +#define HTS_FEATURE_HTSCODECS (1u<<23) // htscodecs library version // Build params #define HTS_FEATURE_CC (1u<<27) diff --git a/test/test_introspection.c b/test/test_introspection.c index 
cc8ceb6e7..658d92f92 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -31,6 +31,8 @@ DEALINGS IN THE SOFTWARE. */ int main(void) { printf("Version string: %s\n", hts_version()); printf("Version number: %d\n", HTS_VERSION); + printf("\nhtscodecs version: %s\n", + hts_test_feature(HTS_FEATURE_HTSCODECS)); printf("\nCC: %s\n", hts_test_feature(HTS_FEATURE_CC)); printf("CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); @@ -55,6 +57,8 @@ int main(void) { printf(" HTS_FEATURE_LZMA\n"); if (feat & HTS_FEATURE_BZIP2) printf(" HTS_FEATURE_BZIP2\n"); + if (feat & HTS_FEATURE_HTSCODECS) + printf(" HTS_FEATURE_HTSCODECS\n"); printf("\nFeature string: %s\n", hts_feature_string()); From 0380c7b4f35f576e3d9b94a27dfe10f22ecdb620 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 3 Mar 2021 23:19:51 +0000 Subject: [PATCH 107/114] When not using configure, define _XOPEN_SOURCE (PR #1246) Non-configure -std=c99 builds (as opposed to -std=gnu99 builds), e.g., `make CC='gcc -std=c99'`, previously failed as glibc (and maybe others) suppresses non-Standard-C functions in standard headers in this mode. This reactivates them. In particular: Rhtslib, Rsamtools, and other R-based builds do not use configure and don't supply their own config.h, and may specify -std=c99. (500 suffices for glibc, but macOS's headers require 600 to have them provide declarations for strdup() and snprintf(). This appears to be a bug related to their expected _C99_SOURCE define, which isn't defined.) --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 24bb2295d..a3c0748b4 100644 --- a/Makefile +++ b/Makefile @@ -239,6 +239,9 @@ thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) # but if those aren't used create a default config.h here. 
config.h: echo '/* Default config.h generated by Makefile */' > $@ + echo '#ifndef _XOPEN_SOURCE' >> $@ + echo '#define _XOPEN_SOURCE 600' >> $@ + echo '#endif' >> $@ echo '#define HAVE_LIBBZ2 1' >> $@ echo '#define HAVE_LIBLZMA 1' >> $@ echo '#ifndef __APPLE__' >> $@ From 848b301d6f04a44fa49b08987085eb040bbe1fc3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 3 Mar 2021 23:35:01 +0000 Subject: [PATCH 108/114] Fix leak of uncompressed_block on bgzf_read_init() error Detected by making an RAZF file, but it could also leak on failure to create the cache. Credit to OSS-Fuzz Fixes oss-fuzz 31665 --- bgzf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bgzf.c b/bgzf.c index 6fa6105fe..b2ee7c869 100644 --- a/bgzf.c +++ b/bgzf.c @@ -406,16 +406,19 @@ static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) if (fp->is_compressed && (magic[3]&4) && memcmp(&magic[12], "RAZF", 4)==0) { hts_log_error("Cannot decompress legacy RAZF format"); razf_info(hfpr, filename); + free(fp->uncompressed_block); free(fp); errno = EFTYPE; return NULL; } #ifdef BGZF_CACHE if (!(fp->cache = malloc(sizeof(*fp->cache)))) { + free(fp->uncompressed_block); free(fp); return NULL; } if (!(fp->cache->h = kh_init(cache))) { + free(fp->uncompressed_block); free(fp->cache); free(fp); return NULL; From 1949a15a544ce5fb1ea48b53d7860df035a09f93 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 4 Mar 2021 17:33:29 +0000 Subject: [PATCH 109/114] Add options to force S3 address style. (PR #1249) Add options to force S3 address style. (PR #1249) Allow S3 configuration options to force path-style URLs, for local set-ups that do not support virtual hosts. Fixes samtools/samtools#1377 --- hfile_s3.c | 54 +++++++++++++++++++++++++++++++++++++++++++--- htslib-s3-plugin.7 | 19 +++++++++++++++- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/hfile_s3.c b/hfile_s3.c index eeb355065..4f1d536dc 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include @@ -503,6 +504,7 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, ptrdiff_t bucket_len; int is_https = 1, dns_compliant; char *query_start; + enum {s3_auto, s3_virtual, s3_path} address_style = s3_auto; if (!ad) return NULL; @@ -555,29 +557,75 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile); else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile); else kputs("default", &profile); + + if ((v = getenv("HTS_S3_ADDRESS_STYLE")) != NULL) { + if (strcasecmp(v, "virtual") == 0) { + address_style = s3_virtual; + } else if (strcasecmp(v, "path") == 0) { + address_style = s3_path; + } + } } if (ad->id.l == 0) { + kstring_t url_style = KS_INITIALIZE; const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); parse_ini(v? v : "~/.aws/credentials", profile.s, "aws_access_key_id", &ad->id, "aws_secret_access_key", &ad->secret, "aws_session_token", &ad->token, - "region", &ad->region, NULL); + "region", &ad->region, + "addressing_style", &url_style, + NULL); + + if (url_style.l) { + if (strcmp(url_style.s, "virtual") == 0) { + address_style = s3_virtual; + } else if (strcmp(url_style.s, "path") == 0) { + address_style = s3_path; + } else { + address_style = s3_auto; + } + } + + ks_free(&url_style); } if (ad->id.l == 0) { + kstring_t url_style = KS_INITIALIZE; const char *v = getenv("HTS_S3_S3CFG"); parse_ini(v? v : "~/.s3cfg", profile.s, "access_key", &ad->id, "secret_key", &ad->secret, "access_token", &ad->token, "host_base", &ad->host, - "bucket_location", &ad->region, NULL); + "bucket_location", &ad->region, + "host_bucket", &url_style, + NULL); + + if (url_style.l) { + // Conforming to s3cmd's GitHub PR#416, host_bucket without the "%(bucket)s" string + // indicates use of path style adressing. 
+ if (strstr(url_style.s, "%(bucket)s") == NULL) { + address_style = s3_path; + } else { + address_style = s3_auto; + } + } + + ks_free(&url_style); } if (ad->id.l == 0) parse_simple("~/.awssecret", &ad->id, &ad->secret); - dns_compliant = is_dns_compliant(bucket, path, is_https); + + // if address_style is set, force the dns_compliant setting + if (address_style == s3_virtual) { + dns_compliant = 1; + } else if (address_style == s3_path) { + dns_compliant = 0; + } else { + dns_compliant = is_dns_compliant(bucket, path, is_https); + } if (ad->host.l == 0) kputs("s3.amazonaws.com", &ad->host); diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index ea8eefa39..022003afb 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -2,7 +2,7 @@ .SH NAME s3 plugin \- htslib AWS S3 plugin .\" -.\" Copyright (C) 2019 Genome Research Ltd. +.\" Copyright (C) 2021 Genome Research Ltd. .\" .\" Author: Andrew Whitwham .\" @@ -105,16 +105,33 @@ Sets the upload part size in Mb, the minimum being 5Mb. By default the part size starts at 5Mb and expands at regular intervals to accommodate bigger files (up to 2.5 Tbytes with the current rate). Using this setting disables the automatic part size expansion. +.TP +.B HTS_S3_ADDRESS_STYLE +Sets the URL style. Options are auto (default), virtual or path. .LP In the absence of an ID from the previous two methods the credential/config files will be used. The default file locations are either \fI~/.aws/credentials\fR or \fI~/.s3cfg\fR (in that order). + +Entries used in aws style credentials file are aws_access_key_id, +aws_secret_access_key, aws_session_token, region and addressing_style. Only the +first two are usually needed. + +Entries used in s3cmd style config files are access_key, secret_key, +access_token, host_base, bucket_location and host_bucket. Again only the first +two are usually needed. The host_bucket option is only used to set a path-style +URL, see below. 
+ .SH NOTES In most cases this plugin transforms the given URL into a virtual host-style format e.g. \fIhttps://bucket.host/path/to/file\fR. A path-style format is used where the URL is not DNS compliant or the bucket name contains a dot e.g. \fIhttps://host/bu.cket/path/to/file\fR. +Path-style can be forced by setting one of HTS_S3_ADDRESS_STYLE, +addressing_style or host_bucket. The first two can be set to \fBpath\fR while +host_bucket must \fBnot\fR include the \fB%(bucket)s\fR string. + .SH "SEE ALSO" .BR htsfile (1) .BR samtools (1) From f518a265cfeb575c0d93f6090eea863e5ad4a914 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 2 Feb 2021 13:59:33 +0000 Subject: [PATCH 110/114] News update for the next release (1.12). --- NEWS | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/NEWS b/NEWS index 658b7cad2..170bacee6 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,36 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Features and Updates +-------------------- + +* Added experimental CRAM 3.1 and 4.0 support. (#929) + + These should not be used for long term data storage as the + specification still needs to be ratified by GA4GH and may be subject + to changes in format. (This is highly likely for 4.0). However it + may be tested using: + + test/test_view -t ref.fa -C -o version=3.1 in.bam -p out31.cram + + For smaller but slower files, try varying the compression profile + with an additional "-o small". Profile choices are fast, normal, + small and archive, and can be applied to all CRAM versions. + +* Added a general filtering syntax for alignment records in SAM/BAM/CRAM + readers. (#1181, #1203) + + An example to find chromosome spanning read-pairs with high mapping + quality: 'mqual >= 30 && mrname != rname' + + To find significant sized deletions: + 'cigar =~ "[0-9]{2}D"' or 'rlen - qlen > 10'.
+ + To report duplicates that aren't part of a "proper pair": + 'flag.dup && !flag.proper_pair' + + More details are in the samtools.1 man page under "FILTER EXPRESSIONS". + * The knet networking code has been removed. It only supported the http and ftp protocols, and a better and safer alternative using libcurl has been available since release 1.3. If you need access to ftp:// and @@ -9,6 +39,117 @@ Noteworthy changes in release a.b * The old htslib/knetfile.h interfaces have been marked as deprecated. Any code still using them should be updated to use hFILE instead. (#1200) +* Added an introspection API for checking some of the capabilities provided + by HTSlib. (#1170) Thanks also to John Marshall for contributions. (#1222) + - `hfile_list_schemes`: returns the number of schemes found + - `hfile_list_plugins`: returns the number of plugins found + - `hfile_has_plugin`: checks if a specific plugin is available + - `hts_features`: returns a bit mask with all available features + - `hts_test_feature`: test if a feature is available + - `hts_feature_string`: return a string summary of enabled features + +* Made performance improvements to `probaln_glocal` method, which + speeds up mpileup BAQ calculations. (#1188) + - Caching of reused loop variables and removal of loop invariants + - Code reordering to remove instruction latency. + - Other refactoring and tidyups. + +* Added a public method for constructing a BAM record from the + component pieces. Thanks to Anders Kaplan. (#1159, #1164) + +* Added two public methods, `sam_parse_cigar` and `bam_parse_cigar`, as part of + a small CIGAR API (#1169, #1182). Thanks to Daniel Cameron for input. (#1147) + +* HTSlib, and the included htsfile program, will now recognise the old + RAZF compressed file format. Note that while the format is detected, + HTSlib is unable to read it. It is recommended that RAZF files are + uncompressed with `gunzip` before using them with HTSlib. Thanks to + John Marshall (#1244); and Matthew J. 
Oldach who reported problems + with uncompressing some RAZF files (samtools/samtools#1387). + +* The S3 plugin now has options to force the address style. It will recognise + the addressing_style and host_bucket entries in the respective aws + .credentials and s3cmd .s3cfg files. There is also a new HTS_S3_ADDRESS_STYLE + environment variable. Details are in the htslib-s3-plugin.7 man file (#1249). + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Added new Makefile targets for the applications that embed HTSlib and + want to run its test suite or clean its generated artefacts. (#1230, #1238) + +* The CRAM codecs are now obtained via the htscodecs submodule, hence + when cloning it is now best to use "git clone --recursive". In an + existing clone, you may use "git submodule update --init" to obtain + the htscodecs submodule checkout. + +* Updated CI test configuration to recurse HTSlib submodules. (#1359) + +* Added Cirrus-CI integration as a replacement for Travis, which was + phased out. (#1175; #1212) + +* Updated the Windows image used by Appveyor to 'Visual Studio 2019'. (#1172; + fixed #1166) + +* Fixed a buglet in configure.ac, exposed by the release 2.70 of autoconf. + Thanks to John Marshall. (#1198) + +* Fixed plugin linking on macOS, to prevent symbol conflict when linking + with a static HTSlib. Thanks to John Marshall. (#1184) + +* Fixed a clang++9 error in `cram_io.h`. Thanks to Pjotr Prins. (#1190) + +* Introduced $(ALL_CPPFLAGS) to allow for more flexibility in setting the + compiler flags. Thanks to John Marshall. (#1187) + +* Added 'fall through' comments to prevent warnings issued by Clang on + intentional fall through case statements, when building with + `-Wextra flag`. Thanks to John Marshall. (#1163) + +* Non-configure builds now define _XOPEN_SOURCE=600 to allow them to work + when the `gcc -std=c99` option is used. Thanks to John Marshall. 
(#1246) + +Bug fixes +--------- + +* Fixed VCF `#CHROM` header parsing to only separate columns at tab characters. + Thanks to Sam Morris for reporting the issue. + (#1237; fixed samtools/bcftools#1408) + +* Fixed a crash reported in `bcf_sr_sort_set`, which expects REF to be present. + (#1204; fixed samtools/bcftools#1361) + +* Fixed a bug in the overlapping logic of mpileup, dealing with iterating over + CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196) + +* Fixed a tabix bug that prevented setting the correct number of lines to be + skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; + fixed #1186) + +* Made `bam_itr_next` an alias for `sam_itr_next`, to prevent it from crashing + when working with htsFile pointers. Thanks to Torbjörn Klatt for + reporting it. (#1180; fixed #1179) + +* Fixed once per outgoing multi-threaded block `bgzf_idx_flush` assertion, to + accommodate situations when a single record could span multiple blocks. + Thanks to `@lacek`. (#1168; fixed samtools/samtools#1328) + +* Fixed assumption of pthread_t being a non-structure, as permitted by POSIX. + Thanks also to John Marshall and Anders Kaplan. (#1167, #1153, #1153) + +* Fixed the minimum offset of a BAI index bin, to account for unmapped reads. + Thanks to John Marshall for spotting the issue. (#1158; fixed #1142) + +* Fixed the CRLF handling in `sam_parse_worker` method. Thanks to + Anders Kaplan. (#1149; fixed #1148) + +* Included unistd.h and erro.h directly in HTSlib files, as opposed to + including them indirectly, via third party code. Thanks to + Andrew Patterson (#1143) and John Marshall (#1145). 
+ + Noteworthy changes in release 1.11 (22nd September 2020) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 6038f97e901cc2be832cc7749200faf375b287e4 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 5 Mar 2021 17:46:17 +0000 Subject: [PATCH 111/114] Fix NEWS typo --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 170bacee6..b70533915 100644 --- a/NEWS +++ b/NEWS @@ -145,7 +145,7 @@ Bug fixes * Fixed the CRLF handling in `sam_parse_worker` method. Thanks to Anders Kaplan. (#1149; fixed #1148) -* Included unistd.h and erro.h directly in HTSlib files, as opposed to +* Included unistd.h and errno.h directly in HTSlib files, as opposed to including them indirectly, via third party code. Thanks to Andrew Patterson (#1143) and John Marshall (#1145). From 061ef366d54b4165777553023a76922b54bf236f Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 11 Mar 2021 10:06:51 +0000 Subject: [PATCH 112/114] Fix a bug where target regions could miss the first record This could happen with e.g. with `bcftools view -t` when the last record in a chromosome block has the same coordinate as the first record in the chromosome block that follows. 
Fixes https://github.com/samtools/bcftools/issues/1441 --- synced_bcf_reader.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index f51723300..10604b16c 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -614,7 +614,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } reader->nbuffer++; - if ( files->require_index==ALLOW_NO_IDX_ && reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; + if ( reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) @@ -638,11 +638,12 @@ static void _reader_shift_buffer(bcf_sr_t *reader) { int i; for (i=2; i<=reader->nbuffer; i++) - if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; + if ( reader->buffer[i]->rid!=reader->buffer[1]->rid || reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; if ( i<=reader->nbuffer ) { // A record with a different position follows, swap it. Because of the reader's logic, // only one such line can be present. + assert( i==reader->nbuffer ); bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp; reader->nbuffer = 1; } From 41e11f96486189fce8f6c7b8064d83476ab7a2bc Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 12 Mar 2021 11:53:23 +0000 Subject: [PATCH 113/114] NEWS item for #1254 --- NEWS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS b/NEWS index b70533915..fd240f132 100644 --- a/NEWS +++ b/NEWS @@ -121,6 +121,10 @@ Bug fixes * Fixed a crash reported in `bcf_sr_sort_set`, which expects REF to be present. (#1204; fixed samtools/bcftools#1361) +* Fixed a bcf synced reader bug when filtering with a region list, and + the first record for a chromosome had the same position as the last + record for the previous chromosome. 
(#1254; fixed samtools/bcftools#1441) + * Fixed a bug in the overlapping logic of mpileup, dealing with iterating over CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196) From 718695931de92f3e5f8659db46d15a16481972a9 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 9 Mar 2021 13:58:37 +0000 Subject: [PATCH 114/114] Improve "CRAM version 4.0 is still draft" wording In particular, shorten to two lines and improve the line break. --- cram/cram_io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 8099a9287..05f7c7b06 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5688,10 +5688,9 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { if (major > 3 || (major == 3 && minor > 0)) { hts_log_warning( - "CRAM version %s is still in draft and is subject to\n" - "change. Please consider this a technology demonstration " - "and do not use for\n" - "long term archival of data.", s); + "CRAM version %s is still a draft and subject to change.\n" + "This is a technology demonstration that should not be " + "used for archival data.", s); } fd->version = major*256 + minor;