From 7981bc9356899ea5dbf076d432e9645ee55df1d6 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 22 Feb 2022 11:21:16 +0000
Subject: [PATCH 01/79] Fix missing parameter in hts_log() calls

The context parameter was omitted, causing the format string
to be used for it instead, and the next one to be used as the
format.  This resulted in not very useful error messages and
possible issues with incorrect interpretation of varargs
parameters.
---
 realn.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/realn.c b/realn.c
index f05db6ac4..5354dee78 100644
--- a/realn.c
+++ b/realn.c
@@ -91,12 +91,12 @@ int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres)
 static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity,
                            const char *type, const bam1_t *b) {
     if (*tg != 'Z') {
-        hts_log(severity, "Incorrect %s tag type (%c) for read %s",
+        hts_log(severity, __func__, "Incorrect %s tag type (%c) for read %s",
                 type, *tg, bam_get_qname(b));
         return -1;
     }
     if (b->core.l_qseq != strlen((const char *) tg + 1)) {
-        hts_log(severity, "Read %s %s tag is wrong length",
+        hts_log(severity, __func__, "Read %s %s tag is wrong length",
                 bam_get_qname(b), type);
         return -1;
     }

From 0d83a7b2d7aa961b5589edbd2a244ddaa6520282 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Wed, 23 Feb 2022 09:54:22 +0000
Subject: [PATCH 02/79] Use a constant printf format string in
 hts_feature_string() [minor]

Refactor this code to avoid a -Wformat-nonliteral warning.
---
 hts.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/hts.c b/hts.c
index ddaa60bfb..30c0c6890 100644
--- a/hts.c
+++ b/hts.c
@@ -168,7 +168,7 @@ const char *hts_test_feature(unsigned int id) {
 // to find the configuration parameters.
 const char *hts_feature_string(void) {
     static char config[1200];
-    const char *fmt=
+    const char *flags=
 
 #ifdef PACKAGE_URL
     "build=configure "
@@ -176,12 +176,6 @@ const char *hts_feature_string(void) {
     "build=Makefile "
 #endif
 
-#ifdef ENABLE_PLUGINS
-    "plugins=yes, plugin-path=%.1000s "
-#else
-    "plugins=no "
-#endif
-
 #ifdef HAVE_LIBCURL
     "libcurl=yes "
 #else
@@ -218,13 +212,21 @@ const char *hts_feature_string(void) {
     "bzip2=no "
 #endif
 
-    "htscodecs=%.40s";
+// "plugins=" must stay at the end as it is followed by "plugin-path="
+#ifdef ENABLE_PLUGINS
+    "plugins=yes";
+#else
+    "plugins=no";
+#endif
 
 #ifdef ENABLE_PLUGINS
-    snprintf(config, sizeof(config), fmt,
-             hts_plugin_path(), htscodecs_version());
+    snprintf(config, sizeof(config),
+             "%s plugin-path=%.1000s htscodecs=%.40s",
+             flags, hts_plugin_path(), htscodecs_version());
 #else
-    snprintf(config, sizeof(config), fmt, htscodecs_version());
+    snprintf(config, sizeof(config),
+             "%s htscodecs=%.40s",
+             flags, htscodecs_version());
 #endif
     return config;
 }

From d5a00db4049a4aff80d332f812863f7e501a0125 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 20 Jan 2022 16:28:18 +0000
Subject: [PATCH 03/79] Add more calls to bgzf_flush_try.

This makes bgzipped SAM, VCF, FASTA and FASTQ start blocks on a new
record (except for the case of a single record being too large to fit
in a single block).

It is a companion PR to #1369
---
 sam.c                 |  75 ++++++++++++++++++++++++++++++++++++++++--
 test/index.vcf.gz.csi | Bin 159 -> 168 bytes
 test/index.vcf.gz.tbi | Bin 213 -> 206 bytes
 vcf.c                 |  13 +++++---
 4 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/sam.c b/sam.c
index 393a3b22e..55851134b 100644
--- a/sam.c
+++ b/sam.c
@@ -2910,6 +2910,10 @@ ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
 // Number of BAM records (writing)
 #define NB 1000
 
+// FIXME: this is too large for ONT data.
+// We should have NB as a maximum for allocation purposes, but bail out
+// early if it grows beyond NM so we have constant memory usage.
+
 struct SAM_state;
 
 // Output job - a block of BAM records
@@ -3444,6 +3448,8 @@ static void *sam_dispatcher_write(void *vp) {
                     i++;
 
                 if (fp->is_bgzf) {
+                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
+                        goto err;
                     if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
                         goto err;
                 } else {
@@ -3483,8 +3489,69 @@ static void *sam_dispatcher_write(void *vp) {
             pthread_mutex_unlock(&fd->lines_m);
         } else {
             if (fp->is_bgzf) {
-                if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size)
-                    goto err;
+                // We keep track of how much in the current block we have
+                // remaining => R.  We look for the last newline in input
+                // [i] to [i+R], backwards => position N.
+                //
+                // If we find a newline, we write out bytes i to N.
+                // We know we cannot fit the next record in this bgzf block,
+                // so we flush what we have and copy input N to i+R into
+                // the start of a new block, and recompute a new R for that.
+                //
+                // If we don't find a newline (i==N) then we cannot extend
+                // the current block at all, so flush whatever is in it now
+                // if it ends on a newline.
+                // We still copy i(==N) to i+R to the next block and
+                // continue as before with a new R.
+                //
+                // The only exception on the flush is when we run out of
+                // data in the input.  In that case we skip it as we don't
+                // yet know if the next record will fit.
+                //
+                // Both conditions share the same code here:
+                // - Look for newline (pos N)
+                // - Write i to N (which maybe 0)
+                // - Flush if block ends on newline and not end of input
+                // - write N to i+R
+
+                int i = 0;
+                BGZF *fb = fp->fp.bgzf;
+                while (i < gl->data_size) {
+                    // remaining space in block
+                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
+                    int eod = 0;
+                    if (R > gl->data_size-i)
+                        R = gl->data_size-i, eod = 1;
+
+                    // Find last newline in input data
+                    int N = i + R;
+                    while (--N > i) {
+                        if (gl->data[N] == '\n')
+                            break;
+                    }
+
+                    if (N != i) {
+                        // Found a newline
+                        N++;
+                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
+                            goto err;
+                    }
+
+                    // Flush bgzf block
+                    int b_off = fb->block_offset;
+                    if (!eod && b_off &&
+                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
+                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
+                            goto err;
+
+                    // Copy from N onwards into next block
+                    if (i+R > N)
+                        if (bgzf_write(fb, &gl->data[N], i+R - N)
+                            != i+R - N)
+                            goto err;
+
+                    i = i+R;
+                }
             } else {
                 if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
                     goto err;
@@ -4348,6 +4415,8 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
             if (sam_format1(h, b, &fp->line) < 0) return -1;
             kputc('\n', &fp->line);
             if (fp->is_bgzf) {
+                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
+                    return -1;
                 if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
             } else {
                 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
@@ -4387,6 +4456,8 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
         if (fastq_format1(fp->state, b, &fp->line) < 0)
             return -1;
         if (fp->is_bgzf) {
+            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
+                return -1;
             if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
                 return -1;
         } else {
diff --git a/test/index.vcf.gz.csi b/test/index.vcf.gz.csi
index 644832d838ef520e66657129e63c51fda22d1cfe..250339624cc84613d53e0ad22fa389dfecf655ac 100644
GIT binary patch
delta 145
zcmV;C0B--E0jL3gABzYC000000RIL6LPG)oivVrTu?@mN5Jb^;2H^-IDu4zsEfAR$
ziDD=Nle++lpa?>mLE;XbeY#0TyMMH^r%N9;SYeL_A-S}J7M-K+V3F|ozVug&w_Dit
z(O>3zK5`~h@(pf7p2LKVWAw7K<e#GR_t%hlm}I%GL*-@($=~Yw0>fxspOJqcj@&^_

literal 159
zcmb2|=3rp}f&Xj_PR>jWO$^0+-%_3=CnO{=C#5Q+F(tWaoMjMYW-L=MOEE|?FiA*g
z^Q!huN^D>h_}^;Ka#oak;dkQ&O;>{jvLrUHV4oRiDtR#7Wm5Kio`o0Rx=i2So2Gt4
tda~fB9*-<Zi9;2V8)tOt)z4(KW?+~(m3f0TBLjmxniHfMn87Xr5dhsAFUSA@

diff --git a/test/index.vcf.gz.tbi b/test/index.vcf.gz.tbi
index 4d6e997815ffcbd270eb0eed118eaeb0cbb666a5..e9ab7b60d16f6bb7c4c6291de47d22b533bb34d7 100644
GIT binary patch
literal 206
zcmb2|=3rp}f&Xj_PR>jW8yVi-+{k;#K*aUp>wqPTl!Om3sVHA)mD<P<ZNSEz!XUOF
znZs+EV^^~3Hztn4_~Iq=mtTp$zuZ|q%z?GVv*B(E@0OA+zdv67I8CdRdBfYq{-5s5
z{;_*g`O0TAS3$um`$)#?zI$dbt-fjCa{2DsIr~puZr!;UBzwQG=F^;SyS8jUSSc%3
m9cx)X@%hFVD^|VApZSULuMm%Eu`&aLJerH88JNKy01*Jp!dIOD

literal 213
zcmb2|=3rp}f&Xj_PR>jWI~d;H+{k;#K%n*E^8f|Kz*gRfh7=a7AZ079Z3&E3*A^^U
zn9SiNmca99j{m_p2E~8sk@Jh}cG|^md+j^JW7RL=NrtDNY_ac+GF|;}m38##)!Sa*
zythaDd}`wPskQA{YZR5B;NRrJ^<`^v^Pm0RbouI+z0r~NTK>|<<XnK@Pv2qvt0vcf
u-cft|?Bng{yEgp`n0MQqUrFiTw0Y*tH;zdaD>E?2qd8offf?)%5CH(~Csr!}

diff --git a/vcf.c b/vcf.c
index 18c97662a..6e3f05388 100644
--- a/vcf.c
+++ b/vcf.c
@@ -2226,10 +2226,12 @@ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
     }
     while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
     int ret;
-    if ( fp->format.compression!=no_compression )
+    if ( fp->format.compression!=no_compression ) {
         ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
-    else
+        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
+    } else {
         ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
+    }
     free(htxt.s);
     return ret<0 ? -1 : 0;
 }
@@ -3401,10 +3403,13 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
     fp->line.l = 0;
     if (vcf_format1(h, v, &fp->line) != 0)
         return -1;
-    if ( fp->format.compression!=no_compression )
+    if ( fp->format.compression!=no_compression ) {
+        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
+            return -1;
         ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
-    else
+    } else {
         ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
+    }
 
     if (fp->idx) {
         int tid;

From 5ccdc356d34aacc857937270fa938ce376120b8f Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 24 Jan 2022 15:51:22 +0000
Subject: [PATCH 04/79] Fix excessive memory used by multi-threaded SAM output
 on long reads.

The previous code here buffered up 1000 BAM records before dispatching
a job to convert to SAM.  We now buffer up either 1000 BAM records or
(estimated) ~240KB of encoded SAM, whichever is hit first.

Note these figures were previously written in macros NB and NM, which
have been replaced with more meaningful names.

I tested it with data having an average length of 148bp (Illumina),
15949bp (ONT) and 13441bp (PB CCS), using the GIAB HG002 chr1 public data
sets.  Maximum lengths were 148, 1044875 and 19288 respectively.

/usr/bin/time maxresident (KB) / real time figures, before:

               Illumina              ONT               PB
    -@8     43,128/2m43   6,040,568/1m39   1,593,932/1m15
    -@16    70,880/1m42   8,710,132/1m08   3,026,604/0m52
    -@32    70,272/1m29  11,940,036/0m55   5,722,952/0m47
    -@64   190,584/1m21  17,007,840/0m56  10,835,512/0m48

After:

               Illumina              ONT               PB
    -@16    50,208/1m36     696,276/1m09      63,496/0m57
    -@32    86,044/1m21   1,054,524/0m53     109,696/0m44
    -@64   149,024/1m24   1,616,720/0m55     195,676/0m48

The effect on memory (KB) is vast, although it's still a bit higher on
ONT.  This is probably related to the maximum lengths being used
and the reuse of BAM structs (never shrinking them) rather than the
average, so a long tail on the distribution causes memory growth.

We could address that in later updates, but this is still a huge
improvement.

Obviously as we get to very long records, we'll be dispatching very
frequently (maybe every alignment), but I don't yet know how inefficient
commonly (maybe every alignment), but I don't yet know how inefficient
the threading becomes then.  Memory usage will grow as we cannot store
half an alignment, but it won't explode so fast.  We may wish to try
larger values of SAM_NBYTES, but note I tried 100KB to 1MB and this
came out fairly optimal on CPU so there was little reason to trade
more memory for CPU.
---
 sam.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/sam.c b/sam.c
index 55851134b..b0ca2e974 100644
--- a/sam.c
+++ b/sam.c
@@ -2906,13 +2906,10 @@ ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
  * SAM threading
  */
 // Size of SAM text block (reading)
-#define NM 240000
-// Number of BAM records (writing)
-#define NB 1000
+#define SAM_NBYTES 240000
 
-// FIXME: this is too large for ONT data.
-// We should have NB as a maximum for allocation purposes, but bail out
-// early if it grows beyond NM so we have constant memory usage.
+// Number of BAM records (writing, up to NB_mem in size)
+#define SAM_NBAM 1000
 
 struct SAM_state;
 
@@ -2922,7 +2919,8 @@ typedef struct sp_bams {
     int serial;
 
     bam1_t *bams;
-    int nbams, abams; // used and alloc
+    int nbams, abams; // used and alloc for bams[] array
+    size_t bam_mem;   // very approximate total size
 
     struct SAM_state *fd;
 } sp_bams;
@@ -3173,6 +3171,7 @@ static void *sam_parse_worker(void *arg) {
             goto err;
         }
         gb->nbams = 0;
+        gb->bam_mem = 0;
     }
     gb->serial = gl->serial;
     gb->next = NULL;
@@ -3225,6 +3224,7 @@ static void *sam_parse_worker(void *arg) {
             cleanup_sp_lines(gl);
             goto err;
         }
+
         cp = nl;
         i++;
     }
@@ -3294,7 +3294,7 @@ static void *sam_dispatcher_read(void *vp) {
             l = calloc(1, sizeof(*l));
             if (!l)
                 goto err;
-            l->alloc = NM;
+            l->alloc = SAM_NBYTES;
             l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
             if (!l->data) {
                 free(l);
@@ -3305,11 +3305,11 @@ static void *sam_dispatcher_read(void *vp) {
         }
         l->next = NULL;
 
-        if (l->alloc < line_frag+NM/2) {
-            char *rp = realloc(l->data, line_frag+NM/2 +8);
+        if (l->alloc < line_frag+SAM_NBYTES/2) {
+            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
             if (!rp)
                 goto err;
-            l->alloc = line_frag+NM/2;
+            l->alloc = line_frag+SAM_NBYTES/2;
             l->data = rp;
         }
         memcpy(l->data, line.s, line_frag);
@@ -4369,16 +4369,18 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
                     fd->bams = gb->next;
                     gb->next = NULL;
                     gb->nbams = 0;
+                    gb->bam_mem = 0;
                     pthread_mutex_unlock(&fd->lines_m);
                 } else {
                     pthread_mutex_unlock(&fd->lines_m);
                     if (!(gb = calloc(1, sizeof(*gb)))) return -1;
-                    if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) {
+                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
                         free(gb);
                         return -1;
                     }
                     gb->nbams = 0;
-                    gb->abams = NB;
+                    gb->abams = SAM_NBAM;
+                    gb->bam_mem = 0;
                     gb->fd = fd;
                     fd->curr_idx = 0;
                     fd->curr_bam = gb;
@@ -4387,11 +4389,11 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
 
             if (!bam_copy1(&gb->bams[gb->nbams++], b))
                 return -2;
+            gb->bam_mem += b->l_data + sizeof(*b);
 
             // Dispatch if full
-            if (gb->nbams == NB) {
+            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
                 gb->serial = fd->serial++;
-                //fprintf(stderr, "Dispatch another %d bams\n", NB);
                 pthread_mutex_lock(&fd->command_m);
                 if (fd->errcode != 0) {
                     pthread_mutex_unlock(&fd->command_m);

From b92aac55b18e8dff8847707352b8b474519d97fb Mon Sep 17 00:00:00 2001
From: Alexandr Chernov <kstchernov@gmail.com>
Date: Fri, 25 Feb 2022 15:50:15 +0100
Subject: [PATCH 05/79] Adjusted types sam.h to avoid compiler warnings

---
 htslib/sam.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/htslib/sam.h b/htslib/sam.h
index ea585a45a..45dd51f0a 100644
--- a/htslib/sam.h
+++ b/htslib/sam.h
@@ -1524,7 +1524,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
         ++s;
     } else if (type == 'B') {
         uint8_t sub_type = *(s++);
-        int sub_type_size;
+        unsigned sub_type_size;
 
         // or externalise sam.c's aux_type2size function?
         switch (sub_type) {
@@ -1547,7 +1547,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
             goto bad_aux;
         n = le_to_u32(s);
         s += 4; // now points to the start of the array
-        if ((end - s) / sub_type_size < n)
+        if ((size_t)(end - s) / sub_type_size < n)
             goto bad_aux;
         r |= kputsn_("B:", 2, ks) < 0;
         r |= kputc(sub_type, ks) < 0; // write the type

From 8caa1db4cfd8dc626a909f27a3051cb2c7b44a1a Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 23 Feb 2022 14:13:29 +0000
Subject: [PATCH 06/79] Add stricter format string checking to a gcc CI build

Following a tidy up in 7981bc93 and 0d83a7b2, all format strings
in HTSlib are now literals.  Prevent non-literal strings from
coming back by using -Wformat=2 in one of the CI tests, which
enables -Wformat-nonliteral.  This has to be a gcc test as gcc
excludes functions that take a va_list from the format-nonliteral
warning while clang doesn't, and we need to be able to pass
nonliteral format strings to these functions.

While this might be slightly inconvenient for developers, any
annoyances are far outweighed by being able to automatically
detect a class of nasty and otherwise difficult to spot bugs.
---
 .cirrus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 83e36af9c..1ba352063 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -72,7 +72,7 @@ gcc_task:
        USE_CONFIG: no
     - environment:
        USE_CONFIG: yes
-       CFLAGS: -std=c99 -pedantic
+       CFLAGS: -std=c99 -pedantic -Wformat=2
        USE_LIBDEFLATE: yes
 
   << : *LIBDEFLATE

From a2db7ed2d508991e4adf289622e4a58234a0aeb0 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Fri, 1 Apr 2022 09:14:51 +0100
Subject: [PATCH 07/79] Improve coordinate parsing, particularly with tabix.
 (PR #1411)

Double check the file coordinates are 1-based unless tabix -0 is
specified.  We treat a 0 coordinate as a warning now.  The
documentation is also more explicit that -0 implies a half-open
coordinate system.

Also improve region parsing.  The code supports regions such as
"x"/"x:" (all of "x"), "x:10-20", "x:-20" (up to pos 20) and "x:20-"
(position 20 onwards).  However as genome coordinates are 1-based and
internally we have 0-based, we subtract one during parsing.  The
"x:-20" is done by negative value detection, but this was also
triggered with the "x:0" region (treated as "up to 1").

Also, illegal regions such as "x:-10-20" were treated as "x:-20".
This is now a hard error.

Fixes #1409
---
 hts.c   | 9 ++++++---
 tabix.1 | 4 ++--
 tbx.c   | 6 +++++-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/hts.c b/hts.c
index 30c0c6890..d06c10891 100644
--- a/hts.c
+++ b/hts.c
@@ -3713,14 +3713,17 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
     char *hyphen;
     *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
     if (*beg < 0) {
+        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
+            // User specified zero, but we're 1-based.
+            hts_log_error("Coordinates must be > 0");
+            return NULL;
+        }
         if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
             // interpret chr:-100 as chr:1-100
             *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
             *beg = 0;
             return s_end;
-        } else if (*hyphen == '-') {
-            *beg = 0;
-        } else {
+        } else if (*beg < -1) {
             hts_log_error("Unexpected string \"%s\" after region", hyphen);
             return NULL;
         }
diff --git a/tabix.1 b/tabix.1
index e4bf4b7cb..27428bd52 100644
--- a/tabix.1
+++ b/tabix.1
@@ -81,8 +81,8 @@ greater than that, you will need to use a CSI index.
 .SH INDEXING OPTIONS
 .TP 10
 .B -0, --zero-based
-Specify that the position in the data file is 0-based (e.g. UCSC files)
-rather than 1-based.
+Specify that the position in the data file is 0-based half-open
+(e.g. UCSC files) rather than 1-based.
 .TP
 .BI "-b, --begin " INT
 Column of start chromosomal position. [4]
diff --git a/tbx.c b/tbx.c
index 0d486ad9c..f0310a257 100644
--- a/tbx.c
+++ b/tbx.c
@@ -107,7 +107,11 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
                 if ( s==line+b ) return -1; // expected int
                 if (!(conf->preset&TBX_UCSC)) --intv->beg;
                 else ++intv->end;
-                if (intv->beg < 0) intv->beg = 0;
+                if (intv->beg < 0) {
+                    hts_log_warning("Coordinate <= 0 detected. "
+                                    "Did you forget to use the -0 option?");
+                    intv->beg = 0;
+                }
                 if (intv->end < 1) intv->end = 1;
             } else {
                 if ((conf->preset&0xffff) == TBX_GENERIC) {

From a1cd8b8335f554d5be71fa0a75d304bbe3a8a682 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Tue, 29 Mar 2022 16:04:56 +0100
Subject: [PATCH 08/79] Detect and fix invalid Type=Flag && Number!=0 header
 definitions.

Invalid definitions are fixed internally and warnings such as

    [W::bcf_hdr_register_hrec] The definition of Flag "INFO/SNP" is invalid, forcing Number=0

are printed so that downstream analyses can work (e.g. `bcftools merge`).
However, output VCF headers are not fixed.

This could go one step further and also modify the headers.

See also https://github.com/samtools/bcftools/issues/1685
---
 vcf.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vcf.c b/vcf.c
index 6e3f05388..f868cc738 100644
--- a/vcf.c
+++ b/vcf.c
@@ -809,6 +809,12 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
                 *hrec->key == 'I' ? "An" : "A", hrec->key);
             var = BCF_VL_VAR;
         }
+        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
+        {
+            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
+            var = BCF_VL_FIXED;
+            num = 0;
+        }
     }
     uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
                      (var & 0xf) << 8 |

From 9bcb2d2e3c0a84b1ba6c0e7cc89a4e82c055f9fa Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 1 Apr 2022 14:22:54 +0100
Subject: [PATCH 09/79] Update to htscodecs v1.2.2

 - The name tokeniser now validates the stored length in the data
   stream matches the decoded length.
 - Make data types consistent in rans_compress_bound4x16.
 - Fix an endless loop in arith_dynamic and rans4x16pr involving
   X_STRIPE with 0 stripes.
 - Prevent memcpy(dest, NULL, 0) calls.
 - Reject attempts to duplicate non-existent name tokeniser
   streams.
 - Fix possible use of uninitialised memory in
   rans_uncompress_O1_4x16.
 - Improve error detection from fqzcomp's read_array function.
 - Reject fqzcomp parameters with inconsistent "sel" parameters.
 - Fix uninitialised access to element 4095 in r4x8 O0 arrays.
---
 htscodecs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htscodecs b/htscodecs
index c6a459a44..1395d7306 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit c6a459a4488624d5e4b2d4d642febbd55a78a9b1
+Subproject commit 1395d730651fdfa39cd916be3b3ef4dd9b1ab895

From 0785d8565b76f848d338b590ac01b03eae1d2ccf Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Tue, 5 Apr 2022 11:10:44 +0100
Subject: [PATCH 10/79] Prevent `@p masks earlier declaration` warning
 [trivial]

---
 test/test.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test.pl b/test/test.pl
index ff5601b0d..9912a1d3e 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -614,7 +614,7 @@ sub test_view
 
             ## Experimental CRAM 4.0 support.
             # SAM -> CRAM40 -> SAM
-            my @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam"
+            @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam"
                 ? (qw/fast normal small archive/)
                 : (qw/archive/);
             foreach my $profile (@p) {

From fb226846194c063744ea3efbe5cd4a490f9edf89 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Tue, 5 Apr 2022 11:42:54 +0100
Subject: [PATCH 11/79] Use constant srand() seed for repeatability

Add -s, --random-seed option and a constant default, and seed srand()
with it. Currently test_bcf_sr_sort() is the only user of rand(),
which it uses to seed test-bcf-sr.pl's random number generator.
---
 test/test.pl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/test.pl b/test/test.pl
index 9912a1d3e..7a396e22a 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -32,6 +32,7 @@
 use IO::Handle;
 
 my $opts = parse_params();
+srand($$opts{seed});
 
 test_bgzip($opts, 0);
 test_bgzip($opts, 4);
@@ -79,6 +80,7 @@ sub error
         "Usage: test.pl [OPTIONS]\n",
         "Options:\n",
         "   -r, --redo-outputs              Recreate expected output files.\n",
+        "   -s, --random-seed <int>         Initialise rand() with a different seed.\n",
         "   -t, --temp-dir <path>           When given, temporary files will not be removed.\n",
         "   -f, --fail-fast                 Fail-fast mode: exit as soon as a test fails.\n",
         "   -h, -?, --help                  This help message.\n",
@@ -104,12 +106,13 @@ sub safe_tempdir
 
 sub parse_params
 {
-    my $opts = { keep_files=>0, nok=>0, nfailed=>0 };
+    my $opts = { keep_files=>0, nok=>0, nfailed=>0, seed=>42 };
     my $help;
     Getopt::Long::Configure('bundling');
     my $ret = GetOptions (
             't|temp-dir:s' => \$$opts{keep_files},
             'r|redo-outputs' => \$$opts{redo_outputs},
+            's|random-seed=i' => \$$opts{seed},
             'f|fail-fast' => \$$opts{fail_fast},
             'h|?|help' => \$help
             );
@@ -974,7 +977,7 @@ sub test_bcf_sr_sort
     my ($opts, %args) = @_;
     for (my $i=0; $i<10; $i++)
     {
-        my $seed = int(rand(time));
+        my $seed = int(rand(100000000));
         my $test = 'test-bcf-sr';
         my $cmd  = "$$opts{path}/test-bcf-sr.pl -t $$opts{tmp} -s $seed";
         print "$test:\n";

From 2eddc214f26504e6d0308b654ebcf705afb6b6af Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Fri, 8 Apr 2022 15:59:52 +0100
Subject: [PATCH 12/79] Permit BAM headers between 2GB and 4GB in size once
 more.

This isn't permitted by the BAM specification, but was accepted by
earlier htslib releases.  62f9909 added code to check the maximum
length.  This now gives a warning at 2GB and a hard failure at 4GB.

Fixes #1420.  Fixes samtools/samtools#1613
---
 sam.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sam.c b/sam.c
index b0ca2e974..95e8b1d42 100644
--- a/sam.c
+++ b/sam.c
@@ -339,17 +339,23 @@ int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
 
     if (h->hrecs) {
         if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
-        if (hdr_ks.l > INT32_MAX) {
+        if (hdr_ks.l > UINT32_MAX) {
             hts_log_error("Header too long for BAM format");
             free(hdr_ks.s);
             return -1;
+        } else if (hdr_ks.l > INT32_MAX) {
+            hts_log_warning("Header too long for BAM specification (>2GB)");
+            hts_log_warning("Output file may not be portable");
         }
         text = hdr_ks.s;
         l_text = hdr_ks.l;
     } else {
-        if (h->l_text > INT32_MAX) {
+        if (h->l_text > UINT32_MAX) {
             hts_log_error("Header too long for BAM format");
             return -1;
+        } else if (h->l_text > INT32_MAX) {
+            hts_log_warning("Header too long for BAM specification (>2GB)");
+            hts_log_warning("Output file may not be portable");
         }
         text = h->text;
         l_text = h->l_text;

From e51f72f0a025ffa03806a5fd6559e03b4c6438c9 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 12 Apr 2022 12:16:01 +0100
Subject: [PATCH 13/79] Permit MM tags containing "." and "?" suffixes.

These define explicit vs implicit coordinates.  They are now part of
the MM specification, but we don't do anything with this data yet.
This PR simply permits them to be parsed without choking, and ignores
the additional markup.  A subsequent PR will improve on this.

Fixes #1418
---
 sam.c                          |  20 ++++++-
 test/base_mods/MM-explicit.out | 100 +++++++++++++++++++++++++++++++++
 test/base_mods/MM-explicit.sam |  13 +++++
 test/base_mods/base-mods.tst   |   7 ++-
 4 files changed, 134 insertions(+), 6 deletions(-)
 create mode 100644 test/base_mods/MM-explicit.out
 create mode 100644 test/base_mods/MM-explicit.sam

diff --git a/sam.c b/sam.c
index 95e8b1d42..04f3435f7 100644
--- a/sam.c
+++ b/sam.c
@@ -6162,7 +6162,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
     int mod_num = 0;
     while (*cp) {
         for (; *cp; cp++) {
-            // cp should be [ACGTNU][+-][^,]*(,\d+)*;
+            // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*;
             unsigned char btype = *cp++;
 
             if (btype != 'A' && btype != 'C' &&
@@ -6182,18 +6182,32 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
             char *ms = cp, *me; // mod code start and end
             char *cp_end = NULL;
             int chebi = 0;
-            if (isdigit(*cp)) {
+            if (isdigit_c(*cp)) {
                 chebi = strtol(cp, &cp_end, 10);
                 cp = cp_end;
                 ms = cp-1;
             } else {
-                while (*cp && *cp != ',' && *cp != ';')
+                while (*cp && isalpha_c(*cp))
                     cp++;
                 if (*cp == '\0')
                     return -1;
             }
             me = cp;
 
+            // Optional explicit vs implicit marker.
+            // Right now we ignore this field.  A proper API for
+            // querying it will follow later.
+            if (*cp == '.') {
+                // implicit = 1;
+                cp++;
+            } else if (*cp == '?') {
+                // implicit = 0;
+                cp++;
+            } else if (*cp != ',' && *cp != ';') {
+                // parse error
+                return -1;
+            }
+
             long delta;
             int n = 0; // nth symbol in a multi-mod string
             int stride = me-ms;
diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out
new file mode 100644
index 000000000..05e2828c4
--- /dev/null
+++ b/test/base_mods/MM-explicit.out
@@ -0,0 +1,100 @@
+0	A	
+1	T	
+2	C	
+3	A	
+4	T	
+5	C	
+6	A	
+7	T	
+8	T	
+9	C	C+m200 C+h10 
+10	C	C+m50 C+h170 
+11	T	
+12	A	
+13	C	
+14	C	C+m160 C+h20 
+15	G	
+16	C	
+17	T	
+18	A	
+19	T	
+20	A	
+21	G	
+22	C	
+23	C	
+24	T	
+---
+9	C	C+m200 C+h10 
+10	C	C+m50 C+h170 
+14	C	C+m160 C+h20 
+
+===
+
+0	A	
+1	T	
+2	C	
+3	A	
+4	T	
+5	C	
+6	A	
+7	T	
+8	T	
+9	C	C+m200 C+h10 
+10	C	C+m50 C+h170 
+11	T	
+12	A	
+13	C	C+m10 C+h5 
+14	C	C+m160 C+h20 
+15	G	
+16	C	C+m10 C+h5 
+17	T	
+18	A	
+19	T	
+20	A	
+21	G	
+22	C	
+23	C	
+24	T	
+---
+9	C	C+m200 C+h10 
+10	C	C+m50 C+h170 
+13	C	C+m10 C+h5 
+14	C	C+m160 C+h20 
+16	C	C+m10 C+h5 
+
+===
+
+0	A	
+1	T	
+2	C	
+3	A	
+4	T	
+5	C	
+6	A	
+7	T	
+8	T	
+9	C	C+m200 C+h10 
+10	C	C+h170 
+11	T	
+12	A	
+13	C	C+h5 
+14	C	C+m160 C+h20 
+15	G	
+16	C	C+h5 
+17	T	
+18	A	
+19	T	
+20	A	
+21	G	
+22	C	
+23	C	
+24	T	
+---
+9	C	C+m200 C+h10 
+10	C	C+h170 
+13	C	C+h5 
+14	C	C+m160 C+h20 
+16	C	C+h5 
+
+===
+
diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam
new file mode 100644
index 000000000..e4e37103d
--- /dev/null
+++ b/test/base_mods/MM-explicit.sam
@@ -0,0 +1,13 @@
+@CO	Testing explicit vs implicit base modifications.
+@CO	This covers the case where a lack of a signal could be either
+@CO	implicitly assumed to be no-mod (default) or assumed to be
+@CO	unchecked and require an explicit statement to indicate it was
+@CO	looked at and no base modification was observed.
+@CO	
+@CO	  0  1   23  45 6     78
+@CO	ATCATCATTCCTACCGCTATAGCCT
+@CO	  .  .   m.  .m .     ..  m
+@CO	  ?  ?   .h  .. .     ??  h
+r1	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh,2,0,1;	Ml:B:C,200,10,50,170,160,20
+r2	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh?,2,0,0,0,0;	Ml:B:C,200,10,50,170,10,5,160,20,10,5
+r3	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0;	Ml:B:C,200,160,10,170,5,20,5
diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst
index 865a539c7..c64128ee2 100644
--- a/test/base_mods/base-mods.tst
+++ b/test/base_mods/base-mods.tst
@@ -33,9 +33,10 @@
 # samtools binary. This can be useful for testing older versions.
 
 # Test files from SAM spec
-P MM-chebi.out  $test_mod MM-chebi.sam
-P MM-double.out $test_mod MM-double.sam
-P MM-multi.out  $test_mod MM-multi.sam
+P MM-chebi.out     $test_mod MM-chebi.sam
+P MM-double.out    $test_mod MM-double.sam
+P MM-multi.out     $test_mod MM-multi.sam
+P MM-explicit.out  $test_mod MM-explicit.sam
 
 # Pileup testing
 P MM-pileup.out $pileup_mod < MM-pileup.sam

From e6065cb6b777333077b1ee6e0e7e50974dd32739 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 27 Apr 2022 10:30:21 +0100
Subject: [PATCH 14/79] Improve error messages for CRAM reference mismatches.

If the user specifies the wrong reference, the CRAM slice header
MD5sum checks fail.  We now report the SQ line M5 string too so it is
possible to validate against the whole chr in the ref.fa file.

The error message has also been improved to report the reference name
instead of #num.

Finally, we now hint at the likely cause, which counters the
misleading samtools-supplied error of a "truncated or corrupt" file.

See samtools/samtools#1640.
---
 cram/cram_decode.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/cram/cram_decode.c b/cram/cram_decode.c
index b352fc633..51f1b765c 100644
--- a/cram/cram_decode.c
+++ b/cram/cram_decode.c
@@ -2423,10 +2423,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
         if ((!s->ref && s->hdr->ref_base_id < 0)
             || memcmp(digest, s->hdr->md5, 16) != 0) {
             char M[33];
-            hts_log_error("MD5 checksum reference mismatch at #%d:%d-%d",
-                          ref_id, s->ref_start, s->ref_end);
-            hts_log_error("CRAM: %s", md5_print(s->hdr->md5, M));
-            hts_log_error("Ref : %s", md5_print(digest, M));
+            const char *rname = sam_hdr_tid2name(sh, ref_id);
+            if (!rname) rname="?"; // cannot happen normally
+            hts_log_error("MD5 checksum reference mismatch at %s:%d-%d",
+                          rname, s->ref_start, s->ref_end);
+            hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
+            hts_log_error("Ref   : %s", md5_print(digest, M));
+            kstring_t ks = KS_INITIALIZE;
+            if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
+                hts_log_error("@SQ M5: %s", ks.s);
+            hts_log_error("Please check the reference given is correct");
+            ks_free(&ks);
             return -1;
         }
     }

From d7cc10de075735d07eb8da0538cbdc0f331f7bd1 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 27 Apr 2022 11:39:01 +0100
Subject: [PATCH 15/79] Make test-logging less pedantic.

It's trying to spot error messages starting with lowercase letters,
but in doing so forbids things like "@SQ" as it's not capital.
---
 test/test-logging.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test-logging.pl b/test/test-logging.pl
index 1040b0e47..2f22560b5 100755
--- a/test/test-logging.pl
+++ b/test/test-logging.pl
@@ -33,7 +33,7 @@ sub check_log_message
   my ($message, $filename, $line_num) = @_;
   $log_message_count++;
 
-  unless ($message =~ /^\"([A-Z]|%s)/)
+  unless ($message =~ /^\"([A-Z!-@]|%s)/)
   {
     print "$filename line $line_num:\n";
     print "Log message should begin with a capital letter: $message.\n";

From e868dea9ebd9b5d4d47bbeb1b77cd20267abf00e Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 3 Nov 2021 14:45:14 +0000
Subject: [PATCH 16/79] Improve support for MM .? modifiers.

The previous commit permitted these to exist, but didn't make the data
available to the caller.

This extends the API with additional queries to distinguish the
specifics about the modification types present.
---
 htslib/sam.h                     |  38 ++++++++++++
 sam.c                            |  53 ++++++++++++++--
 test/base_mods/MM-chebi.out      |   1 +
 test/base_mods/MM-double.out     |   1 +
 test/base_mods/MM-explicit-x.out | 103 +++++++++++++++++++++++++++++++
 test/base_mods/MM-explicit.out   |   3 +
 test/base_mods/MM-explicit.sam   |  22 +++++--
 test/base_mods/MM-multi.out      |   2 +
 test/base_mods/base-mods.tst     |   9 +--
 test/test_mod.c                  |  97 ++++++++++++++++++++++++++---
 10 files changed, 309 insertions(+), 20 deletions(-)
 create mode 100644 test/base_mods/MM-explicit-x.out

diff --git a/htslib/sam.h b/htslib/sam.h
index 45dd51f0a..a6e64fbb2 100644
--- a/htslib/sam.h
+++ b/htslib/sam.h
@@ -2271,6 +2271,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
                      hts_base_mod *mods, int n_mods);
 
 
+/// Returns data about a specific modification type for the alignment record.
+/**
+ * @param state      The base modification state pointer.
+ * @param code       Modification code.  If positive this is a character code,
+ *                   if negative it is a -ChEBI code.
+ *
+ * @param strand     Boolean for top (0) or bottom (1) strand
+ *                   Returned.
+ * @param implicit   Boolean for whether unlisted positions should be
+ *                   implicitly assumed to be unmodified, or require an
+ *                   explicit score and should be considered as unknown.
+ *                   Returned.
+ * @param canonical  Canonical base type associated with this modification
+ *                   Returned.
+ *
+ * @return 0 on success or -1 if not found.  The strand, implicit and canonical
+ * fields are filled out if passed in as non-NULL pointers.
+ */
+HTSLIB_EXPORT
+int bam_mods_query_type(hts_base_mod_state *state, int code,
+                        int *strand, int *implicit, char *canonical);
+
+/// Returns the list of base modification codes provided for this
+/// alignment record as an array of character codes (+ve) or ChEBI numbers
+/// (negative).
+/**
+ * @param state      The base modification state pointer.
+ * @param ntype      Filled out with the number of array elements
+ *                   returned.
+ *
+ * @return the type array, with *ntype filled out with the size.
+ *         The array returned should not be freed.
+ *         It is a valid pointer until the state is freed using
+ *         hts_base_mod_state_free().
+ */
+HTSLIB_EXPORT
+int *bam_mods_recorded(hts_base_mod_state *state, int *ntype);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/sam.c b/sam.c
index 04f3435f7..dd1b7d9fd 100644
--- a/sam.c
+++ b/sam.c
@@ -6092,6 +6092,7 @@ struct hts_base_mod_state {
     char *MMend[MAX_BASE_MOD];  // end of pos-delta string
     uint8_t *ML[MAX_BASE_MOD];  // next qual
     int MLstride[MAX_BASE_MOD]; // bytes between quals for this type
+    int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified?
     int seq_pos;                // current position along sequence
     int nmods;                  // used array size (0 to MAX_BASE_MOD-1).
 };
@@ -6160,6 +6161,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
 
     char *cp = (char *)mm+1;
     int mod_num = 0;
+    int implicit = 1;
     while (*cp) {
         for (; *cp; cp++) {
             // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*;
@@ -6192,16 +6194,15 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
                 if (*cp == '\0')
                     return -1;
             }
+
             me = cp;
 
-            // Optional explicit vs implicit marker.
-            // Right now we ignore this field.  A proper API for
-            // querying it will follow later.
+            // Optional explicit vs implicit marker
             if (*cp == '.') {
-                // implicit = 1;
+                // default is implicit = 1;
                 cp++;
             } else if (*cp == '?') {
-                // implicit = 0;
+                implicit = 0;
                 cp++;
             } else if (*cp != ',' && *cp != ';') {
                 // parse error
@@ -6257,6 +6258,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
                 state->strand   [mod_num] = (strand == '-');
                 state->canonical[mod_num] = btype;
                 state->MLstride [mod_num] = stride;
+                state->implicit [mod_num] = implicit;
 
                 state->MMcount  [mod_num] = delta;
                 if (b->core.flag & BAM_FREVERSE) {
@@ -6473,3 +6475,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
 
     return r;
 }
+
+/*
+ * Returns the list of base modification codes provided for this
+ * alignment record as an array of character codes (+ve) or ChEBI numbers
+ * (negative).
+ *
+ * Returns the array, with *ntype filled out with the size.
+ *         The array returned should not be freed.
+ *         It is a valid pointer until the state is freed using
+ *         hts_base_mod_state_free().
+ */
+int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) {
+    *ntype = state->nmods;
+    return state->type;
+}
+
+/*
+ * Returns data about a specific modification type for the alignment record.
+ * Code is either positive (eg 'm') or negative for ChEBI numbers.
+ *
+ * Return 0 on success or -1 if not found.  The strand, implicit and canonical
+ * fields are filled out if passed in as non-NULL pointers.
+ */
+int bam_mods_query_type(hts_base_mod_state *state, int code,
+                        int *strand, int *implicit, char *canonical) {
+    // Find code entry
+    int i;
+    for (i = 0; i < state->nmods; i++) {
+        if (state->type[i] == code)
+            break;
+    }
+    if (i == state->nmods)
+        return -1;
+
+    // Return data
+    if (strand)    *strand    = state->strand[i];
+    if (implicit)  *implicit  = state->implicit[i];
+    if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]];
+
+    return 0;
+}
diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out
index cefdc545c..a6e7654cf 100644
--- a/test/base_mods/MM-chebi.out
+++ b/test/base_mods/MM-chebi.out
@@ -35,6 +35,7 @@
 34	C	C+m204 C+(76792)33 
 35	A	
 ---
+Present: m #-76792 n
 6	C	C+m102 
 15	N	N+n212 
 17	C	C+m128 
diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out
index 82d086a2f..e21ae314e 100644
--- a/test/base_mods/MM-double.out
+++ b/test/base_mods/MM-double.out
@@ -35,6 +35,7 @@
 34	A	
 35	T	
 ---
+Present: m m o
 1	G	G-m115 
 7	C	C+m128 
 12	G	G-m141 
diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out
new file mode 100644
index 000000000..4abedc719
--- /dev/null
+++ b/test/base_mods/MM-explicit-x.out
@@ -0,0 +1,103 @@
+0	A	
+1	T	
+2	C	
+3	A	
+4	T	
+5	C	
+6	A	
+7	T	
+8	T	
+9	C	C+m.200 C+h.10 
+10	C	C+m.50 C+h.170 
+11	T	
+12	A	
+13	C	
+14	C	C+m.160 C+h.20 
+15	G	
+16	C	
+17	T	
+18	A	
+19	T	
+20	A	
+21	G	
+22	C	
+23	C	
+24	T	
+---
+Present: m h
+9	C	C+m200 C+h10 
+10	C	C+m50 C+h170 
+14	C	C+m160 C+h20 
+
+===
+
+0	A	
+1	T	
+2	C	
+3	A	
+4	T	
+5	C	
+6	A	
+7	T	
+8	T	
+9	C	C+m?200 C+h?10 
+10	C	C+m?50 C+h?170 
+11	T	
+12	A	
+13	C	C+m?10 C+h?5 
+14	C	C+m?160 C+h?20 
+15	G	
+16	C	C+m?10 C+h?5 
+17	T	
+18	A	
+19	T	
+20	A	
+21	G	
+22	C	
+23	C	
+24	T	
+---
+Present: m h
+9	C	C+m200 C+h10 
+10	C	C+m50 C+h170 
+13	C	C+m10 C+h5 
+14	C	C+m160 C+h20 
+16	C	C+m10 C+h5 
+
+===
+
+0	A	
+1	T	
+2	C	
+3	A	
+4	T	
+5	C	
+6	A	
+7	T	
+8	T	
+9	C	C+m.200 C+h?10 
+10	C	C+h?170 
+11	T	
+12	A	
+13	C	C+h?5 
+14	C	C+m.160 C+h?20 
+15	G	
+16	C	C+h?5 
+17	T	
+18	A	
+19	T	
+20	A	
+21	G	
+22	C	
+23	C	
+24	T	
+---
+Present: m h
+9	C	C+m200 C+h10 
+10	C	C+h170 
+13	C	C+h5 
+14	C	C+m160 C+h20 
+16	C	C+h5 
+
+===
+
diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out
index 05e2828c4..f28b25f83 100644
--- a/test/base_mods/MM-explicit.out
+++ b/test/base_mods/MM-explicit.out
@@ -24,6 +24,7 @@
 23	C	
 24	T	
 ---
+Present: m h
 9	C	C+m200 C+h10 
 10	C	C+m50 C+h170 
 14	C	C+m160 C+h20 
@@ -56,6 +57,7 @@
 23	C	
 24	T	
 ---
+Present: m h
 9	C	C+m200 C+h10 
 10	C	C+m50 C+h170 
 13	C	C+m10 C+h5 
@@ -90,6 +92,7 @@
 23	C	
 24	T	
 ---
+Present: m h
 9	C	C+m200 C+h10 
 10	C	C+h170 
 13	C	C+h5 
diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam
index e4e37103d..e85afa293 100644
--- a/test/base_mods/MM-explicit.sam
+++ b/test/base_mods/MM-explicit.sam
@@ -4,10 +4,24 @@
 @CO	unchecked and require an explicit statement to indicate it was
 @CO	looked at and no base modification was observed.
 @CO	
-@CO	  0  1   23  45 6     78
-@CO	ATCATCATTCCTACCGCTATAGCCT
-@CO	  .  .   m.  .m .     ..  m
-@CO	  ?  ?   .h  .. .     ??  h
+@CO	ATCATCATTCCTACCGCTATAGCCT  r1; implicit
+@CO	  -  -   ..  -. -     --
+@CO	         Mm   M
+@CO	  -  -   ..  -. -     --
+@CO	         hH   h
+@CO	
+@CO	ATCATCATTCCTACCGCTATAGCCT  r2; explicit to a small region
+@CO	  -  -   ??  ?? ?     --
+@CO	         Mm  mM m
+@CO	  -  -   ??  ?? ?     --
+@CO	         hH  hh h
+@CO	
+@CO	ATCATCATTCCTACCGCTATAGCCT  r3; mixture
+@CO	  -  -   .   -. -     --
+@CO	         M    M
+@CO       -  -   ??  ?? ?     --
+@CO	         hH  hh h     --
+@CO	
 r1	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh,2,0,1;	Ml:B:C,200,10,50,170,160,20
 r2	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh?,2,0,0,0,0;	Ml:B:C,200,10,50,170,10,5,160,20,10,5
 r3	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0;	Ml:B:C,200,160,10,170,5,20,5
diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out
index 23c98d97b..e411a81ee 100644
--- a/test/base_mods/MM-multi.out
+++ b/test/base_mods/MM-multi.out
@@ -35,6 +35,7 @@
 34	C	C+m230 C+h6 
 35	A	
 ---
+Present: m h n
 6	C	C+m128 
 15	N	N+n215 
 17	C	C+m153 
@@ -83,6 +84,7 @@
 34	C	C+m204 C+h31 
 35	A	
 ---
+Present: m h n
 6	C	C+m77 C+h159 
 15	N	N+n240 
 17	C	C+m103 C+h133 
diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst
index c64128ee2..3809c0e6e 100644
--- a/test/base_mods/base-mods.tst
+++ b/test/base_mods/base-mods.tst
@@ -33,10 +33,11 @@
 # samtools binary. This can be useful for testing older versions.
 
 # Test files from SAM spec
-P MM-chebi.out     $test_mod MM-chebi.sam
-P MM-double.out    $test_mod MM-double.sam
-P MM-multi.out     $test_mod MM-multi.sam
-P MM-explicit.out  $test_mod MM-explicit.sam
+P MM-chebi.out       $test_mod    MM-chebi.sam
+P MM-double.out      $test_mod    MM-double.sam
+P MM-multi.out       $test_mod    MM-multi.sam
+P MM-explicit.out    $test_mod    MM-explicit.sam
+P MM-explicit-x.out  $test_mod -x MM-explicit.sam
 
 # Pileup testing
 P MM-pileup.out $pileup_mod < MM-pileup.sam
diff --git a/test/test_mod.c b/test/test_mod.c
index aade3733c..f6f5b0718 100644
--- a/test/test_mod.c
+++ b/test/test_mod.c
@@ -22,6 +22,52 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.  */
 
+/*
+This tests multiple APIs.  The simplest is to parse the MM/ML tags with
+bam_parse_basemod and then call bam_mods_at_next_pos once for each base in
+the bam sequence to check for modifications.
+
+Ie:
+
+    hts_base_mod_state *m = hts_base_mod_state_alloc();
+    bam_parse_basemod(b, m); // b=bam1_t pointer
+    hts_base_mod mods[5];
+    for (i = 0; i < b->core.l_qseq; i++) {
+        n = bam_mods_at_next_pos(b, m, mods, 5);
+        for (j = 0; j < n && j < 5; j++) {
+            // Report 'j'th mod at seq pos 'i'.
+            // mods[j].modified_base holds the base mod itself, with
+            // mods[j].canonical_base, mods[j].strand and mods[j].qual
+            // also present in hts_base_mod struct.
+            // ...
+        }
+    }
+    hts_base_mod_state_free(m);
+
+The extended mode has the same loop above, but calls bam_mods_query_type
+to return additional meta-data including the strand, canonical base and
+whether the base modification is recorded implicitly or explicitly:
+
+            int ret = bam_mods_query_type(m, mods[j].modified_base,
+                                          &m_strand, &m_implicit,
+                                          &m_canonical);
+
+Looping over every base in the sequence is not particularly efficient,
+however, unless this fits your natural processing order.  The alternative
+is to call bam_next_basemod to iterate only over modified locations:
+
+    hts_base_mod_state *m = hts_base_mod_state_alloc();
+    bam_parse_basemod(b, m); // b=bam1_t pointer
+    hts_base_mod mods[5];
+    while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) {
+        for (j = 0; j < n && j < 5; j++) {
+            // Report 'j'th mod at sequence position 'pos'
+        }
+    }
+    hts_base_mod_state_free(m);
+
+*/
+
 #include <config.h>
 #include <stdio.h>
 
@@ -41,6 +87,14 @@ static char *code(int id) {
 
 int main(int argc, char **argv) {
     char out[1024] = {0};
+    int extended = 0;
+
+    if (argc > 1 && strcmp(argv[1], "-x") == 0) {
+        extended = 1;
+        argv++;
+        argc--;
+    }
+
     if (argc < 2)
         return 1;
 
@@ -69,12 +123,31 @@ int main(int argc, char **argv) {
             n = bam_mods_at_next_pos(b, m, mods, 5);
             lp += sprintf(lp, "%d\t%c\t",
                           i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]);
-            for (j = 0; j < n && j < 5; j++)
-                lp += sprintf(lp, "%c%c%s%d ",
-                              mods[j].canonical_base,
-                              "+-"[mods[j].strand],
-                              code(mods[j].modified_base),
-                              mods[j].qual);
+            for (j = 0; j < n && j < 5; j++) {
+                if (extended) {
+                    int m_strand, m_implicit;
+                    char m_canonical;
+                    int ret = bam_mods_query_type(m, mods[j].modified_base,
+                                                  &m_strand, &m_implicit,
+                                                  &m_canonical);
+                    if (ret < 0 ||
+                        m_canonical != mods[j].canonical_base ||
+                        m_strand    != mods[j].strand)
+                        goto err;
+                    lp += sprintf(lp, "%c%c%s%c%d ",
+                                  mods[j].canonical_base,
+                                  "+-"[mods[j].strand],
+                                  code(mods[j].modified_base),
+                                  "?."[m_implicit],
+                                  mods[j].qual);
+                } else {
+                    lp += sprintf(lp, "%c%c%s%d ",
+                                  mods[j].canonical_base,
+                                  "+-"[mods[j].strand],
+                                  code(mods[j].modified_base),
+                                  mods[j].qual);
+                }
+            }
             *lp++ = '\n';
             *lp++ = 0;
 
@@ -88,17 +161,27 @@ int main(int argc, char **argv) {
 
         bam_parse_basemod(b, m);
 
+        // List possible mod choices.
+        int *all_mods;
+        int all_mods_n = 0;
+        all_mods = bam_mods_recorded(m, &all_mods_n);
+        printf("Present:");
+        for (i = 0; i < all_mods_n; i++)
+            printf(all_mods[i] > 0 ? " %c" : " #%d", all_mods[i]);
+        putchar('\n');
+
         int pos;
         while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) {
             char line[8192]={0}, *lp = line;
             lp += sprintf(lp, "%d\t%c\t", pos,
                           seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]);
-            for (j = 0; j < n && j < 5; j++)
+            for (j = 0; j < n && j < 5; j++) {
                 lp += sprintf(lp, "%c%c%s%d ",
                               mods[j].canonical_base,
                               "+-"[mods[j].strand],
                               code(mods[j].modified_base),
                               mods[j].qual);
+            }
             *lp++ = '\n';
             *lp++ = 0;
 

From 3c44c0b9eabcb1f6e111ec1b94e287155085e523 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 5 May 2022 16:19:12 +0100
Subject: [PATCH 17/79] Switch cirrus ubuntu image to ubuntu:latest

Done to fix a problem where clang fails to install due to an
inconsistency in the apt sources used by the ubuntu kinetic (a.k.a
devel) Docker image.  This means an update to clang via the
kinetic-proposed source makes it uninstallable on that image
until the proposed change makes it to the kinetic one.

Switching to ubuntu:latest means we won't be quite as leading-edge
but it's less likely to break unexpectedly.
---
 .cirrus.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 1ba352063..2740ce05e 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -84,8 +84,8 @@ gcc_task:
 ubuntu_task:
   name: ubuntu-clang
   container:
-    #image: ubuntu:latest   # use << : *LIBDEFLATE
-    image: ubuntu:devel
+    image: ubuntu:latest
+    # image: ubuntu:devel
     cpu: 2
     memory: 1G
 

From 46c56fcc2300dcac123edc670689e3bd2f241df6 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 4 May 2022 17:13:49 +0100
Subject: [PATCH 18/79] Fix buffer overrun in bam_plp_insertion_mod.

This attempted to grow memory by the maximum amount of space a base
modification would take up, but due to a misunderstanding of kstring
it kept adding this to the original size rather than actually growing
the allocated size.

(Probably) fixes samtools/samtools#1652
---
 sam.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sam.c b/sam.c
index dd1b7d9fd..865b55fa6 100644
--- a/sam.c
+++ b/sam.c
@@ -5306,6 +5306,7 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p,
                 hts_base_mod mod[256];
                 if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
                                                 m, mod, 256)) > 0) {
+                    int o_indel = indel;
                     if (ks_resize(ins, ins->l + nm*16+3) < 0)
                         return -1;
                     ins->s[indel++] = '[';
@@ -5329,6 +5330,7 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p,
                                              qual);
                     }
                     ins->s[indel++] = ']';
+                    ins->l += indel - o_indel; // grow by amount we used
                 }
             }
             break;

From 7e2df7ea27138058bae94baaf28cf2ca12f1ec0d Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 19 May 2022 16:11:56 +0100
Subject: [PATCH 19/79] Ensure str_class is set before using it in
 bcf_hdr_get_hrec

gcc-12.1 produced a warning that NULL could be passed to
strcmp() via str_class.  I'm not sure if that can actually
happen, but just in case add a check.
---
 vcf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vcf.c b/vcf.c
index f868cc738..e1d386c7a 100644
--- a/vcf.c
+++ b/vcf.c
@@ -930,6 +930,8 @@ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, co
     }
     else if ( type==BCF_HL_STR )
     {
+        if (!str_class)
+            return NULL;
         for (i=0; i<hdr->nhrec; i++)
         {
             if ( hdr->hrec[i]->type!=type ) continue;

From 5a302ffbb4f76b30a3fe555433581d10da47cf30 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 25 Apr 2022 11:21:21 +0100
Subject: [PATCH 20/79] Update htscodecs including associated Makefile changes

Update htscodecs to bring in the rANS 32x16 codecs

Add new htscodecs source files and dependencies into htslib
makefiles.  Some htscodecs functions have changed name slightly
so a couple of cram source files are updated to reflect that.

These changes are enough to build the non-SIMD versions of
the new codecs, but don't enable the accelerated versions yet.
---
 Makefile             | 13 +++++++++----
 cram/cram_codecs.c   | 16 ++++++++--------
 cram/cram_io.c       |  4 ++--
 htscodecs            |  2 +-
 htscodecs_bundled.mk | 11 ++++++++++-
 5 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 0871580ce..e52d9128b 100644
--- a/Makefile
+++ b/Makefile
@@ -435,14 +435,19 @@ cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/st
 thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h)
 
 htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h)
-htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model_h)
+htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h)
 htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: htscodecs/htscodecs/htscodecs.c $(htscodecs_htscodecs_h) $(htscodecs_version_h)
 htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h)
-htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h)
+htscodecs/htscodecs/rANS_static32x16pr.o htscodecs/htscodecs/rANS_static32x16pr.pico: htscodecs/htscodecs/rANS_static32x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h)
+htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: htscodecs/htscodecs/rANS_static32x16pr_avx2.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_permute_h)
+htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: htscodecs/htscodecs/rANS_static32x16pr_avx512.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h)
+htscodecs/htscodecs/rANS_static32x16pr_neon.o htscodecs/htscodecs/rANS_static32x16pr_neon.pico: htscodecs/htscodecs/rANS_static32x16pr_neon.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h)
+htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: htscodecs/htscodecs/rANS_static32x16pr_sse4.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h)
+htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) $(htscodecs_rANS_static32x16pr_h)
 htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h)
 htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h)
-htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h)
-
+htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) $(htscodecs_utils_h)
+htscodecs/htscodecs/utils.o htscodecs/htscodecs/utils.pico: htscodecs/htscodecs/utils.c config.h $(htscodecs_utils_h)
 
 bgzip: bgzip.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread
diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c
index 9f112863e..33e1b5bf8 100644
--- a/cram/cram_codecs.c
+++ b/cram/cram_codecs.c
@@ -2030,10 +2030,10 @@ static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
     int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);
     if (!(b->data = malloc(out_sz)))
         return -1;
-    rle_decode(lit_dat, lit_sz,
-               len_dat+nb, len_sz-nb,
-               rle_syms, rle_nsyms,
-               b->data, &out_sz);
+    hts_rle_decode(lit_dat, lit_sz,
+                   len_dat+nb, len_sz-nb,
+                   rle_syms, rle_nsyms,
+                   b->data, &out_sz);
     b->uncomp_size = out_sz;
 
     return 0;
@@ -2200,10 +2200,10 @@ int cram_xrle_encode_flush(cram_codec *c) {
 
     int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);
 
-    out_lit = rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
-                         out_len+nb, &out_len_size,
-                         rle_syms, &rle_nsyms,
-                         NULL, &out_lit_size);
+    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
+                             out_len+nb, &out_len_size,
+                             rle_syms, &rle_nsyms,
+                             NULL, &out_lit_size);
     out_len_size += nb;
 
 
diff --git a/cram/cram_io.c b/cram/cram_io.c
index c9dcb5014..60a568b7b 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -1722,7 +1722,7 @@ int cram_uncompress_block(cram_block *b) {
 
     case TOK3: {
         uint32_t out_len;
-        uint8_t *cp = decode_names(b->data, b->comp_size, &out_len);
+        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
         if (!cp)
             return -1;
         b->orig_method = TOK3;
@@ -1875,7 +1875,7 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
         int lev = level;
         if (method == TOK3 && lev > 3)
             lev = 3;
-        uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL);
+        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
         *out_size = out_len;
         return (char *)cp;
     }
diff --git a/htscodecs b/htscodecs
index 1395d7306..65bb347f6 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 1395d730651fdfa39cd916be3b3ef4dd9b1ab895
+Subproject commit 65bb347f6b0ea7f4a00cb768b3d8004f24ae03c3
diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk
index 7242e210b..4a862f3d1 100644
--- a/htscodecs_bundled.mk
+++ b/htscodecs_bundled.mk
@@ -28,9 +28,15 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \
         $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \
         $(HTSPREFIX)htscodecs/htscodecs/pack.c \
         $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \
+	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \
+	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \
+	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \
+	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \
         $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \
         $(HTSPREFIX)htscodecs/htscodecs/rle.c \
-        $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c
+        $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c \
+	$(HTSPREFIX)htscodecs/htscodecs/utils.c
+
 
 HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o)
 
@@ -49,8 +55,11 @@ htscodecs_varint_h = htscodecs/htscodecs/varint.h
 htscodecs_htscodecs_endian_h = htscodecs/htscodecs/htscodecs_endian.h
 htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h
 htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h)
+htscodecs_permute_h = htscodecs/htscodecs/permute.h
 htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h
 htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h
+htscodecs_rANS_static16_int_h = htscodecs/htscodecs/rANS_static16_int.h $(htscodecs_varint_h) $(htscodecs_utils_h)
+htscodecs_rANS_static32x16pr_h = htscodecs/htscodecs/rANS_static32x16pr.h
 htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h $(htscodecs_htscodecs_endian_h)
 htscodecs_utils_h = htscodecs/htscodecs/utils.h
 htscodecs_version_h = htscodecs/htscodecs/version.h

From b0cc5bdd86c6aebc9684f22b28fcf6b079192bb9 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 25 Apr 2022 12:08:28 +0100
Subject: [PATCH 21/79] Add configure changes to enable htscodecs SIMD code

Adds the checks necessary to detect x86_64 SIMD support and
turn it on in htscodecs if it's available.

As ssse3, popcnt and sse4.1 are used together, they're tested
for as a group.
---
 Makefile                    | 14 ++++++++--
 config.mk.in                |  5 ++++
 configure.ac                | 51 +++++++++++++++++++++++++++++++++++
 m4/ax_check_compile_flag.m4 | 53 +++++++++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 m4/ax_check_compile_flag.m4

diff --git a/Makefile b/Makefile
index e52d9128b..68734026d 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ CPPFLAGS =
 #CFLAGS   = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600
 CFLAGS   = -g -Wall -O2 -fvisibility=hidden
 EXTRA_CFLAGS_PIC = -fpic
+TARGET_CFLAGS =
 LDFLAGS  = -fvisibility=hidden
 LIBS     = $(htslib_default_libs)
 
@@ -161,10 +162,10 @@ config_vars.h:
 .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so
 
 .c.o:
-	$(CC) $(CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $<
+	$(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $<
 
 .c.pico:
-	$(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $<
+	$(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $<
 
 
 LIBHTS_OBJS = \
@@ -215,6 +216,10 @@ NONCONFIGURE_OBJS = hfile_libcurl.o
 PLUGIN_EXT  =
 PLUGIN_OBJS =
 
+HTS_CFLAGS_AVX2 =
+HTS_CFLAGS_AVX512 =
+HTS_CFLAGS_SSE4 =
+
 cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h)
 cram_io_h = cram/cram_io.h $(cram_misc_h)
 cram_misc_h = cram/misc.h
@@ -449,6 +454,11 @@ htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.
 htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) $(htscodecs_utils_h)
 htscodecs/htscodecs/utils.o htscodecs/htscodecs/utils.pico: htscodecs/htscodecs/utils.c config.h $(htscodecs_utils_h)
 
+# Extra CFLAGS for specific files
+htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX2)
+htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX512)
+htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: TARGET_CFLAGS = $(HTS_CFLAGS_SSE4)
+
 bgzip: bgzip.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread
 
diff --git a/config.mk.in b/config.mk.in
index f8decf0a2..35392bf0d 100644
--- a/config.mk.in
+++ b/config.mk.in
@@ -112,3 +112,8 @@ LDFLAGS += $(noplugin_LDFLAGS)
 LIBS += $(noplugin_LIBS)
 
 endif
+
+# Extra CFLAGS for specific files
+HTS_CFLAGS_AVX2 = @hts_cflags_avx2@
+HTS_CFLAGS_AVX512 = @hts_cflags_avx512@
+HTS_CFLAGS_SSE4 = @hts_cflags_sse4@
diff --git a/configure.ac b/configure.ac
index 1216ecc21..9c8cca480 100644
--- a/configure.ac
+++ b/configure.ac
@@ -30,6 +30,7 @@ AC_CONFIG_SRCDIR(hts.c)
 AC_CONFIG_HEADERS(config.h)
 
 m4_include([m4/hts_prog_cc_warnings.m4])
+m4_include([m4/ax_check_compile_flag.m4])
 m4_include([m4/hts_hide_dynamic_syms.m4])
 m4_include([m4/pkg.m4])
 
@@ -69,6 +70,56 @@ dnl Flags to treat warnings as errors.  These need to be applied to CFLAGS
 dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS)
 HTS_PROG_CC_WERROR(hts_late_cflags)
 
+dnl Check for various compiler flags to enable SIMD features
+dnl Options for rANS32x16 sse4.1 version
+AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [
+  hts_cflags_sse4="-mssse3 -mpopcnt -msse4.1"
+  AC_SUBST([hts_cflags_sse4])
+  AC_DEFINE([HAVE_SSSE3],1,
+            [Defined to 1 if the compiler can issue SSSE3 instructions.])
+  AC_DEFINE([HAVE_POPCNT],1,
+            [Defined to 1 if the compiler can issue popcnt instructions.])
+  AC_DEFINE([HAVE_SSE4_1],1,
+            [Defined to 1 if the compiler can issue SSE4.1 instructions.])
+  ], [], [], [AC_LANG_PROGRAM([[
+    #include "x86intrin.h"
+  ]],[[
+    unsigned int i = _mm_popcnt_u32(1);
+    __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1);
+    __m128i c = _mm_max_epu32(a, b);
+    b = _mm_shuffle_epi8(a, c);
+    return *((char *) &b);
+  ]])])
+
+dnl Options for rANS32x16 avx2 version
+AX_CHECK_COMPILE_FLAG([-mavx2], [
+  hts_cflags_avx2="-mavx2"
+  AC_SUBST([hts_cflags_avx2])
+  AC_DEFINE([HAVE_AVX2],1,
+            [Defined to 1 if the compiler can issue AVX2 instructions.])
+  ], [], [], [AC_LANG_PROGRAM([[
+  #include "x86intrin.h"
+  ]],[[
+    __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    __m256i b = _mm256_add_epi32(a, a);
+    return *((char *) &b);
+  ]])])
+
+dnl Options for rANS32x16 avx512 version
+AX_CHECK_COMPILE_FLAG([-mavx512f], [
+  hts_cflags_avx512="-mavx512f"
+  AC_SUBST([hts_cflags_avx512])
+  AC_DEFINE([HAVE_AVX512],1,
+            [Defined to 1 if the compiler can issue AVX512 instructions.])
+  ], [], [], [AC_LANG_PROGRAM([[
+    #include "x86intrin.h"
+  ]],[[
+    __m512i a = _mm512_set1_epi32(1);
+    __m512i b = _mm512_add_epi32(a, a);
+    return *((char *) &b);
+  ]])])
+
+
 dnl Avoid chicken-and-egg problem where pkg-config supplies the
 dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check
 dnl for pkg-config...
diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4
new file mode 100644
index 000000000..bd753b34d
--- /dev/null
+++ b/m4/ax_check_compile_flag.m4
@@ -0,0 +1,53 @@
+# ===========================================================================
+#  https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
+#
+# DESCRIPTION
+#
+#   Check whether the given FLAG works with the current language's compiler
+#   or gives an error.  (Warnings, however, are ignored)
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   If EXTRA-FLAGS is defined, it is added to the current language's default
+#   flags (e.g. CFLAGS) when the check is done.  The check is thus made with
+#   the flags: "CFLAGS EXTRA-FLAGS FLAG".  This can for example be used to
+#   force the compiler to issue an error when a bad flag is given.
+#
+#   INPUT gives an alternative input source to AC_COMPILE_IFELSE.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 6
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+  AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_VAR_IF(CACHEVAR,yes,
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_COMPILE_FLAGS

From 587b4d05600d6708295c3bce6db519d887368636 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 20 May 2022 15:46:25 +0100
Subject: [PATCH 22/79] Make htscodecs SIMD code build without configure

For the benefit of htslib embeds that don't want to create and run
a configure script.  Adds a small script that does a similar
job by probing a few compiler options and then outputs makefile
lines to set variables if they succeed.  These lines are added
to the default 'htscodecs.mk' file that gets built if configure
hasn't already made one.  Adding them here means the probing
will be remembered until the next "make distclean".

The script fragment that builds the default 'config.h' checks
to see if the variables have been set, and if so adds the
appropriate 'HAVE_' lines for the feature.
---
 Makefile        | 21 +++++++++--
 hts_probe_cc.sh | 98 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 4 deletions(-)
 create mode 100755 hts_probe_cc.sh

diff --git a/Makefile b/Makefile
index 68734026d..f2d6c9cff 100644
--- a/Makefile
+++ b/Makefile
@@ -115,10 +115,16 @@ ALL_CPPFLAGS = -I. $(CPPFLAGS)
 htscodecs.mk:
 	echo '# Default htscodecs.mk generated by Makefile' > $@
 	echo 'include $$(HTSPREFIX)htscodecs_bundled.mk' >> $@
+	$(srcdir)/hts_probe_cc.sh '$(CC)' '$(CFLAGS) $(CPPFLAGS)' '$(LDFLAGS)' >> $@
 
 srcdir = .
 srcprefix =
 HTSPREFIX =
+
+HTS_CFLAGS_AVX2 =
+HTS_CFLAGS_AVX512 =
+HTS_CFLAGS_SSE4 =
+
 include htslib_vars.mk
 include htscodecs.mk
 
@@ -216,10 +222,6 @@ NONCONFIGURE_OBJS = hfile_libcurl.o
 PLUGIN_EXT  =
 PLUGIN_OBJS =
 
-HTS_CFLAGS_AVX2 =
-HTS_CFLAGS_AVX512 =
-HTS_CFLAGS_SSE4 =
-
 cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h)
 cram_io_h = cram/cram_io.h $(cram_misc_h)
 cram_misc_h = cram/misc.h
@@ -258,6 +260,17 @@ config.h:
 	echo '#endif' >> $@
 	echo '#define HAVE_DRAND48 1' >> $@
 	echo '#define HAVE_LIBCURL 1' >> $@
+	if [ "x$(HTS_CFLAGS_SSE4)" != "x" ] ; then \
+	    echo '#define HAVE_POPCNT 1' >> $@ ; \
+	    echo '#define HAVE_SSE4_1 1' >> $@ ; \
+	    echo '#define HAVE_SSSE3 1' >> $@ ; \
+	fi
+	if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \
+	    echo '#define HAVE_AVX2 1' >> $@ ; \
+	fi
+	if [ "x$(HTS_CFLAGS_AVX512)" != "x" ] ; then \
+	    echo '#define HAVE_AVX512 1' >> $@ ; \
+	fi
 
 # And similarly for htslib.pc.tmp ("pkg-config template").  No dependency
 # on htslib.pc.in listed, as if that file is newer the usual way to regenerate
diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh
new file mode 100755
index 000000000..0f6ddede8
--- /dev/null
+++ b/hts_probe_cc.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+
+# Check compiler options for non-configure builds and create Makefile fragment
+#
+#    Copyright (C) 2022 Genome Research Ltd.
+#
+#    Author: Rob Davies <rmd@sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# Arguments are:
+# 1. C compiler command
+# 2. Initial CFLAGS
+# 3. LDFLAGS
+
+CC=$1
+CFLAGS=$2
+LDFLAGS=$3
+
+# Try running the compiler.  Uses the same contest.* names as
+# configure for temporary files.
+run_compiler ()
+{
+    "$CC" $CFLAGS $1 $LDFLAGS -o conftest conftest.c 2> conftest.err
+    retval=$?
+    rm -f conftest.err conftest
+    return $retval
+}
+
+echo "# Compiler probe results, generated by $0"
+
+# Check for sse4.1 etc. support
+
+rm -f conftest conftest.err conftest.c
+cat - <<'EOF' > conftest.c
+#include "x86intrin.h"
+int main(int argc, char **argv) {
+    unsigned int i = _mm_popcnt_u32(1);
+    __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1);
+    __m128i c = _mm_max_epu32(a, b);
+    b = _mm_shuffle_epi8(a, c);
+    return *((char *) &b);
+}
+EOF
+FLAGS="-mpopcnt -msse4.1 -mssse3"
+if run_compiler "$FLAGS" ; then
+    echo "HTS_CFLAGS_SSE4 = $FLAGS"
+fi
+
+# Check for avx2
+
+rm -f conftest.c
+cat - <<'EOF' > conftest.c
+#include "x86intrin.h"
+int main(int argc, char **argv) {
+    __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    __m256i b = _mm256_add_epi32(a, a);
+    return *((char *) &b);
+}
+EOF
+FLAGS="-mavx2"
+if run_compiler "$FLAGS" ; then
+    echo "HTS_CFLAGS_AVX2 = $FLAGS"
+fi
+
+# Check for avx512
+
+rm -f conftest.c
+cat - <<'EOF' > conftest.c
+#include "x86intrin.h"
+int main(int argc, char **argv) {
+    __m512i a = _mm512_set1_epi32(1);
+    __m512i b = _mm512_add_epi32(a, a);
+    return *((char *) &b);
+}
+EOF
+FLAGS="-mavx512f"
+if run_compiler "$FLAGS" ; then
+    echo "HTS_CFLAGS_AVX512 = $FLAGS"
+fi
+
+rm -f conftest.c

From aada31476c534322bcdb7616ad748341dd5a5ad9 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 20 May 2022 18:53:16 +0100
Subject: [PATCH 23/79] Disable unaligned access in htscodecs if it is in
 HTSlib

Mainly to ensure that the fuzzer build doesn't start complaining
about unaligned access.
---
 Makefile     | 3 +++
 configure.ac | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/Makefile b/Makefile
index f2d6c9cff..c8e394830 100644
--- a/Makefile
+++ b/Makefile
@@ -264,6 +264,9 @@ config.h:
 	    echo '#define HAVE_POPCNT 1' >> $@ ; \
 	    echo '#define HAVE_SSE4_1 1' >> $@ ; \
 	    echo '#define HAVE_SSSE3 1' >> $@ ; \
+	    echo '#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0' >> $@ ; \
+	    echo '#define UBSAN 1' >> $@ ; \
+	    echo '#endif' >> $@ ; \
 	fi
 	if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \
 	    echo '#define HAVE_AVX2 1' >> $@ ; \
diff --git a/configure.ac b/configure.ac
index 9c8cca480..c1578d6e4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,14 @@ AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [
             [Defined to 1 if the compiler can issue popcnt instructions.])
   AC_DEFINE([HAVE_SSE4_1],1,
             [Defined to 1 if the compiler can issue SSE4.1 instructions.])
+dnl Propagate HTSlib's unaligned access preference to htscodecs
+  AH_VERBATIM([UBSAN],[
+/* Prevent unaligned access in htscodecs SSE4 rANS codec */
+#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0
+#undef UBSAN
+#endif
+  ])
+  AC_DEFINE([UBSAN],1,[])
   ], [], [], [AC_LANG_PROGRAM([[
     #include "x86intrin.h"
   ]],[[

From 958e6fa708d1914bc46d9f8e9411987402468153 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 23 May 2022 12:08:43 +0100
Subject: [PATCH 24/79] Pull in extra htscodecs pointer aliasing fixes

---
 htscodecs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htscodecs b/htscodecs
index 65bb347f6..9cd552e17 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 65bb347f6b0ea7f4a00cb768b3d8004f24ae03c3
+Subproject commit 9cd552e173055730eb7701ebdbd13f6c579088e4

From e16c4224174a7581cf6f6b39705d62bbde007261 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Mon, 23 May 2022 17:37:25 +0100
Subject: [PATCH 25/79] Fix NEWS entry for function affected by
 realn_check_tag() fix

PR #1406 fixed which arguments were used as printf() format strings
in realn_check_tag(), which is a subroutine of sam_prob_realn().
Correct the name of the function affected, and add a note of which
HTSlib releases were affected.
---
 NEWS | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index 5f30d8878..75d9ce79e 100644
--- a/NEWS
+++ b/NEWS
@@ -4,10 +4,11 @@ Noteworthy changes in release a.b
 Noteworthy changes in release 1.15.1 (7th April 2022)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-* Security fix: Fixed broken error reporting in the sam_cap_mapq()
+* Security fix: Fixed broken error reporting in the sam_prob_realn()
   function, due to a missing hts_log() parameter.  Prior to this fix
-  it was possible to abuse the log message format string by passing
-  a specially crafted alignment record to this function. (PR#1406)
+  (i.e., in HTSlib versions 1.8 to 1.15) it was possible to abuse
+  the log message format string by passing a specially crafted
+  alignment record to this function. (PR#1406)
 
 * HTSlib now uses libhtscodecs release 1.2.2.  This fixes a number
   of bugs where invalid compressed data could trigger usage of

From 008eabd3b38600a187b8d2fe94be9f8f9260545e Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Fri, 13 May 2022 16:25:09 +0100
Subject: [PATCH 26/79] Fix clang builds under mingw.

Under mingw clang requires dllexport to be applied to both function
declarations and function definitions.  (Gcc is happy for the
definition only to have the dllexport attribute.)

Note this exports several "internal" functions, including some
apparently unused anywhere (hts_json_*).   Perhaps they were once used
in htslib-plugins, but no more.

I haven't taken the decision to remove these, but it's worth
considering for whenever we next do an ABI breaking change.  Either
that or stop pretending that they're internal only when clearly they are
not, and move their API to the external htslib/*.h files instead.
These appear to be somewhat in limbo right now.

Fixes #1433
---
 hfile_internal.h     | 6 ++++++
 htslib/hfile.h       | 6 ++++++
 htslib/hts.h         | 3 +++
 htslib/hts_log.h     | 1 +
 htslib/sam.h         | 1 +
 htslib/tbx.h         | 1 +
 htslib/vcf.h         | 3 +++
 textutils_internal.h | 8 ++++++++
 8 files changed, 29 insertions(+)

diff --git a/hfile_internal.h b/hfile_internal.h
index 70cc99c57..2e365ae7d 100644
--- a/hfile_internal.h
+++ b/hfile_internal.h
@@ -90,11 +90,13 @@ struct hFILE_backend {
 
 /* May be called by hopen_*() functions to decode a fopen()-style mode into
    open(2)-style flags.  */
+HTSLIB_EXPORT
 int hfile_oflags(const char *mode);
 
 /* Must be called by hopen_*() functions to allocate the hFILE struct and set
    up its base.  Capacity is a suggested buffer size (e.g., via fstat(2))
    or 0 for a default-sized buffer.  */
+HTSLIB_EXPORT
 hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity);
 
 /* Alternative to hfile_init() for in-memory backends for which the base
@@ -107,6 +109,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode,
 /* May be called by hopen_*() functions to undo the effects of hfile_init()
    in the event opening the stream subsequently fails.  (This is safe to use
    even if fp is NULL.  This takes care to preserve errno.)  */
+HTSLIB_EXPORT
 void hfile_destroy(hFILE *fp);
 
 
@@ -138,10 +141,13 @@ struct hFILE_scheme_handler {
 };
 
 /* May be used as an isremote() function in simple cases.  */
+HTSLIB_EXPORT
 extern int hfile_always_local (const char *fname);
+HTSLIB_EXPORT
 extern int hfile_always_remote(const char *fname);
 
 /* Should be called by plugins for each URL scheme they wish to handle.  */
+HTSLIB_EXPORT
 void hfile_add_scheme_handler(const char *scheme,
                               const struct hFILE_scheme_handler *handler);
 
diff --git a/htslib/hfile.h b/htslib/hfile.h
index 038591cbc..92b789acd 100644
--- a/htslib/hfile.h
+++ b/htslib/hfile.h
@@ -158,6 +158,7 @@ static inline off_t htell(hFILE *fp)
 */
 static inline int hgetc(hFILE *fp)
 {
+    HTSLIB_EXPORT
     extern int hgetc2(hFILE *);
     return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp);
 }
@@ -229,6 +230,7 @@ or I/O errors.
 static inline ssize_t HTS_RESULT_USED
 hread(hFILE *fp, void *buffer, size_t nbytes)
 {
+    HTSLIB_EXPORT
     extern ssize_t hread2(hFILE *, void *, size_t, size_t);
 
     size_t n = fp->end - fp->begin;
@@ -243,6 +245,7 @@ hread(hFILE *fp, void *buffer, size_t nbytes)
 */
 static inline int hputc(int c, hFILE *fp)
 {
+    HTSLIB_EXPORT
     extern int hputc2(int, hFILE *);
     if (fp->begin < fp->limit) *(fp->begin++) = c;
     else c = hputc2(c, fp);
@@ -254,6 +257,7 @@ static inline int hputc(int c, hFILE *fp)
 */
 static inline int hputs(const char *text, hFILE *fp)
 {
+    HTSLIB_EXPORT
     extern int hputs2(const char *, size_t, size_t, hFILE *);
 
     size_t nbytes = strlen(text), n = fp->limit - fp->begin;
@@ -271,7 +275,9 @@ In the absence of I/O errors, the full _nbytes_ will be written.
 static inline ssize_t HTS_RESULT_USED
 hwrite(hFILE *fp, const void *buffer, size_t nbytes)
 {
+    HTSLIB_EXPORT
     extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t);
+    HTSLIB_EXPORT
     extern int hfile_set_blksize(hFILE *fp, size_t bufsiz);
 
     if (!fp->mobile) {
diff --git a/htslib/hts.h b/htslib/hts.h
index 8f11266fa..801506b95 100644
--- a/htslib/hts.h
+++ b/htslib/hts.h
@@ -456,16 +456,19 @@ The input character may be either an IUPAC ambiguity code, '=' for 0, or
 '0'/'1'/'2'/'3' for a result of 1/2/4/8.  The result is encoded as 1/2/4/8
 for A/C/G/T or combinations of these bits for ambiguous bases.
 */
+HTSLIB_EXPORT
 extern const unsigned char seq_nt16_table[256];
 
 /*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
 ambiguity code letter (or '=' when given 0).
 */
+HTSLIB_EXPORT
 extern const char seq_nt16_str[];
 
 /*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
 Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
 */
+HTSLIB_EXPORT
 extern const int seq_nt16_int[];
 
 /*!
diff --git a/htslib/hts_log.h b/htslib/hts_log.h
index b2336a4df..f6a50b333 100644
--- a/htslib/hts_log.h
+++ b/htslib/hts_log.h
@@ -58,6 +58,7 @@ enum htsLogLevel hts_get_log_level(void);
  * One of the HTS_LOG_* values. The default is HTS_LOG_WARNING.
  * \note Avoid direct use of this variable. Use hts_set_log_level and hts_get_log_level instead.
  */
+HTSLIB_EXPORT
 extern int hts_verbose;
 
 /*! Logs an event.
diff --git a/htslib/sam.h b/htslib/sam.h
index a6e64fbb2..f0a191a28 100644
--- a/htslib/sam.h
+++ b/htslib/sam.h
@@ -118,6 +118,7 @@ typedef sam_hdr_t bam_hdr_t;
 Result is operator code or -1. Be sure to cast the index if it is a plain char:
     int op = bam_cigar_table[(unsigned char) ch];
 */
+HTSLIB_EXPORT
 extern const int8_t bam_cigar_table[256];
 
 #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
diff --git a/htslib/tbx.h b/htslib/tbx.h
index 9b9e111b9..3d2037cbb 100644
--- a/htslib/tbx.h
+++ b/htslib/tbx.h
@@ -52,6 +52,7 @@ typedef struct tbx_t {
     void *dict;
 } tbx_t;
 
+HTSLIB_EXPORT
 extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf;
 
     #define tbx_itr_destroy(iter) hts_itr_destroy(iter)
diff --git a/htslib/vcf.h b/htslib/vcf.h
index 7a001aca6..8f7d79fe3 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -123,6 +123,7 @@ typedef struct bcf_hdr_t {
     int32_t m[3];          // m: allocated size of the dictionary block in use (see n above)
 } bcf_hdr_t;
 
+HTSLIB_EXPORT
 extern uint8_t bcf_type_shift[];
 
 /**************
@@ -1341,7 +1342,9 @@ which works for both BCF and VCF.
 #define BCF_MIN_BT_INT16 (-32760)      /* INT16_MIN + 8 */
 #define BCF_MIN_BT_INT32 (-2147483640) /* INT32_MIN + 8 */
 
+HTSLIB_EXPORT
 extern uint32_t bcf_float_vector_end;
+HTSLIB_EXPORT
 extern uint32_t bcf_float_missing;
 static inline void bcf_float_set(float *ptr, uint32_t value)
 {
diff --git a/textutils_internal.h b/textutils_internal.h
index 4b120bdbc..1ad096494 100644
--- a/textutils_internal.h
+++ b/textutils_internal.h
@@ -65,9 +65,11 @@ typedef struct hts_json_token hts_json_token;
 /// Allocate an empty JSON token structure, for use with hts_json_* functions
 /** @return An empty token on success; NULL on failure
  */
+HTSLIB_EXPORT
 hts_json_token *hts_json_alloc_token(void);
 
 /// Free a JSON token
+HTSLIB_EXPORT
 void hts_json_free_token(hts_json_token *token);
 
 /// Accessor function to get JSON token type
@@ -85,6 +87,7 @@ as follows:
   - `!` other errors (e.g. out of memory)
   - `\0` terminator at end of input
 */
+HTSLIB_EXPORT
 char hts_json_token_type(hts_json_token *token);
 
 /// Accessor function to get JSON token in string form
@@ -98,6 +101,7 @@ will point at the kstring_t buffer passed as the third parameter to
 hts_json_fnext().  In that case, the value will only be valid until the
 next call to hts_json_fnext().
  */
+HTSLIB_EXPORT
 char *hts_json_token_str(hts_json_token *token);
 
 /// Read one JSON token from a string
@@ -111,6 +115,7 @@ is modified by having token-terminating characters overwritten as NULs.
 The `state` argument records the current position within `str` after each
 `hts_json_snext()` call, and should be set to 0 before the first call.
 */
+HTSLIB_EXPORT
 char hts_json_snext(char *str, size_t *state, hts_json_token *token);
 
 /// Read and discard a complete JSON value from a string
@@ -123,6 +128,7 @@ char hts_json_snext(char *str, size_t *state, hts_json_token *token);
 Skips a complete JSON value, which may be a single token or an entire object
 or array.
 */
+HTSLIB_EXPORT
 char hts_json_sskip_value(char *str, size_t *state, char type);
 
 struct hFILE;
@@ -137,6 +143,7 @@ The `kstr` buffer is used to store the string value of the token read,
 so `token->str` is only valid until the next time `hts_json_fnext()` is
 called with the same `kstr` argument.
 */
+HTSLIB_EXPORT
 char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr);
 
 /// Read and discard a complete JSON value from a file
@@ -148,6 +155,7 @@ char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr);
 Skips a complete JSON value, which may be a single token or an entire object
 or array.
 */
+HTSLIB_EXPORT
 char hts_json_fskip_value(struct hFILE *fp, char type);
 
 // The <ctype.h> functions operate on ints such as are returned by fgetc(),

From 88ccb034fca1f8e140f554cccc53d355f0a06da2 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 30 May 2022 18:34:13 +0100
Subject: [PATCH 27/79] Fix curl type warning on Windows with gcc 12.1

curl_easy_setopt(handle, CURLOPT_POSTFIELDSIZE, val) takes a long,
but was being passed a size_t.  This works on Linux where they're
the same size for both 64 and 32-bit platforms, but not on 64-bit
Windows which has 32-bit longs and 64-bit size_t.

Casting the value to long should be fine.  The POST data is for
the completion message which should not get that big due to the limit
of 10000 parts when uploading data.
---
 hfile_s3_write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hfile_s3_write.c b/hfile_s3_write.c
index eec56696b..d54945839 100644
--- a/hfile_s3_write.c
+++ b/hfile_s3_write.c
@@ -321,7 +321,7 @@ static int complete_upload(hFILE_s3_write *fp, kstring_t *resp) {
     curl_easy_reset(fp->curl);
     curl_easy_setopt(fp->curl, CURLOPT_POST, 1L);
     curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDS, fp->completion_message.s);
-    curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, fp->completion_message.l);
+    curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, (long) fp->completion_message.l);
     curl_easy_setopt(fp->curl, CURLOPT_WRITEFUNCTION, response_callback);
     curl_easy_setopt(fp->curl, CURLOPT_WRITEDATA, (void *)resp);
     curl_easy_setopt(fp->curl, CURLOPT_URL, url.s);

From 7fef9d3acaf4ea51dcd6a9fd74742e17c90ec44a Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Fri, 27 May 2022 12:19:38 +0100
Subject: [PATCH 28/79] Add an "sclen" expression keyword.

This is the total length of soft-clips at both the left and right ends.
It may be combined with qlen (qlen-sclen) to obtain the number of
bases in the query sequence that have been aligned to the genome,
i.e. it provides a way to compare local-alignment vs global-alignment
length.

Fixes #1436
---
 sam.c                      | 27 +++++++++++++++++++++++++++
 test/sam_filter/filter.tst |  3 +++
 test/sam_filter/func5.out  |  5 +++++
 test/sam_filter/func6.out  |  2 ++
 test/sam_filter/func7.out  |  3 +++
 5 files changed, 40 insertions(+)
 create mode 100644 test/sam_filter/func5.out
 create mode 100644 test/sam_filter/func6.out
 create mode 100644 test/sam_filter/func7.out

diff --git a/sam.c b/sam.c
index 865b55fa6..64c08a43a 100644
--- a/sam.c
+++ b/sam.c
@@ -1354,6 +1354,33 @@ static int bam_sym_lookup(void *data, char *str, char **end,
             res->s.l = b->core.l_qseq;
             res->is_str = 1;
             return 0;
+        } else if (memcmp(str, "sclen", 5) == 0) {
+            int sclen = 0;
+            uint32_t *cigar = bam_get_cigar(b);
+            int ncigar = b->core.n_cigar;
+            int left = 0;
+
+            // left
+            if (ncigar > 0
+                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
+                left = 0, sclen += bam_cigar_oplen(cigar[0]);
+            else if (ncigar > 1
+                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
+                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
+                left = 1, sclen += bam_cigar_oplen(cigar[1]);
+
+            // right
+            if (ncigar-1 > left
+                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
+                sclen += bam_cigar_oplen(cigar[ncigar-1]);
+            else if (ncigar-2 > left
+                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
+                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
+                sclen += bam_cigar_oplen(cigar[ncigar-2]);
+
+            *end = str+5;
+            res->d = sclen;
+            return 0;
         }
         break;
 
diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst
index effb77a26..129516b24 100644
--- a/test/sam_filter/filter.tst
+++ b/test/sam_filter/filter.tst
@@ -53,3 +53,6 @@ P func1.out   $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@'
 P func2.out   $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@'
 P func3.out   $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@'
 P func4.out   $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@'
+P func5.out   $tv -i 'filter=sclen>=20' ../realn02.sam | egrep -v '^@'
+P func6.out   $tv -i 'filter=rlen<50'   ../realn02.sam | egrep -v '^@'
+P func7.out   $tv -i 'filter=qlen>100'   ../realn02.sam | egrep -v '^@'
diff --git a/test/sam_filter/func5.out b/test/sam_filter/func5.out
new file mode 100644
index 000000000..6c2e2bc64
--- /dev/null
+++ b/test/sam_filter/func5.out
@@ -0,0 +1,5 @@
+ERR013140.3521432	99	17	1	29	22S86M	=	226	313	AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA	@AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=>	MD:Z:86	RG:Z:rg	AM:i:29	NM:i:0	SM:i:29	MQ:i:29	XT:A:M
+ERR156632.12704932	163	17	1	29	36S64M	=	195	293	TGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTG	BFAFGFEIGFEFHHEIDKJGHHHJIIE=@KKGGKJGIBLLMFKMDIIHJKKHFELLLKFIHMHIHHIHLKJFCHFJIJAID=JHKFGHJIHKKCH:@HD?	MD:Z:64	RG:Z:rg	AM:i:29	NM:i:0	SM:i:29	MQ:i:29	XT:A:M
+ERR156632.9601178	99	17	1	29	62S38M	=	279	377	CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA	DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE	MD:Z:38	RG:Z:rg	AM:i:29	NM:i:0	SM:i:29	MQ:i:29	XT:A:M
+ERR013140.13475139	99	17	2401	60	88M20S	=	2680	386	AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT	@CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:<C><??>@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228%	X0:i:1	X1:i:0	XC:i:88	MD:Z:88	RG:Z:rg	AM:i:37	NM:i:0	SM:i:37	MQ:i:60	XT:A:U
+ERR013140.23480670	133	17	3771	0	35M73S	=	3771	0	TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC	8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$	XC:i:35	RG:Z:rg
diff --git a/test/sam_filter/func6.out b/test/sam_filter/func6.out
new file mode 100644
index 000000000..de091ed96
--- /dev/null
+++ b/test/sam_filter/func6.out
@@ -0,0 +1,2 @@
+ERR156632.9601178	99	17	1	29	62S38M	=	279	377	CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA	DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE	MD:Z:38	RG:Z:rg	AM:i:29	NM:i:0	SM:i:29	MQ:i:29	XT:A:M
+ERR013140.23480670	133	17	3771	0	35M73S	=	3771	0	TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC	8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$	XC:i:35	RG:Z:rg
diff --git a/test/sam_filter/func7.out b/test/sam_filter/func7.out
new file mode 100644
index 000000000..1fe2500bf
--- /dev/null
+++ b/test/sam_filter/func7.out
@@ -0,0 +1,3 @@
+ERR013140.3521432	99	17	1	29	22S86M	=	226	313	AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA	@AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=>	MD:Z:86	RG:Z:rg	AM:i:29	NM:i:0	SM:i:29	MQ:i:29	XT:A:M
+ERR013140.13475139	99	17	2401	60	88M20S	=	2680	386	AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT	@CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:<C><??>@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228%	X0:i:1	X1:i:0	XC:i:88	MD:Z:88	RG:Z:rg	AM:i:37	NM:i:0	SM:i:37	MQ:i:60	XT:A:U
+ERR013140.23480670	133	17	3771	0	35M73S	=	3771	0	TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC	8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$	XC:i:35	RG:Z:rg

From 33ff2bcc49ddc47b7eb9ced7bf06446b1984746e Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 4 May 2022 16:45:54 +0100
Subject: [PATCH 29/79] Add an embed_ref=2 mode which starts with ref=N and
 edits on the fly.

We compute reference from seq + cigar + MD tag.  This gets used
immediately as it's part of the reference-based encoding in
process_one_read.

If we have inconsistent records where the inferred reference differs,
then we would end up with an invalid embedded reference and decode
errors.  Instead we just accept the first one as we cannot go back and
correct earlier mistakes.  It's a little tricky when the sequence may
contain "N" characters, as that's also the initial value of unassigned
reference, so we have to be careful there.

This also opens the way for handling CRAM files without any MD tags.
In that scenario we just assume there are no SNPs and create a fake MD
tag (just numbers merging match/mismatch and ^N* for deletions).  So
we can now do embedded reference on files with no reference and no
MD.  It's not quite consensus (which would be better), but that would
require a two-pass encoding strategy in place of process_one_read.  So
it's a bit poor on high-error technologies.

Note: when embedding an auto-generated reference for data where MD tags
may be absent, the auto-decoded MD may be incorrect, as the reference we
have embedded is now the sequence as-is rather than the real reference.
---
 cram/cram_encode.c  | 254 +++++++++++++++++++++++++++++++++++++++++---
 cram/cram_io.c      |   8 +-
 cram/cram_structs.h |   2 +
 3 files changed, 250 insertions(+), 14 deletions(-)

diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index d35643a92..828ce089a 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/stat.h>
 #include <math.h>
 #include <inttypes.h>
+#include <ctype.h>
 
 #include "cram.h"
 #include "os.h"
@@ -1438,19 +1439,29 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
             goto_err;
         bam_seq_t *b = c->bams[0];
 
-        char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
-        if (!ref && bam_ref(b) >= 0) {
-            hts_log_error("Failed to load reference #%d", bam_ref(b));
-            return -1;
-        }
-        if ((c->ref_id = bam_ref(b)) >= 0) {
-            c->ref_seq_id = c->ref_id;
-            c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
-            c->ref_start = 1;
-            c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
+        if (fd->embed_ref <= 1) {
+            char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
+            if (!ref && bam_ref(b) >= 0) {
+                hts_log_error("Failed to load reference #%d", bam_ref(b));
+                return -1;
+            }
+            if ((c->ref_id = bam_ref(b)) >= 0) {
+                c->ref_seq_id = c->ref_id;
+                c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
+                c->ref_start = 1;
+                c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
+            }
         } else {
-            c->ref_seq_id = c->ref_id; // FIXME remove one var!
+            // Auto-embed ref.
+            // This starts as 'N' and is amended on-the-fly as we go
+            // based on MD:Z tags.
+            if ((c->ref_id = bam_ref(b)) >= 0) {
+                c->ref_free = 1;
+                if (c->ref) abort();
+                c->ref = NULL;
+            }
         }
+        c->ref_seq_id = c->ref_id;
     } else {
         c->ref_id = bam_ref(c->bams[0]);
         cram_ref_incr(fd->refs, c->ref_id);
@@ -2722,6 +2733,216 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
     return c;
 }
 
+// Returns the next cigar op code: one of the BAM_C* codes,
+// or -1 if no more are present.
+static inline
+int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos,
+                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
+    for(;;) {
+        while (*cig_len == 0) {
+            if (*cig_ind < *ncigar) {
+                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
+                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
+                (*cig_ind)++;
+            } else {
+                return -1;
+            }
+        }
+
+        if (skip[*cig_op]) {
+            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
+            *cig_len = 0;
+            continue;
+        }
+
+        (*cig_len)--;
+        break;
+    }
+
+    return *cig_op;
+}
+
+// Set a base in the computed reference.
+// As we fill this out record by record as we go, and we encode the
+// sequence against the reference we've computed so far, once we set a
+// reference is must never change.  So if the reference inferred by one
+// SEQ+MD differs to the reference inferred by another SEQ+MD, the latter
+// is warned about and the reference remains unchanged to ensure
+// round-trips.
+//
+// In order to spot N->N->G type edits, where "N" and "G" are two inferred
+// ref from two sequences, we use N->n->G and patch up the lowercase n later.
+// Similarly where the data is unvalidated (faked up MD tag) also get
+// assigned lowercase letters.  This prevents false warnings when mixing data
+// with and without MD tags.
+static inline void assign_ref(char *ref, char *set, int pos,
+                              unsigned char base, int validate) {
+    base = base & ~0x20; // fast toupper for ASCII
+#if 1
+    if (!set[pos] || ref[pos] == base) {
+        ref[pos] = base;
+        set[pos] = 1;
+    }
+#else
+    // Optional reporting.  It's 7% additional CPU cost in process_one_read,
+    // and maybe not appropriate anyway given there's nothing we can do to
+    // correct this either than ignore it.  It'd need update to explain the
+    // position too.
+    if (!set[pos] || ref[pos] == base) {
+        ref[pos] = base;
+        set[pos] = validate ? 1 : 2; // actual MD:Z or guesswork/fake
+    } else if (validate && set[pos] == 1) {
+        hts_log_warning("Incompatible MD:Z tags between records");
+    }
+#endif
+}
+
+static int cram_extend_ref(cram_container *c, bam1_t *b) {
+    hts_pos_t end = bam_endpos(b);
+
+    if (!c->ref)
+        c->ref_start = b->core.pos+1;
+        //c->ref_start = 1; // FIXME, needs to be b->core.pos, but fails
+
+    if (end >= c->ref_end) {
+        hts_pos_t old_end = c->ref ? c->ref_end : c->ref_start;
+        c->ref_end = end + 1000 + (end - c->ref_start)*1.5;
+
+        char *r = realloc(c->ref, c->ref_end+1 - c->ref_start);
+        if (!r) return -1;
+        c->ref = r;
+
+        r = realloc(c->ref_set, c->ref_end+1 - c->ref_start);
+        if (!r) return -1;
+        c->ref_set = r;
+
+        memset(c->ref + old_end - c->ref_start, 'N', c->ref_end - old_end);
+        memset(c->ref_set + old_end - c->ref_start, 0, c->ref_end - old_end);
+    }
+
+    return 0;
+}
+
+// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference.
+// Updates ref[] array.
+//
+// Returns >0 on success,
+//          0 on no-MD found,
+//         -1 on failure (eg inconsistent data)
+static int cram_build_ref(bam1_t *b, const uint8_t *MD,
+                          char *ref, char *ref_set,
+                          hts_pos_t ref_start, hts_pos_t ref_len) {
+    uint8_t *seq = bam_get_seq(b);
+    uint32_t *cigar = bam_get_cigar(b);
+    int ncigar = b->core.n_cigar;
+    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
+    kstring_t fake_MD = KS_INITIALIZE;
+    int validate = 1;
+
+    if (!MD || *MD != 'Z') {
+        // Fake it!
+        int i, err = 0;
+        int run_len = 0;
+        for (i = 0; i < ncigar; i++) {
+            switch(cigar[i] & BAM_CIGAR_MASK) {
+            case BAM_CMATCH:
+            case BAM_CEQUAL:
+            case BAM_CDIFF:
+                run_len += cigar[i] >> BAM_CIGAR_SHIFT;
+                break;
+
+            case BAM_CDEL:
+                err |= ksprintf(&fake_MD, "%d", run_len) < 0;
+                run_len = 0;
+                err |= kputc('^', &fake_MD);
+                for (int j = 0; j < cigar[i] >> BAM_CIGAR_SHIFT; j++)
+                    err |= kputc('N', &fake_MD);
+                break;
+            }
+        }
+        if (run_len)
+            err |= ksprintf(&fake_MD, "%d", run_len) < 0;
+        MD = (uint8_t *)fake_MD.s;
+        if (err < 0)
+            return -1;
+
+        validate = 0;
+    } else {
+        MD++;
+    }
+
+    // Walk through MD + seq to generate ref
+    int iseq = 0, next_op;
+    hts_pos_t iref = b->core.pos+1 - ref_start;
+    int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
+    while (iseq < b->core.l_qseq && MD && *MD) {
+        if (isdigit(*MD)) {
+            // match
+            int len = strtol((char *)MD, (char **)&MD, 10);
+            while (iseq < b->core.l_qseq && len) {
+                if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
+                                             &iseq, &cig_ind, &cig_op,
+                                             &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CMATCH &&
+                    next_op != BAM_CEQUAL) {
+                    hts_log_warning("MD:Z and CIGAR are incompatible for "
+                                    "record %s", bam_get_qname(b));
+                    return -1;
+                }
+
+                if (iref < ref_len)
+                    assign_ref(ref, ref_set, iref,
+                               seq_nt16_str[bam_seqi(seq, iseq)], validate);
+                iseq++;
+                iref++;
+                len--;
+            }
+        } else if (*MD == '^') {
+            // deletion
+            MD++;
+            while (*MD && isalpha(*MD)) {
+                if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
+                                             &iseq, &cig_ind, &cig_op,
+                                             &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CDEL) {
+                    hts_log_warning("MD:Z and CIGAR are incompatible");
+                    return -1;
+                }
+
+                if (iref < ref_len)
+                    assign_ref(ref, ref_set, iref, toupper(*MD), validate);
+
+                MD++;
+                iref++;
+            }
+        } else {
+            // substitution
+            if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
+                                         &iseq, &cig_ind, &cig_op,
+                                         &cig_len)) < 0)
+                return -1;
+
+            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
+                hts_log_warning("MD:Z and CIGAR are incompatible");
+                return -1;
+            }
+            if (iref < ref_len)
+                assign_ref(ref, ref_set, iref, toupper(*MD), validate);
+
+            MD++;
+            iref++;
+            iseq++;
+        }
+    }
+
+    ks_free(&fake_MD);
+    return 1;
+}
+
 /*
  * Converts a single bam record into a cram record.
  * Possibly used within a thread.
@@ -2746,16 +2967,23 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 
     // FIXME: multi-ref containers
 
-    ref = c->ref;
     cr->flags       = bam_flag(b);
     cr->len         = bam_seq_len(b);
-    if (!bam_aux_get(b, "MD"))
+    uint8_t *md;
+    if (!(md = bam_aux_get(b, "MD")))
         MD = NULL;
     else
         MD->l = 0;
 
+    if (/*md &&*/ fd->embed_ref == 2) {
+        // Auto-generate and embed ref
+        cram_extend_ref(c, b);
+        cram_build_ref(b, md, c->ref, c->ref_set, c->ref_start, c->ref_end);
+    }
+
     //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
 
+    ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
     cr->ref_id      = bam_ref(b);
     if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
         goto block_err;
diff --git a/cram/cram_io.c b/cram/cram_io.c
index 60a568b7b..e0d203469 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -3639,6 +3639,7 @@ cram_container *cram_new_container(int nrec, int nslice) {
     if (!(c->tags_used = kh_init(m_tagmap)))
         goto err;
     c->refs_used = 0;
+    c->ref_free = 0;
 
     return c;
 
@@ -3711,6 +3712,11 @@ void cram_free_container(cram_container *c) {
         kh_destroy(m_tagmap, c->tags_used);
     }
 
+    if (c->ref_free) {
+        free(c->ref);
+        free(c->ref_set);
+    }
+
     free(c);
 }
 
@@ -4820,7 +4826,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
     }
 
     /* Fix M5 strings */
-    if (fd->refs && !fd->no_ref) {
+    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
         int i;
         for (i = 0; i < hdr->hrecs->nref; i++) {
             sam_hrec_type_t *ty;
diff --git a/cram/cram_structs.h b/cram/cram_structs.h
index ce27bc1a4..e03a34e11 100644
--- a/cram/cram_structs.h
+++ b/cram/cram_structs.h
@@ -473,6 +473,8 @@ struct cram_container {
     uint64_t s_num_bases; // number of bases in this slice
 
     uint32_t n_mapped;    // Number of mapped reads
+    int ref_free;         // whether 'ref' is owned by us and must be freed.
+    char *ref_set;        // same size as ref.  Only set for auto embed_ref
 };
 
 /*

From 764edbd24adf20b938a7f70160802d4cc5846224 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 30 May 2022 14:27:26 +0100
Subject: [PATCH 30/79] Add a cF CRAM specific tag.

This is used to indicate when MD and NM were not present and should
not be regenerated during decode.

Bit 1 is set when MD shouldn't be produced and bit 2 when NM shouldn't
be.  In both cases this tag is only created when embed_ref=2 and MD
and/or NM is absent from the input data.  In this scenario we cannot
reproduce the reference from SEQ+MD and cannot therefore be certain
that the value reproduced is correct.  E.g. if the reference is
produced by consensus alone, then MD is a diff of this read vs
consensus and not this read vs the original reference used by the
aligner.

The cF tag is automatically stripped out during decode, but only with
this version of htslib and above.  Older versions will just emit a
private-space aux tag (which hopefully is harmless except for the
unlikely event of it clashing with another private name-space tool).
---
 cram/cram_decode.c | 18 ++++++++++++++++++
 cram/cram_encode.c | 31 +++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/cram/cram_decode.c b/cram/cram_decode.c
index 51f1b765c..1f8d60f12 100644
--- a/cram/cram_decode.c
+++ b/cram/cram_decode.c
@@ -2037,12 +2037,30 @@ static int cram_decode_aux(cram_fd *fd,
             m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
             if (!m)
                 return -1;
+
             BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
 
             if (!m->codec) return -1;
             r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
             if (r) break;
             cr->aux_size += out_sz + 3;
+
+            // cF CRAM flags.
+            if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
+                // Remove cF tag
+                uint8_t cF = BLOCK_END(s->aux_blk)[-1];
+                BLOCK_SIZE(s->aux_blk) -= out_sz+3;
+                cr->aux_size -= out_sz+3;
+
+                // bit 1 => don't auto-decode MD.
+                // Pretend MD is present verbatim, so we don't auto-generate
+                if ((cF & 1) && has_MD && *has_MD == 0)
+                    *has_MD = 1;
+
+                // bit 1 => don't auto-decode NM
+                if ((cF & 2) && has_NM && *has_NM == 0)
+                    *has_NM = 1;
+            }
         }
     }
 
diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index 828ce089a..72f21c575 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -2229,7 +2229,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
                              cram_slice *s, cram_record *cr,
                              int verbatim_NM, int verbatim_MD,
-                             int NM, kstring_t *MD,
+                             int NM, kstring_t *MD, int cf_tag,
                              int *err) {
     char *aux, *orig, *rg = NULL;
     int aux_size = bam_get_l_aux(b);
@@ -2242,6 +2242,24 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
 
     orig = aux = (char *)bam_aux(b);
 
+
+    // cF:i  => Extra CRAM bit flags.
+    // 1:  Don't auto-decode MD (may be invalid)
+    // 2:  Don't auto-decode NM (may be invalid)
+    if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
+        // Temporary copy of aux so we can ammend it.
+        aux = malloc(aux_size+4);
+        if (!aux)
+            return NULL;
+
+        memcpy(aux, orig, aux_size);
+        aux[aux_size++] = 'c';
+        aux[aux_size++] = 'F';
+        aux[aux_size++] = 'C';
+        aux[aux_size++] = cf_tag;
+        orig = aux;
+    }
+
     // Copy aux keys to td_b and aux values to slice aux blocks
     while (aux - orig < aux_size && aux[0] != 0) {
         int r;
@@ -2604,11 +2622,16 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
     if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
         goto block_err;
 
+    if (orig != (char *)bam_aux(b))
+        free(orig);
+
     if (err) *err = 0;
     return rg;
 
  err:
  block_err:
+    if (orig != (char *)bam_aux(b))
+        free(orig);
     return NULL;
 }
 
@@ -2975,10 +2998,13 @@ static int process_one_read(cram_fd *fd, cram_container *c,
     else
         MD->l = 0;
 
+    int cf_tag = 0;
     if (/*md &&*/ fd->embed_ref == 2) {
         // Auto-generate and embed ref
         cram_extend_ref(c, b);
         cram_build_ref(b, md, c->ref, c->ref_set, c->ref_start, c->ref_end);
+        cf_tag  = MD ? 0 : 1;                   // No MD
+        cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
     }
 
     //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
@@ -3297,7 +3323,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 
     cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
     int err = 0;
-    rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err);
+    rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
+                         cf_tag, &err);
     if (err)
         goto block_err;
 

From e97816474e5facdb0ef23df8be7e446eea474f24 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 1 Jun 2022 14:43:13 +0100
Subject: [PATCH 31/79] Remove unneeded abort check in cram encoding.

Also fixed a potential illegal memory access caused by the return
value of cram_encode_aux, and added more belt and braces memory free
requests for the new embed_ref=2 option.
---
 cram/cram_encode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index 72f21c575..368f6e0b4 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -1457,7 +1457,6 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
             // based on MD:Z tags.
             if ((c->ref_id = bam_ref(b)) >= 0) {
                 c->ref_free = 1;
-                if (c->ref) abort();
                 c->ref = NULL;
             }
         }
@@ -2626,7 +2625,9 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
         free(orig);
 
     if (err) *err = 0;
-    return rg;
+
+    // rg from within bam_aux, not rg from our aux copy.
+    return rg ? (char *)bam_aux(b) + (rg - orig) : NULL;
 
  err:
  block_err:
@@ -2841,6 +2842,7 @@ static int cram_extend_ref(cram_container *c, bam1_t *b) {
 
         memset(c->ref + old_end - c->ref_start, 'N', c->ref_end - old_end);
         memset(c->ref_set + old_end - c->ref_start, 0, c->ref_end - old_end);
+        c->ref_free = 1;
     }
 
     return 0;

From 2978708540a17a66fd83133ed4aca34faa2ff6bd Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 7 Jun 2022 18:22:00 +0100
Subject: [PATCH 32/79] Remove limit on returned size from fai_retrieve()

This was probably a left-over from the transition to 64-bit
positions in HTSlib.  Having the limit in fai_retrieve() caused
very long references to be truncated even though programs like
`samtools faidx` should be able to support them (see issue
samtools/samtools#1660 - samtools faidx fails to retrieve large
scaffolds).

The limit is useful for legacy faidx interfaces that return
the size in an `int *`, so tests for sizes over INT_MAX
have been applied to them.
---
 faidx.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/faidx.c b/faidx.c
index 4b25d3918..f3be5e57c 100644
--- a/faidx.c
+++ b/faidx.c
@@ -731,7 +731,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
     }
 
     s[l] = '\0';
-    *len = l < INT_MAX ? l : INT_MAX;
+    *len = l;
     return s;
 }
 
@@ -784,7 +784,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len)
 {
     hts_pos_t len64;
     char *ret = fai_fetch64(fai, str, &len64);
-    *len = len64; // trunc
+    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
     return ret;
 }
 
@@ -803,7 +803,7 @@ char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) {
 char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
     hts_pos_t len64;
     char *ret = fai_fetchqual64(fai, str, &len64);
-    *len = len64; // trunc
+    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
     return ret;
 }
 
@@ -876,7 +876,7 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p
 {
     hts_pos_t len64;
     char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64);
-    *len = len64;  // trunc
+    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
     return ret;
 }
 
@@ -897,7 +897,7 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int
 {
     hts_pos_t len64;
     char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64);
-    *len = len64;  // trunc
+    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
     return ret;
 }
 

From f46597e7ec497e20903c423298bef61e83d8f6b4 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Fri, 29 Apr 2022 12:03:33 +0100
Subject: [PATCH 33/79] Expose a bit more of the CRAM API.

This is to enable samtools cram2ref.
---
 cram/cram_external.c | 13 +++++++++++++
 htslib/cram.h        | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/cram/cram_external.c b/cram/cram_external.c
index 314826932..098accde9 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -188,6 +188,19 @@ int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) {
     return hdr->num_blocks;
 }
 
+int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h) {
+    return h->ref_base_id;
+}
+
+void cram_slice_hdr_get_coords(cram_block_slice_hdr *h,
+                               int *refid, hts_pos_t *start, hts_pos_t *span) {
+    if (refid)
+        *refid = h->ref_seq_id;
+    if (start)
+        *start = h->ref_seq_start;
+    if (span)
+        *span  = h->ref_seq_span;
+}
 
 /*
  *-----------------------------------------------------------------------------
diff --git a/htslib/cram.h b/htslib/cram.h
index dab666345..afeeb3711 100644
--- a/htslib/cram.h
+++ b/htslib/cram.h
@@ -247,6 +247,46 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out,
 HTSLIB_EXPORT
 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice);
 
+/*
+ *-----------------------------------------------------------------------------
+ * cram slice interrogation
+ */
+
+/*
+ * Returns the number of cram blocks within this slice.
+ */
+HTSLIB_EXPORT
+int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr);
+
+/*
+ * Returns the block content_id for the block containing an embedded reference
+ * sequence.  If none is present, -1 is returned.
+ */
+HTSLIB_EXPORT
+int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h);
+
+/*
+ * Returns slice reference ID, start and span (length) coordinates.
+ * Return parameters may be NULL in which case they are ignored.
+ */
+HTSLIB_EXPORT
+void cram_slice_hdr_get_coords(cram_block_slice_hdr *h,
+                               int *refid, hts_pos_t *start, hts_pos_t *span);
+
+/*
+ * Decodes a slice header from a cram block.
+ * Returns the opaque cram_block_slice_hdr pointer on success,
+ *         NULL on failure.
+ */
+HTSLIB_EXPORT
+cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b);
+
+/*
+ * Frees a cram_block_slice_hdr structure.
+ */
+HTSLIB_EXPORT
+void cram_free_slice_header(cram_block_slice_hdr *hdr);
+
 /*
  *-----------------------------------------------------------------------------
  * cram_io basics

From fee3bbbe58a9bc4af716b1af4934c1f76faa8f56 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Thu, 9 Jun 2022 13:01:48 +0100
Subject: [PATCH 34/79] CVE-2020-36403 affected all older versions of HTSlib

This issue was fixed in 1.11 by PRs #1044 and #1104. It was detected via
fuzz testing (https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=24097)
but the Reproducer Testcase also has an invalid `#CHROM` line which
resulted in an error message in HTSlib versions <= 1.9.

This error message masked the segfault caused by the actual issue, namely
a VCF record whose in-memory representation requires more than 2GiB.
A clean test case produces a segfault all the way back to HTSlib 1.0.
---
 NEWS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index 75d9ce79e..53a0d3c34 100644
--- a/NEWS
+++ b/NEWS
@@ -723,7 +723,7 @@ Bug fixes
 
 * Fixed potential integer overflows in the VCF parser and ensured that
   the total length of FORMAT fields cannot go over 2Gbytes. [fuzz] (#1044,
-  #1104; latter is CVE-2020-36403 affecting HTSlib versions 1.10 to 1.10.2)
+  #1104; latter is CVE-2020-36403 affecting all HTSlib versions up to 1.10.2)
 
 * Download index files atomically in idx_test_and_fetch().  This prevents
   corruption when running parallel jobs on S3 files.  Thanks to John Marshall.

From 1109c8bce1248df001e1b3550a6b1bb58dfaa1c0 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 10 Jun 2022 13:20:51 +0100
Subject: [PATCH 35/79] Cap hts_getline() return value at INT_MAX

On success, hts_getline() returns the length of the string read.
Its return type is int, so when plain int is 32 bits, trouble ensues
for very long lines exceeding 2GiB: the return value wraps to negative
and is misinterpreted as error. Rather than changing the return type
to e.g. ssize_t, clamp the return value for very long lines.

In test/sam.c's test cases, check the return value is indeed the
expected length.
---
 hts.c        | 2 +-
 htslib/hts.h | 2 +-
 test/sam.c   | 6 +++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/hts.c b/hts.c
index d06c10891..0e9257e76 100644
--- a/hts.c
+++ b/hts.c
@@ -1899,7 +1899,7 @@ int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
     case no_compression:
         str->l = 0;
         ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
-        if (ret >= 0) ret = str->l;
+        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
         else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
         else ret = -1;
         break;
diff --git a/htslib/hts.h b/htslib/hts.h
index 801506b95..d354b2e2c 100644
--- a/htslib/hts.h
+++ b/htslib/hts.h
@@ -676,7 +676,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...);
   @param fp         The file handle
   @param delimiter  Unused, but must be '\n' (or KS_SEP_LINE)
   @param str        The line (not including the terminator) is written here
-  @return           Length of the string read;
+  @return           Length of the string read (capped at INT_MAX);
                     -1 on end-of-file; <= -2 on error
 */
 HTSLIB_EXPORT
diff --git a/test/sam.c b/test/sam.c
index cc5bfe77a..49d9210c3 100644
--- a/test/sam.c
+++ b/test/sam.c
@@ -1525,7 +1525,11 @@ static void test_text_file(const char *filename, int nexp)
     if (in) {
         kstring_t str = KS_INITIALIZE;
         int ret, n = 0;
-        while ((ret = hts_getline(in, '\n', &str)) >= 0) n++;
+        while ((ret = hts_getline(in, '\n', &str)) >= 0) {
+            size_t len = strlen(str.s);
+            n++;
+            if (ret != len) fail("hts_getline read length %d (expected %zu)", ret, len);
+        }
         if (ret != -1) fail("hts_getline got an error from %s", filename);
         if (n != nexp) fail("hts_getline read %d lines from %s (expected %d)", n, filename, nexp);
 

From 41f9aaa19ece0f151d10e98423c173d3e95c9c66 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Mon, 13 Jun 2022 12:51:33 +0100
Subject: [PATCH 36/79] Detect ARM Neon support and only build appropriate SIMD
 object files

Add test compilations to detect ARM Neon support to configure.ac and
hts_probe_cc.sh.

If compiler support is present, add rANS_static32x16pr_neon.c to
$(HTSCODECS_SOURCES) in htscodecs_bundled.mk. Fixes #1450.

In htscodecs_bundled.mk, only add rANS_static32x16pr_avx2.c et al
to $(HTSCODECS_SOURCES) if the respective AVX2, AVX512, SSE4 support
is present. As building these files already uses GNU Make-specific
constructs and the $(HTS_CFLAGS_AVX2) variables are either empty or an
option string, this is easily achieved via `$(if $(HTS_CFLAGS_AVX2),...)`.

There is no compiler flag required for Neon, so invent HTS_HAVE_NEON
and use it to control building rANS_static32x16pr_neon.c without adding
any bespoke compilation options for it.
---
 config.mk.in         |  1 +
 configure.ac         | 15 +++++++++++++++
 hts_probe_cc.sh      | 15 +++++++++++++++
 htscodecs_bundled.mk |  7 ++++---
 4 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/config.mk.in b/config.mk.in
index 35392bf0d..82af49850 100644
--- a/config.mk.in
+++ b/config.mk.in
@@ -117,3 +117,4 @@ endif
 HTS_CFLAGS_AVX2 = @hts_cflags_avx2@
 HTS_CFLAGS_AVX512 = @hts_cflags_avx512@
 HTS_CFLAGS_SSE4 = @hts_cflags_sse4@
+HTS_HAVE_NEON = @hts_have_neon@
diff --git a/configure.ac b/configure.ac
index c1578d6e4..a53c08fe3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -127,6 +127,21 @@ AX_CHECK_COMPILE_FLAG([-mavx512f], [
     return *((char *) &b);
   ]])])
 
+dnl Detect ARM Neon availability
+AC_CACHE_CHECK([whether C compiler supports ARM Neon], [hts_cv_have_neon], [
+  AC_COMPILE_IFELSE([
+    AC_LANG_PROGRAM([[
+      #include "arm_neon.h"
+    ]], [[
+      int32x4_t a = vdupq_n_s32(1);
+      int32x4_t b = vaddq_s32(a, a);
+      return *((char *) &b);
+    ]])], [hts_cv_have_neon=yes], [hts_cv_have_neon=no])])
+if test "$hts_cv_have_neon" = yes; then
+  hts_have_neon=yes
+  AC_SUBST([hts_have_neon])
+fi
+
 
 dnl Avoid chicken-and-egg problem where pkg-config supplies the
 dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check
diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh
index 0f6ddede8..905279099 100755
--- a/hts_probe_cc.sh
+++ b/hts_probe_cc.sh
@@ -95,4 +95,19 @@ if run_compiler "$FLAGS" ; then
     echo "HTS_CFLAGS_AVX512 = $FLAGS"
 fi
 
+# Check for neon
+
+rm -f conftest.c
+cat - <<'EOF' > conftest.c
+#include "arm_neon.h"
+int main(int argc, char **argv) {
+    int32x4_t a = vdupq_n_s32(1);
+    int32x4_t b = vaddq_s32(a, a);
+    return *((char *) &b);
+}
+EOF
+if run_compiler "" ; then
+    echo "HTS_HAVE_NEON = yes"
+fi
+
 rm -f conftest.c
diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk
index 4a862f3d1..64192f085 100644
--- a/htscodecs_bundled.mk
+++ b/htscodecs_bundled.mk
@@ -28,9 +28,10 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \
         $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \
         $(HTSPREFIX)htscodecs/htscodecs/pack.c \
         $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \
-	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \
-	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \
-	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \
+	$(if $(HTS_CFLAGS_AVX2),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c) \
+	$(if $(HTS_CFLAGS_AVX512),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c) \
+	$(if $(HTS_CFLAGS_SSE4),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c) \
+	$(if $(HTS_HAVE_NEON),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c) \
 	$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \
         $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \
         $(HTSPREFIX)htscodecs/htscodecs/rle.c \

From 58d9f604a0b9de5be021c9e9a79d1dc5d5892503 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 13 Jun 2022 13:54:49 +0100
Subject: [PATCH 37/79] Add an ARM CI test

Tries to be as strict as possible.  Unfortunately Address Sanitizer
appears to be very slow on this platform at the moment, so has
been left out for now.  It would be good to add it later should
its performance improve.
---
 .cirrus.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/.cirrus.yml b/.cirrus.yml
index 2740ce05e..79aa2f99b 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -140,6 +140,33 @@ rocky_task:
   << : *COMPILE
   << : *TEST
 
+# Arm Linux
+arm_ubuntu_task:
+  name: ubuntu-arm
+  arm_container:
+    image: ubuntu:latest
+    cpu: 2
+    memory: 1G
+
+  environment:
+    LC_ALL: C
+    CIRRUS_CLONE_DEPTH: 1
+    DO_UNTRACKED_FILE_CHECK: yes
+    USE_CONFIG: yes
+    CFLAGS: -g -Wall -O3 -std=c99 -pedantic
+
+  # NB: we could consider building a docker image with these
+  # preinstalled and specifying that instead, to speed up testing.
+  install_script: |
+    apt-get update
+    apt-get install -y --no-install-suggests --no-install-recommends     \
+        ca-certificates clang libc-dev make git autoconf automake        \
+        zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \
+        libdeflate-dev
+
+  << : *COMPILE
+  << : *TEST
+
 #--------------------------------------------------
 # Task: macOS builds
 

From 226c1a813bc5d0582f7e0b0bdb4b3ea9e3ee4ce4 Mon Sep 17 00:00:00 2001
From: Martin Pollard <mp15@sanger.ac.uk>
Date: Tue, 21 Jun 2022 18:06:57 +0100
Subject: [PATCH 38/79] Fix breakend detection and test bcf_set_variant_type
 (PR #1456)

---
 .gitignore                       |   1 +
 Makefile                         |   7 +-
 test/test-bcf_set_variant_type.c | 135 +++++++++++++++++++++++++++++++
 test/test.pl                     |  15 ++++
 vcf.c                            |   9 ++-
 5 files changed, 165 insertions(+), 2 deletions(-)
 create mode 100644 test/test-bcf_set_variant_type.c

diff --git a/.gitignore b/.gitignore
index 1573a5bf7..527bc227f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,6 +56,7 @@ shlib-exports-*.txt
 /test/tabix/FAIL*
 /test/test-bcf-sr
 /test/test-bcf-translate
+/test/test-bcf_set_variant_type
 /test/test_bgzf
 /test/test_expr
 /test/test_index
diff --git a/Makefile b/Makefile
index c8e394830..540fa58f3 100644
--- a/Makefile
+++ b/Makefile
@@ -94,7 +94,8 @@ BUILT_TEST_PROGRAMS = \
 	test/fuzz/hts_open_fuzzer.o \
 	test/test-bcf-translate \
 	test/test-parse-reg \
-	test/test_introspection
+	test/test_introspection \
+	test/test-bcf_set_variant_type
 
 BUILT_THRASH_PROGRAMS = \
 	test/thrash_threads1 \
@@ -649,6 +650,9 @@ test/test-bcf-translate: test/test-bcf-translate.o libhts.a
 test/test_introspection: test/test_introspection.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread
 
+test/test-bcf_set_variant_type: test/test-bcf_set_variant_type.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test-bcf_set_variant_type.o libhts.a $(LIBS) -lpthread
+
 # Extra tests for bundled htscodecs
 test_htscodecs_rans4x8: htscodecs/tests/rans4x8
 	cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x8.test
@@ -723,6 +727,7 @@ test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h)
 test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h)
 test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h)
 test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h)
+test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c
 
 
 test/thrash_threads1: test/thrash_threads1.o libhts.a
diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c
new file mode 100644
index 000000000..fef212dbb
--- /dev/null
+++ b/test/test-bcf_set_variant_type.c
@@ -0,0 +1,135 @@
+/*  test/test-bcf_set_variant_type.c -- bcf_set_variant_type test harness.
+
+    Copyright (C) 2022 Genome Research Ltd.
+
+    Author: Martin Pollard <mp15@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <config.h>
+
+#include <string.h>
+
+#include "../htslib/hts.h"
+#include "../vcf.c"
+
+void error(const char *format, ...)
+{
+    va_list ap;
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+    if (strrchr(format, '\n') == NULL) fputc('\n', stderr);
+    exit(-1);
+}
+
+static void test_bcf_set_variant_type()
+{
+    // Test SNVs
+    bcf_variant_t var1;
+    bcf_set_variant_type("A", "T", &var1);
+    if ( var1.type != VCF_SNP)
+    {
+        error("A -> T was not detected as a SNP");
+    }
+
+    // Test INDEL (both insertion and deletion)
+    bcf_variant_t var2a;
+    bcf_set_variant_type("A", "AA", &var2a);
+    if ( var2a.type != VCF_INDEL)
+    {
+        error("A -> AA was not detected as an INDEL");
+    }
+    bcf_variant_t var2b;
+    bcf_set_variant_type("AA", "A", &var2b);
+    if ( var2b.type != VCF_INDEL)
+    {
+        error("AA -> A was not detected as a INDEL");
+    }
+
+    // Test breakends (bracket after the base, then bracket before the base)
+    bcf_variant_t var3a;
+    bcf_set_variant_type("N", "N]16:33625444]", &var3a);
+    if ( var3a.type != VCF_BND)
+    {
+        error("N]16:33625444] was not detected as a breakend");
+    }
+
+    bcf_variant_t var3b;
+    bcf_set_variant_type("N", "N[16:33625444[", &var3b);
+    if (var3b.type != VCF_BND)
+    {
+        error("N[16:33625444[ was not detected as a breakend");
+    }
+
+    bcf_variant_t var3c;
+    bcf_set_variant_type("N", "]16:33625444]N", &var3c);
+    if ( var3c.type != VCF_BND)
+    {
+        error("]16:33625444]N was not detected as a breakend");
+    }
+
+    bcf_variant_t var3d;
+    bcf_set_variant_type("N", "[16:33625444[N", &var3d);
+    if ( var3d.type != VCF_BND)
+    {
+        error("[16:33625444[N was not detected as a breakend");
+    }
+    // Test special reference alleles
+    bcf_variant_t var4a;
+    bcf_set_variant_type("A", "<NON_REF>", &var4a);
+    if ( var4a.type != VCF_REF)
+    {
+        error("<NON_REF> was not detected as a special reference allele");
+    }
+    bcf_variant_t var4b;
+    bcf_set_variant_type("A", "<*>", &var4b);
+    if ( var4b.type != VCF_REF)
+    {
+        error("<*> was not detected as a special reference allele");
+    }
+    // Test MNP
+    bcf_variant_t var5;
+    bcf_set_variant_type("AA", "TT", &var5);
+    if ( var5.type != VCF_MNP)
+    {
+        error("AA->TT was not detected as a MNP");
+    }
+    // Test Overlapping allele
+    bcf_variant_t var6;
+    bcf_set_variant_type("A", "*", &var6);
+    if ( var6.type != VCF_OVERLAP)
+    {
+        error("A->* was not detected as an overlap");
+    }
+    // Test "." ALT, which is expected to be typed as a reference allele
+    bcf_variant_t var7;
+    bcf_set_variant_type("A", ".", &var7);
+    if ( var7.type != VCF_REF)
+    {
+        error("A->. was not detected as a special reference allele");
+    }
+}
+
+int main(int argc, char **argv)
+{
+    test_bcf_set_variant_type();
+    return 0;
+}
+
diff --git a/test/test.pl b/test/test.pl
index 7a396e22a..514f2508a 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -60,6 +60,7 @@
 test_logging($opts);
 test_plugin_loading($opts);
 test_realn($opts);
+test_bcf_set_variant_type($opts);
 
 print "\nNumber of tests:\n";
 printf "    total   .. %d\n", $$opts{nok}+$$opts{nfailed};
@@ -1055,3 +1056,17 @@ sub test_realn {
     # Revert quality values (using data in ZQ tags)
     test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn02.fa -i $$opts{path}/realn02_exp-a.sam -o -", out => "realn02_exp.sam");
 }
+
+sub test_bcf_set_variant_type
+{
+    my ($opts) = @_;
+    my $test = 'test-bcf_set_variant_type';
+    my $cmd  = "$$opts{path}/test-bcf_set_variant_type";
+    print "$test:\n";
+    print "\t$cmd\n";
+    my ($ret,$out) = _cmd($cmd);
+    if ( $ret ) {
+        print $out;
+        failed($opts,$test);
+    } else { passed($opts,$test); }
+}
diff --git a/vcf.c b/vcf.c
index e1d386c7a..ab2477861 100644
--- a/vcf.c
+++ b/vcf.c
@@ -4179,12 +4179,19 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t
         return;
     }
 
+    // Catch "joined before" breakend case
+    if ( alt[0]==']' || alt[0] == '[' )
+    {
+        var->type = VCF_BND; return;
+    }
+
+    // Iterate through alt characters that match the reference
     const char *r = ref, *a = alt;
     while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
 
     if ( *a && !*r )
     {
-        if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; }
+        if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
         while ( *a ) a++;
         var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
     }

From dd5ee808d58011e58a158c8863aa1e074498a061 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 1 Jul 2022 13:05:47 +0100
Subject: [PATCH 39/79] Don't use `register` in public header file

The `register` storage class specifier has been removed in C++17.
HTSlib itself is a C project so is unaffected, but this header may
be used from third-party C++ projects.
---
 htslib/ksort.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htslib/ksort.h b/htslib/ksort.h
index 755010951..ad19fc47a 100644
--- a/htslib/ksort.h
+++ b/htslib/ksort.h
@@ -88,7 +88,7 @@ typedef struct {
 	int depth;
 } ks_isort_stack_t;
 
-#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; }
 
 #define KSORT_INIT(name, type_t, __sort_lt)	KSORT_INIT_(_ ## name, , type_t, __sort_lt)
 #define KSORT_INIT_STATIC(name, type_t, __sort_lt)	KSORT_INIT_(_ ## name, static klib_unused, type_t, __sort_lt)

From ca34d9e3e7e6e5dd4416fe3cca74cb4088d8888b Mon Sep 17 00:00:00 2001
From: Colin Diesh <colin.diesh@gmail.com>
Date: Mon, 4 Jul 2022 10:07:23 -0600
Subject: [PATCH 40/79] Set tab delimiter in manpage for tabix GFF3 sort (PR
 #1457)

This can help if there are spaces in the GFF3 file e.g. in column 2 or 3
---
 tabix.1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tabix.1 b/tabix.1
index 2c012442c..6950593e5 100644
--- a/tabix.1
+++ b/tabix.1
@@ -169,7 +169,7 @@ The default is 3, which turns on error and warning messages;
 Values higher than 3 produce additional informational and debugging messages.
 .PP
 .SH EXAMPLE
-(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz;
+(grep "^#" in.gff; grep -v "^#" in.gff | sort -t"`printf '\(rst'`" -k1,1 -k4,4n) | bgzip > sorted.gff.gz;
 
 tabix -p gff sorted.gff.gz;
 

From 185521aebda549487cee0124ca20907de2b0fcf5 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 6 Jul 2022 09:52:09 +0100
Subject: [PATCH 41/79] Improve error message when failing to load an index.

If errno is non-zero we now use strerror to report the system error
message when reporting on failure to load an index.

See samtools/samtools#1637 for an example where we believe this would
aid the user to diagnose the problem.
---
 hts.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hts.c b/hts.c
index 0e9257e76..c2c4acb89 100644
--- a/hts.c
+++ b/hts.c
@@ -4588,7 +4588,9 @@ hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
 
     hts_idx_t *idx = idx_read(fnidx);
     if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
-        hts_log_error("Could not load local index file '%s'", fnidx);
+        hts_log_error("Could not load local index file '%s'%s%s", fnidx,
+                      errno ? " : " : "", errno ? strerror(errno) : "");
+
 
     free(local_fnidx);
 

From b0148048382a62113e08e9954dca7c008df3db63 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 12 Apr 2022 17:37:59 +0100
Subject: [PATCH 42/79] Make hfile_s3 refresh AWS credentials on expiry

This is to make HTSlib work better with AWS IAM credentials,
which have a limited lifespan, and so may need to be refreshed.
To allow this, hfile_s3 is made to look for an unofficial
'expiry_time' entry in the AWS_SHARED_CREDENTIALS_FILE.  If
present, the file will be re-read if the current time is within
one minute of the given expiry (new credentials are available
five minutes before expiry, according to
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html).

Currently no effort is made to understand the JSON format
emitted by the AWS security-credentials endpoint.  It's up
to the end user to reformat this into the style of the normal
'.aws/credentials' file.  An example of how this can be done
for one source of credentials on AWS is added to the manual
page.

Fixes bug where parse_ini would append to rather than replace
existing values.

Moves x-amz-security-token to the set of headers updated
via callback, as it can now change when the credentials
are updated.

Includes an implementation of the timegm() function, which
is not portable (e.g. mingw doesn't have it) but needed to convert
the expiry time to a time_t.  This is put in a separate header
so that it can be more easily reused elsewhere if we want.
Includes tests to check that details like leap years and
normalisation work properly.
---
 .gitignore             |   1 +
 Makefile               |   9 +-
 hfile_s3.c             | 214 ++++++++++++++++++++++++++++++-----------
 hts_time_funcs.h       | 167 ++++++++++++++++++++++++++++++++
 htslib-s3-plugin.7     |  80 ++++++++++++++-
 test/test_time_funcs.c | 122 +++++++++++++++++++++++
 6 files changed, 534 insertions(+), 59 deletions(-)
 create mode 100644 hts_time_funcs.h
 create mode 100644 test/test_time_funcs.c

diff --git a/.gitignore b/.gitignore
index 527bc227f..6b58e8439 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,7 @@ shlib-exports-*.txt
 /test/test_realn
 /test/test-regidx
 /test/test_str2int
+/test/test_time_funcs
 /test/test-vcf-api
 /test/test-vcf-sweep
 /test/test_view
diff --git a/Makefile b/Makefile
index 540fa58f3..621e8a11a 100644
--- a/Makefile
+++ b/Makefile
@@ -86,6 +86,7 @@ BUILT_TEST_PROGRAMS = \
 	test/test_realn \
 	test/test-regidx \
 	test/test_str2int \
+	test/test_time_funcs \
 	test/test_view \
 	test/test_index \
 	test/test-vcf-api \
@@ -234,6 +235,7 @@ bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h)
 header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h)
 hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h)
 hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h)
+hts_time_funcs_h = hts_time_funcs.h config.h
 sam_internal_h = sam_internal.h $(htslib_sam_h)
 textutils_internal_h = textutils_internal.h $(htslib_kstring_h)
 thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h)
@@ -421,7 +423,7 @@ hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts
 hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h)
 hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h)
 hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h)
-hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
+hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(hts_time_funcs_h)
 hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h)
 hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h)
 hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c
@@ -563,6 +565,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODEC
 	test/test_kfunc
 	test/test_kstring
 	test/test_str2int
+	test/test_time_funcs
 	test/fieldarith test/fieldarith.sam
 	test/hfile
 	HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR)
@@ -629,6 +632,9 @@ test/test-parse-reg: test/test-parse-reg.o libhts.a
 test/test_str2int: test/test_str2int.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_str2int.o libhts.a $(LIBS) -lpthread
 
+test/test_time_funcs: test/test_time_funcs.o
+	$(CC) $(LDFLAGS) -o $@ test/test_time_funcs.o
+
 test/test_view: test/test_view.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread
 
@@ -720,6 +726,7 @@ test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_s
 test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
 test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h)
 test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h)
+test/test_time_funcs.o: test/test_time_funcs.c $(hts_time_funcs_h)
 test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h)
 test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h)
 test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h)
diff --git a/hfile_s3.c b/hfile_s3.c
index e8e505e2a..c9bed1fe1 100644
--- a/hfile_s3.c
+++ b/hfile_s3.c
@@ -40,6 +40,7 @@ DEALINGS IN THE SOFTWARE.  */
 #endif
 #include "htslib/hts.h"  // for hts_version() and hts_verbose
 #include "htslib/kstring.h"
+#include "hts_time_funcs.h"
 
 typedef struct s3_auth_data {
     kstring_t id;
@@ -49,6 +50,8 @@ typedef struct s3_auth_data {
     kstring_t canonical_query_string;
     kstring_t user_query_string;
     kstring_t host;
+    kstring_t profile;
+    time_t creds_expiry_time;
     char *bucket;
     kstring_t auth_hdr;
     time_t auth_time;
@@ -57,11 +60,12 @@ typedef struct s3_auth_data {
     char date_short[9];
     kstring_t date_html;
     char mode;
-    char *headers[4];
+    char *headers[5];
     int refcount;
 } s3_auth_data;
 
-#define AUTH_LIFETIME 60
+#define AUTH_LIFETIME 60  // Regenerate auth headers if older than this
+#define CREDENTIAL_LIFETIME 60 // Seconds before expiry to reread credentials
 
 #if defined HAVE_COMMONCRYPTO
 
@@ -235,7 +239,10 @@ static void parse_ini(const char *fname, const char *section, ...)
             va_start(args, section);
             while ((akey = va_arg(args, const char *)) != NULL) {
                 kstring_t *avar = va_arg(args, kstring_t *);
-                if (strcmp(key, akey) == 0) { kputs(value, avar); break; }
+                if (strcmp(key, akey) == 0) {
+                    avar->l = 0;
+                    kputs(value, avar);
+                    break; }
             }
             va_end(args);
         }
@@ -270,17 +277,37 @@ static void parse_simple(const char *fname, kstring_t *id, kstring_t *secret)
 
 static int copy_auth_headers(s3_auth_data *ad, char ***hdrs) {
     char **hdr = &ad->headers[0];
+    int idx = 0;
     *hdrs = hdr;
-    *hdr = strdup(ad->date);
-    if (!*hdr) return -1;
-    hdr++;
+
+    hdr[idx] = strdup(ad->date);
+    if (!hdr[idx]) return -1;
+    idx++;
+
+    if (ad->token.l) {
+        kstring_t token_hdr = KS_INITIALIZE;
+        kputs("X-Amz-Security-Token: ", &token_hdr);
+        kputs(ad->token.s, &token_hdr);
+        if (token_hdr.s) {
+            hdr[idx++] = token_hdr.s;
+        } else {
+            goto fail;
+        }
+    }
+
     if (ad->auth_hdr.l) {
-        *hdr = strdup(ad->auth_hdr.s);
-        if (!*hdr) { free(ad->headers[0]); return -1; }
-        hdr++;
+        hdr[idx] = strdup(ad->auth_hdr.s);
+        if (!hdr[idx]) goto fail;
+        idx++;
     }
-    *hdr = NULL;
+
+    hdr[idx] = NULL;
     return 0;
+
+ fail:
+    for (--idx; idx >= 0; --idx)
+        free(hdr[idx]);
+    return -1;
 }
 
 static void free_auth_data(s3_auth_data *ad) {
@@ -288,6 +315,7 @@ static void free_auth_data(s3_auth_data *ad) {
         --ad->refcount;
         return;
     }
+    free(ad->profile.s);
     free(ad->id.s);
     free(ad->token.s);
     free(ad->secret.s);
@@ -301,6 +329,67 @@ static void free_auth_data(s3_auth_data *ad) {
     free(ad);
 }
 
+static time_t parse_rfc3339_date(kstring_t *datetime)
+{
+    int offset = 0;
+    time_t when;
+    int num;
+    char should_be_t = '\0', timezone[10] = { '\0' };
+    unsigned int year, mon, day, hour, min, sec;
+
+    if (!datetime->s)
+        return 0;
+
+    // It should be possible to do this with strptime(), but it seems
+    // to not get on with our feature definitions.
+    num = sscanf(datetime->s, "%4u-%2u-%2u%c%2u:%2u:%2u%9s",
+                 &year, &mon, &day, &should_be_t, &hour, &min, &sec, timezone);
+    if (num < 8)
+        return 0;
+    if (should_be_t != 'T' && should_be_t != 't' && should_be_t != ' ')
+        return 0;
+    struct tm parsed = { sec, min, hour, day, mon - 1, year - 1900, 0, 0, 0 };
+
+    switch (timezone[0]) {
+      case 'Z':
+      case 'z':
+      case '\0':
+          break;
+      case '+':
+      case '-': {
+          unsigned hr_off, min_off;  // only valid if both fields were parsed
+          if (sscanf(timezone + 1, "%2u:%2u", &hr_off, &min_off) == 2) {
+              if (hr_off < 24 && min_off < 60) {
+                  offset = ((int) (hr_off * 60 + min_off)
+                            * (timezone[0] == '+' ? -60 : 60));
+              }
+          }
+          break;
+      }
+      default:
+          return 0;
+    }
+
+    when = hts_time_gm(&parsed);
+    return when >= 0 ? when + offset : 0;
+}
+
+static void refresh_auth_data(s3_auth_data *ad) {
+    // Basically a copy of the AWS_SHARED_CREDENTIALS_FILE part of
+    // setup_auth_data(), but this only reads the authorisation parts.
+    const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE");
+    kstring_t expiry_time = KS_INITIALIZE;
+    parse_ini(v? v : "~/.aws/credentials", ad->profile.s,
+              "aws_access_key_id", &ad->id,
+              "aws_secret_access_key", &ad->secret,
+              "aws_session_token", &ad->token,
+              "expiry_time", &expiry_time, NULL);  // NULL ends the key list
+    if (expiry_time.l) {
+        ad->creds_expiry_time = parse_rfc3339_date(&expiry_time);
+    }
+    ks_free(&expiry_time);
+}
+
 static int auth_header_callback(void *ctx, char ***hdrs) {
     s3_auth_data *ad = (s3_auth_data *) ctx;
 
@@ -320,7 +409,10 @@ static int auth_header_callback(void *ctx, char ***hdrs) {
         return 0;
     }
 
-    if (now - ad->auth_time < AUTH_LIFETIME) {
+    if (ad->creds_expiry_time > 0
+        && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) {
+        refresh_auth_data(ad);
+    } else if (now - ad->auth_time < AUTH_LIFETIME) {
         // Last auth string should still be valid
         *hdrs = NULL;
         return 0;
@@ -499,7 +591,6 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
     s3_auth_data *ad = calloc(1, sizeof(*ad));
     const char *bucket, *path;
     char *escaped = NULL;
-    kstring_t profile = { 0, 0, NULL };
     size_t url_path_pos;
     ptrdiff_t bucket_len;
     int is_https = 1, dns_compliant;
@@ -532,7 +623,7 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
     if (*path == '@') {
         const char *colon = strpbrk(bucket, ":@");
         if (*colon != ':') {
-            urldecode_kput(bucket, colon - bucket, &profile);
+            urldecode_kput(bucket, colon - bucket, &ad->profile);
         }
         else {
             const char *colon2 = strpbrk(&colon[1], ":@");
@@ -554,9 +645,9 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
         if ((v = getenv("AWS_DEFAULT_REGION")) != NULL) kputs(v, &ad->region);
         if ((v = getenv("HTS_S3_HOST")) != NULL) kputs(v, &ad->host);
 
-        if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile);
-        else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile);
-        else kputs("default", &profile);
+        if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &ad->profile);
+        else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &ad->profile);
+        else kputs("default", &ad->profile);
 
         if ((v = getenv("HTS_S3_ADDRESS_STYLE")) != NULL) {
             if (strcasecmp(v, "virtual") == 0) {
@@ -569,13 +660,15 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
 
     if (ad->id.l == 0) {
         kstring_t url_style = KS_INITIALIZE;
+        kstring_t expiry_time = KS_INITIALIZE;
         const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE");
-        parse_ini(v? v : "~/.aws/credentials", profile.s,
+        parse_ini(v? v : "~/.aws/credentials", ad->profile.s,
                   "aws_access_key_id", &ad->id,
                   "aws_secret_access_key", &ad->secret,
                   "aws_session_token", &ad->token,
                   "region", &ad->region,
                   "addressing_style", &url_style,
+                  "expiry_time", &expiry_time,
                   NULL);
 
         if (url_style.l) {
@@ -587,14 +680,23 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
                 address_style = s3_auto;
             }
         }
+        if (expiry_time.l) {
+            // Not a real part of the AWS configuration file, but it allows
+            // support for short-term credentials like those for the IAM
+            // service.  The botocore library uses the key "expiry_time"
+            // internally for this purpose.
+            // See https://github.com/boto/botocore/blob/develop/botocore/credentials.py
+            ad->creds_expiry_time = parse_rfc3339_date(&expiry_time);
+        }
 
         ks_free(&url_style);
+        ks_free(&expiry_time);
     }
 
     if (ad->id.l == 0) {
         kstring_t url_style = KS_INITIALIZE;
         const char *v = getenv("HTS_S3_S3CFG");
-        parse_ini(v? v : "~/.s3cfg", profile.s, "access_key", &ad->id,
+        parse_ini(v? v : "~/.s3cfg", ad->profile.s, "access_key", &ad->id,
                   "secret_key", &ad->secret, "access_token", &ad->token,
                   "host_base", &ad->host,
                   "bucket_location", &ad->region,
@@ -699,13 +801,11 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
         *query_start = 0;
     }
 
-    free(profile.s);
     free(escaped);
 
     return ad;
 
  error:
-    free(profile.s);
     free(escaped);
     free_auth_data(ad);
     return NULL;
@@ -713,23 +813,13 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
 
 static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp)
 {
-    char *header_list[4], **header = header_list;
-
     kstring_t url = { 0, 0, NULL };
-    kstring_t token_hdr = { 0, 0, NULL };
     s3_auth_data *ad = setup_auth_data(s3url, mode, 2, &url);
 
     if (!ad)
         return NULL;
 
-    if (ad->token.l > 0) {
-        kputs("X-Amz-Security-Token: ", &token_hdr);
-        kputs(ad->token.s, &token_hdr);
-        *header++ = token_hdr.s;
-    }
-
-    *header = NULL;
-    hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list,
+    hFILE *fp = hopen(url.s, mode, "va_list", argsp,
                       "httphdr_callback", auth_header_callback,
                       "httphdr_callback_data", ad,
                       "redirect_callback", redirect_endpoint_callback,
@@ -738,12 +828,10 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp)
     if (!fp) goto fail;
 
     free(url.s);
-    free(token_hdr.s);
     return fp;
 
  fail:
     free(url.s);
-    free(token_hdr.s);
     free_auth_data(ad);
     return NULL;
 }
@@ -895,9 +983,8 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten
 }
 
 
-static int update_time(s3_auth_data *ad) {
+static int update_time(s3_auth_data *ad, time_t now) {
     int ret = -1;
-    time_t now = time(NULL);
 #ifdef HAVE_GMTIME_R
     struct tm tm_buffer;
     struct tm *tm = gmtime_r(&now, &tm_buffer);
@@ -988,6 +1075,7 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co
                                         kstring_t *token, int uqs) {
     s3_auth_data *ad = (s3_auth_data *)auth;
     char content_hash[HASH_LENGTH_SHA256];
+    time_t now;
 
     if (request == NULL) {
         // signal to free auth data
@@ -995,9 +1083,15 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co
         return 0;
     }
 
-    if (update_time(ad)) {
+    now = time(NULL);
+
+    if (update_time(ad, now)) {
         return -1;
     }
+    if (ad->creds_expiry_time > 0
+        && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) {
+        refresh_auth_data(ad);
+    }
 
     if (content) {
         hash_string(content->s, content->l, content_hash);
@@ -1045,19 +1139,29 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co
 static int v4_auth_header_callback(void *ctx, char ***hdrs) {
     s3_auth_data *ad = (s3_auth_data *) ctx;
     char content_hash[HASH_LENGTH_SHA256];
-    kstring_t content = {0, 0, NULL};
-    kstring_t authorisation = {0, 0, NULL};
+    kstring_t content = KS_INITIALIZE;
+    kstring_t authorisation = KS_INITIALIZE;
+    kstring_t token_hdr = KS_INITIALIZE;
     char *date_html = NULL;
+    time_t now;
+    int idx;
 
     if (!hdrs) { // Closing connection
         free_auth_data(ad);
         return 0;
     }
 
-    if (update_time(ad)) {
+    now = time(NULL);
+
+    if (update_time(ad, now)) {
         return -1;
     }
 
+    if (ad->creds_expiry_time > 0
+        && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) {
+        refresh_auth_data(ad);
+    }
+
     if (!ad->id.l || !ad->secret.l) {
         return copy_auth_headers(ad, hdrs);
     }
@@ -1083,18 +1187,27 @@ static int v4_auth_header_callback(void *ctx, char ***hdrs) {
     ksprintf(&content, "x-amz-content-sha256: %s", content_hash);
     date_html = strdup(ad->date_html.s);
 
+    if (ad->token.l > 0) {
+        kputs("X-Amz-Security-Token: ", &token_hdr);
+        kputs(ad->token.s, &token_hdr);
+    }
+
     if (content.l == 0 || date_html == NULL) {
         ksfree(&authorisation);
         ksfree(&content);
+        ksfree(&token_hdr);
         free(date_html);
         return -1;
     }
 
     *hdrs = &ad->headers[0];
-    ad->headers[0] = ks_release(&authorisation);
-    ad->headers[1] = date_html;
-    ad->headers[2] = ks_release(&content);
-    ad->headers[3] = NULL;
+    idx = 0;
+    ad->headers[idx++] = ks_release(&authorisation);
+    ad->headers[idx++] = date_html;
+    ad->headers[idx++] = ks_release(&content);
+    if (token_hdr.s)
+        ad->headers[idx++] = ks_release(&token_hdr);
+    ad->headers[idx++] = NULL;
 
     return 0;
 }
@@ -1167,9 +1280,7 @@ static int http_status_errno(int status)
 
 static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) {
     kstring_t url = { 0, 0, NULL };
-    kstring_t token_hdr = { 0, 0, NULL };
 
-    char *header_list[4], **header = header_list;
     s3_auth_data *ad = setup_auth_data(s3url, mode, 4, &url);
     hFILE *fp = NULL;
 
@@ -1180,14 +1291,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) {
     if (ad->mode == 'r') {
         long http_response = 0;
 
-        if (ad->token.l > 0) {
-            kputs("x-amz-security-token: ", &token_hdr);
-            kputs(ad->token.s, &token_hdr);
-            *header++ = token_hdr.s;
-        }
-
-        *header = NULL;
-        fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list,
+        fp = hopen(url.s, mode, "va_list", argsp,
                    "httphdr_callback", v4_auth_header_callback,
                    "httphdr_callback_data", ad,
                    "redirect_callback", redirect_endpoint_callback,
@@ -1204,7 +1308,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) {
                 goto error;
             }
             hclose_abruptly(fp);
-            fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list,
+            fp = hopen(url.s, mode, "va_list", argsp,
                        "httphdr_callback", v4_auth_header_callback,
                        "httphdr_callback_data", ad,
                        "redirect_callback", redirect_endpoint_callback,
@@ -1237,7 +1341,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) {
     }
 
     free(url.s);
-    free(token_hdr.s);
 
     return fp;
 
@@ -1245,7 +1348,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) {
 
     if (fp) hclose_abruptly(fp);
     free(url.s);
-    free(token_hdr.s);
     free_auth_data(ad);
 
     return NULL;
diff --git a/hts_time_funcs.h b/hts_time_funcs.h
new file mode 100644
index 000000000..bc3de14f8
--- /dev/null
+++ b/hts_time_funcs.h
@@ -0,0 +1,167 @@
+/*  hts_time_funcs.h -- Implementations of non-standard time functions
+
+    Copyright (C) 2022 Genome Research Ltd.
+
+    Author: Rob Davies <rmd@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+/*
+  This mainly exists because timegm() is not a standard function, and so
+  cannot be used in portable code.  Unfortunately the standard one (mktime)
+  always takes the local timezone into account so doing a UTC conversion
+  with it involves changing the TZ environment variable, which is rather
+  messy and not likely to go well with threaded code.
+
+  The code here is a much simplified version of the BSD timegm() implementation.
+  It currently rejects dates before 1970, avoiding problems with -ve time_t.
+  It also works strictly in UTC, so doesn't have to worry about tm_isdst
+  which makes the calculation much easier.
+
+  Some of this is derived from BSD sources, for example
+  https://github.com/NetBSD/src/blob/trunk/lib/libc/time/localtime.c
+  which state:
+
+  ** This file is in the public domain, so clarified as of
+  ** 1996-06-05 by Arthur David Olson.
+
+  Non-derived code is copyright as above.
+*/
+
+#include <config.h>
+
+static inline int hts_time_normalise(int *tens, int *units, int base) {
+    if (*units < 0 || *units >= base) {
+        int delta = *units >= 0 ? *units / base : (-1 - (-1 - *units) / base);
+        int64_t tmp = (int64_t) (*tens) + delta;
+        if (tmp < INT_MIN || tmp > INT_MAX) return 1;
+        *tens = tmp;
+        *units -= delta * base;
+    }
+    return 0;
+}
+
+static inline int hts_year_is_leap(int64_t year) {
+    return ((year % 4 == 0) && (year % 100 != 0)) || (year % 400 == 0);
+}
+
+// Number of leap years to start of year
+// Only works for year >= 1.
+static inline int64_t hts_leaps_to_year_start(int64_t year) {
+    --year;
+    return year / 4 - year / 100 + year / 400;
+}
+
+static inline int hts_time_normalise_tm(struct tm *t)
+{
+    const int days_per_mon[2][12] = {
+        { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 },
+        { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }
+    };
+    const int year_days[2] = { 365, 366 };
+    int overflow = 0;
+    int64_t year;
+
+    if (t->tm_sec > 62) {
+        overflow |= hts_time_normalise(&t->tm_min, &t->tm_sec, 60);
+    }
+    overflow |= hts_time_normalise(&t->tm_hour, &t->tm_min,  60);
+    overflow |= hts_time_normalise(&t->tm_mday, &t->tm_hour, 24);
+    overflow |= hts_time_normalise(&t->tm_year, &t->tm_mon,  12);
+    if (overflow)
+        return 1;
+
+    year = (int64_t) t->tm_year + 1900LL;
+    while (t->tm_mday <= 0) {
+        --year;
+        t->tm_mday += year_days[hts_year_is_leap(year + (1 < t->tm_mon))];
+    }
+    while (t->tm_mday > 366) {
+        t->tm_mday -= year_days[hts_year_is_leap(year + (1 < t->tm_mon))];
+        ++year;
+    }
+    for (;;) {
+        int mdays = days_per_mon[hts_year_is_leap(year)][t->tm_mon];
+        if (t->tm_mday <= mdays)
+            break;
+        t->tm_mday -= mdays;
+        t->tm_mon++;
+        if (t->tm_mon >= 12) {
+            year++;
+            t->tm_mon = 0;
+        }
+    }
+    year -= 1900;
+    if (year != t->tm_year) {
+        if (year < INT_MIN || year > INT_MAX)
+            return 1;
+        t->tm_year = year;
+    }
+    return 0;
+}
+
+/**
+ *  Convert broken-down time to an equivalent time_t value
+ *  @param target  Target broken-down time structure
+ *  @return Equivalent time_t value on success; -1 on failure
+ *
+ *  This function first normalises the time in @p target so that the
+ *  structure members are in the valid range.  It then calculates the
+ *  number of seconds (ignoring leap seconds) between midnight Jan 1st 1970
+ *  and the target date.
+ *
+ *  If @p target is outside the range that can be represented in a time_t,
+ *  or tm_year is less than 70 (which would return a negative value) then
+ *  it returns -1 and sets errno to EOVERFLOW.
+ */
+
+static inline time_t hts_time_gm(struct tm *target)
+{
+    const int month_start[2][12] = {
+        { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 },
+        { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335 }
+    };
+    // 64-bit so that extreme tm_year values cannot overflow int arithmetic
+    int64_t year, years_from_epoch, leaps, days, secs;
+
+    if (hts_time_normalise_tm(target) != 0)
+        goto overflow;
+
+    if (target->tm_year < 70)
+        goto overflow;
+
+    year = (int64_t) target->tm_year + 1900;
+    years_from_epoch = year - 1970;
+    leaps = hts_leaps_to_year_start(year) - hts_leaps_to_year_start(1970);
+    days = ((365 * (years_from_epoch - leaps) + 366 * leaps)
+        + month_start[hts_year_is_leap(year)][target->tm_mon]
+        + target->tm_mday - 1);
+    secs = (days * 86400LL
+        + target->tm_hour * 3600
+        + target->tm_min * 60
+        + target->tm_sec);
+    if (sizeof(time_t) < 8 && secs > INT_MAX)
+        goto overflow;
+
+    return (time_t) secs;
+
+ overflow:
+    errno = EOVERFLOW;
+    return (time_t) -1;
+}
diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7
index 279661053..359c0fc35 100644
--- a/htslib-s3-plugin.7
+++ b/htslib-s3-plugin.7
@@ -24,6 +24,21 @@ s3 plugin \- htslib AWS S3 plugin
 .\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 .\" DEALINGS IN THE SOFTWARE.
 .\"
+.
+.\" For code blocks and examples (cf groff's Ultrix-specific man macros)
+.de EX
+
+.  in +\\$1
+.  nf
+.  ft CR
+..
+.de EE
+.  ft
+.  fi
+.  in
+
+..
+
 .SH DESCRIPTION
 The S3 plugin allows htslib file functions to communicate with servers that use
 the AWS S3 protocol.  Files are identified by their bucket and object key in a
@@ -114,14 +129,73 @@ files will be used.  The default file locations are either
 \fI~/.aws/credentials\fR or \fI~/.s3cfg\fR (in that order).
 
 Entries used in aws style credentials file are aws_access_key_id, 
-aws_secret_access_key, aws_session_token, region and addressing_style.  Only the
-first two are usually needed.
+aws_secret_access_key, aws_session_token, region, addressing_style and
+expiry_time (unofficial, see SHORT-LIVED CREDENTIALS below).
+Only the first two are usually needed.
 
 Entries used in s3cmd style config files are access_key, secret_key,
 access_token, host_base, bucket_location and host_bucket. Again only the first
 two are usually needed. The host_bucket option is only used to set a path-style
 URL, see below.
 
+.SH SHORT-LIVED CREDENTIALS
+
+Some cloud identity and access management (IAM) systems can make short-lived
+credentials that allow access to resources.
+These credentials will expire after a time and need to be renewed to
+give continued access.
+To enable this, the S3 plugin allows an \fIexpiry_time\fR entry to be set in the
+\fI.aws/credentials\fR file.
+The value for this entry should be the time when the token expires,
+following the format in RFC3339 section 5.6, which takes the form:
+
+   2012-04-29T05:20:48Z
+
+That is, year - month - day, the letter "T", hour : minute : second.
+The time can be followed by the letter "Z", indicating the UTC timezone,
+or an offset from UTC which is a "+" or "-" sign followed by two digits for
+the hours offset, ":", and two digits for the minutes.
+
+The S3 plugin will attempt to re-read the credentials file up to 1 minute
+before the given expiry time, which means the file needs to be updated with
+new credentials before then.
+As the exact way of doing this can vary between services and IAM providers,
+the S3 plugin expects this to be done by an external user-supplied process.
+This may be achieved by running a program that replaces the file as new
+credentials become available.
+The following script shows how it might be done for AWS instance credentials:
+.EX 2
+#!/bin/sh
+instance='http://169.254.169.254'
+tok_url="$instance/latest/api/token"
+ttl_hdr='X-aws-ec2-metadata-token-ttl-seconds: 10'
+creds_url="$instance/latest/meta-data/iam/security-credentials"
+key1='aws_access_key_id = \(rs(.AccessKeyId)\(rsn'
+key2='aws_secret_access_key = \(rs(.SecretAccessKey)\(rsn'
+key3='aws_session_token = \(rs(.Token)\(rsn'
+key4='expiry_time = \(rs(.Expiration)\(rsn'
+while true; do
+    token=`curl -X PUT -H "$ttl_hdr" "$tok_url"`
+    tok_hdr="X-aws-ec2-metadata-token: $token"
+    role=`curl -H "$tok_hdr" "$creds_url/"`
+    expires='now'
+    ( curl -H "$tok_hdr" "$creds_url/$role" \(rs
+      | jq -r "\(rs"${key1}${key2}${key3}${key4}\(rs"" > credentials.new ) \(rs
+      && mv -f credentials.new credentials \(rs
+      && expires=`grep expiry_time credentials | cut -d ' ' -f 3-`
+    if test $? -ne 0 ; then break ; fi
+    expiry=`date -d "$expires - 3 minutes" '+%s'`
+    now=`date '+%s'`
+    test "$expiry" -gt "$now" && sleep $((($expiry - $now) / 2))
+    sleep 30
+done
+.EE
+
+Note that the \fIexpiry_time\fR key is currently only supported for the
+\fI.aws/credentials\fR file (or the file referred to in the
+.B AWS_SHARED_CREDENTIALS_FILE
+environment variable).
+
 .SH NOTES
 In most cases this plugin transforms the given URL into a virtual host-style
 format e.g. \fIhttps://bucket.host/path/to/file\fR.  A path-style format is used
@@ -136,4 +210,6 @@ host_bucket must \fBnot\fR include the \fB%(bucket).s\fR string.
 .BR htsfile (1)
 .BR samtools (1)
 .PP
+RFC 3339: <https://www.rfc-editor.org/rfc/rfc3339#section-5.6>
+.PP
 htslib website: <http://www.htslib.org/>
diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c
new file mode 100644
index 000000000..e8c2600cc
--- /dev/null
+++ b/test/test_time_funcs.c
@@ -0,0 +1,122 @@
+/*  test_time_funcs.c -- Test time functions
+
+    Copyright (C) 2022 Genome Research Ltd.
+
+    Author: Rob Davies <rmd@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <errno.h>
+#include <time.h>
+
+#include "../hts_time_funcs.h"
+
+int test_normalised(time_t start, time_t end, time_t incr) {
+    time_t i, j;
+    struct tm *utc;
+
+    for (i = start; i < end; i += incr) {
+        utc = gmtime(&i);
+        j = hts_time_gm(utc);
+        if (i != j) {
+            fprintf(stderr,
+                    "hts_time_gm() failed, got %"PRId64" expected %"PRId64"\n",
+                    (int64_t) j, (int64_t) i);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int test_specific(int year, int mon, int mday, int hour, int min, int sec,
+                  time_t expected) {
+    struct tm utc = { sec, min, hour, mday, mon - 1, year - 1900, 0, 0, 0 };
+    time_t res = hts_time_gm(&utc);
+    if (res != expected) {
+        fprintf(stderr,
+                "hts_time_gm() failed for %4d/%02d/%02d %02d:%02d:%02d :"
+                " got %"PRId64" expected %"PRId64"\n",
+                year, mon, mday, hour, min, sec,
+                (int64_t) res, (int64_t) expected);
+        return 1;
+    }
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    int res = 0;
+
+    if (test_normalised(0, INT_MAX - 1000, 1000) != 0)
+        return EXIT_FAILURE;
+    if (sizeof(time_t) >= 8) {
+        if (test_normalised(INT_MAX - 1000, (time_t) INT_MAX * 2, 1000) != 0)
+            return EXIT_FAILURE;
+    }
+
+    // 2022-06-14 12:32:10
+    res |= test_specific(2022, 6, 14, 12, 32, 10, 1655209930);
+    // 2022-06-14 12:32:10
+    res |= test_specific(1993, 9, 10514, 12, 32, 10, 1655209930);
+    // 2020-02-28 12:00:00
+    res |= test_specific(2020, 2, 28, 12, 0, 0, 1582891200);
+    // 2020-02-29 12:00:00
+    res |= test_specific(2020, 2, 29, 12, 0, 0, 1582977600);
+    // 2020-03-01 12:00:00
+    res |= test_specific(2020, 2, 30, 12, 0, 0, 1583064000);
+    // 2020-02-29 12:00:00
+    res |= test_specific(2020, 3, 0, 12, 0, 0, 1582977600);
+    // 2020-02-01 12:00:00
+    res |= test_specific(2019, 14, 1, 12, 0, 0, 1580558400);
+    // 2020-03-01 12:00:00
+    res |= test_specific(2019, 15, 1, 12, 0, 0, 1583064000);
+    // 2021-03-01 12:00:00
+    res |= test_specific(2019, 27, 1, 12, 0, 0, 1614600000);
+    // 2024-02-01 12:00:00
+    res |= test_specific(2019, 62, 1, 12, 0, 0, 1706788800);
+    // 2024-03-01 12:00:00
+    res |= test_specific(2019, 63, 1, 12, 0, 0, 1709294400);
+    // 2020-12-31 23:59:59
+    res |= test_specific(2021, 0, 31, 23, 59, 59, 1609459199);
+    // 2020-03-01 12:00:00
+    res |= test_specific(2021, -9, 1, 12, 0, 0, 1583064000);
+    // 2020-02-01 12:00:00
+    res |= test_specific(2021, -10, 1, 12, 0, 0, 1580558400);
+    // 2019-02-01 12:00:00
+    res |= test_specific(2021, -22, 1, 12, 0, 0, 1549022400);
+    // 1970-01-01 00:00:00
+    res |= test_specific(1970, 1, 1, 0, 0, 0, 0);
+    // 2038-01-19 03:14:07
+    res |= test_specific(1970, 1, 1, 0, 0, INT_MAX, INT_MAX);
+    // 2038-01-19 03:14:07
+    res |= test_specific(2038, 1, 19, 3, 14, 7, INT_MAX);
+    if (sizeof(time_t) < 8) {
+        // 2038-01-19 03:14:08
+        res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) -1);
+    } else {
+        // 2038-01-19 03:14:08
+        res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) INT_MAX + 1);
+    }
+
+    return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}

From 506f47913cbc8ffbc5cdfb28218b00b84e23de80 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 6 Jul 2022 16:26:09 +0100
Subject: [PATCH 43/79] Further improve MM tag consistency checking.

If we have an MM tag with base-type specific coordinates beyond the
end of the sequence as there are too few bases of that type, then we
now detect this within bam_parse_basemod.

This was already checked within bam_next_basemod for forward reads,
but not spotted in reverse complemented ones.

Fixes #1466
---
 sam.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sam.c b/sam.c
index 64c08a43a..a84ac2eaa 100644
--- a/sam.c
+++ b/sam.c
@@ -6289,6 +6289,11 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
                 state->MLstride [mod_num] = stride;
                 state->implicit [mod_num] = implicit;
 
+                if (delta < 0) {
+                    hts_log_error("MM tag refers to bases beyond sequence "
+                                  "length");
+                    return -1;
+                }
                 state->MMcount  [mod_num] = delta;
                 if (b->core.flag & BAM_FREVERSE) {
                     state->MM   [mod_num] = cp+1;

From ff49203760f02f5cf4ede7d045d509970f2f8063 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 29 Jun 2022 10:34:23 +0100
Subject: [PATCH 44/79] Fix and_expr expression bug

Prevent old is_true values from being carried over, which could
cause incorrect results from '&&' expressions.
---
 hts_expr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hts_expr.c b/hts_expr.c
index 599d7a54a..23f7c402b 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -622,8 +622,8 @@ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                     char *str, char **end, hts_expr_val_t *res) {
     if (eq_expr(filt, data, fn, str, end, res)) return -1;
 
-    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
     for (;;) {
+        hts_expr_val_t val = HTS_EXPR_VAL_INIT;
         str = ws(*end);
         if (str[0] == '&' && str[1] == '&') {
             if (eq_expr(filt, data, fn, str+2, end, &val)) return -1;
@@ -640,8 +640,8 @@ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         } else {
             break;
         }
+        hts_expr_val_free(&val);
     }
-    hts_expr_val_free(&val);
 
     return 0;
 }

From 4f9a8ba866bfdcd10cf77d1be2a307f85d28d608 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 29 Jun 2022 11:09:30 +0100
Subject: [PATCH 45/79] Add truth checks to test_expr, and test empty-but-true
 strings

---
 test/test_expr.c | 232 +++++++++++++++++++++++++----------------------
 1 file changed, 124 insertions(+), 108 deletions(-)

diff --git a/test/test_expr.c b/test/test_expr.c
index 606a9b3b5..237b4e1ce 100644
--- a/test/test_expr.c
+++ b/test/test_expr.c
@@ -51,6 +51,12 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) {
         *end = str+5;
         res->is_str = 1;
         kputs("plugh", ks_clear(&res->s));
+    } else if (strncmp(str, "empty-but-true", 14) == 0) {
+        // empty string
+        *end = str+14;
+        res->is_true = 1;
+        res->is_str = 1;
+        kputs("", ks_clear(&res->s));
     } else if (strncmp(str, "empty", 5) == 0) {
         // empty string
         *end = str+5;
@@ -70,6 +76,7 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) {
 }
 
 typedef struct {
+    int truth_val;
     double dval;
     char *sval;
     char *str;
@@ -78,108 +85,112 @@ typedef struct {
 int test(void) {
     // These are all valid expressions that should work
     test_ev tests[] = {
-        {  1, NULL, "1"},
-        {  1, NULL, "+1"},
-        { -1, NULL, "-1"},
-        {  0, NULL, "!7"},
-        {  1, NULL, "!0"},
-        {  1, NULL, "!(!7)"},
-        {  1, NULL, "!!7"},
-
-        {  5, NULL, "2+3"},
-        { -1, NULL, "2+-3"},
-        {  6, NULL, "1+2+3"},
-        {  1, NULL, "-2+3"},
-
-        {  6, NULL, "2*3"},
-        {  6, NULL, "1*2*3"},
-        {  0, NULL, "2*0"},
-
-        {  7, NULL, "(7)"},
-        {  7, NULL, "((7))"},
-        { 21, NULL, "(1+2)*(3+4)"},
-        { 14, NULL, "(4*5)-(-2*-3)"},
-
-        {  1, NULL, "(1+2)*3==9"},
-        {  1, NULL, "(1+2)*3!=8"},
-        {  0, NULL, "(1+2)*3!=9"},
-        {  0, NULL, "(1+2)*3==8"},
-
-        {  0, NULL, "1>2"},
-        {  1, NULL, "1<2"},
-        {  0, NULL, "3<3"},
-        {  0, NULL, "3>3"},
-        {  1, NULL, "9<=9"},
-        {  1, NULL, "9>=9"},
-        {  1, NULL, "2*4==8"},
-        {  1, NULL, "16==0x10"},
-        {  1, NULL, "15<0x10"},
-        {  1, NULL, "17>0x10"},
-        {  0, NULL, "2*4!=8"},
-        {  1, NULL, "4+2<3+4"},
-        {  0, NULL, "4*2<3+4"},
-        {  8, NULL, "4*(2<3)+4"},  // boolean; 4*(1)+4
-
-        {  1, NULL, "(1<2) == (3>2)"},
-        {  1, NULL, "1<2 == 3>2"},
-
-        {  1, NULL, "2 && 1"},
-        {  0, NULL, "2 && 0"},
-        {  0, NULL, "0 && 2"},
-        {  1, NULL, "2 || 1"},
-        {  1, NULL, "2 || 0"},
-        {  1, NULL, "0 || 2"},
-        {  1, NULL, "1 || 2 && 3"},
-        {  1, NULL, "2 && 3 || 1"},
-        {  1, NULL, "0 && 3 || 2"},
-        {  0, NULL, "0 && 3 || 0"},
-
-        {  1, NULL, "3 & 1"},
-        {  2, NULL, "3 & 2"},
-        {  3, NULL, "1 | 2"},
-        {  3, NULL, "1 | 3"},
-        {  7, NULL, "1 | 6"},
-        {  2, NULL, "1 ^ 3"},
-
-        {  1, NULL, "(1^0)&(4^3)"},
-        {  2, NULL, "1 ^(0&4)^ 3"},
-        {  2, NULL, "1 ^ 0&4 ^ 3"},  // precedence, & before ^
-
-        {  6, NULL, "(1|0)^(4|3)"},
-        {  7, NULL, "1 |(0^4)| 3"},
-        {  7, NULL, "1 | 0^4 | 3"},  // precedence, ^ before |
-
-        {  1, NULL, "4 & 2 || 1"},
-        {  1, NULL, "(4 & 2) || 1"},
-        {  0, NULL, "4 & (2 || 1)"},
-        {  1, NULL, "1 || 4 & 2"},
-        {  1, NULL, "1 || (4 & 2)"},
-        {  0, NULL, "(1 || 4) & 2"},
-
-        {  1, NULL, " (2*3)&7  > 4"},
-        {  0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv
-        {  1, NULL, "((2*3)&7) > 4"},  // Python precedence equiv
-        {  1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"},
-
-        {  1, "plugh", "magic"},
-        {  1, "",   "empty"},
-        {  1, NULL, "magic == \"plugh\""},
-        {  1, NULL, "magic != \"xyzzy\""},
-
-        {  1, NULL, "\"abc\" < \"def\""},
-        {  1, NULL, "\"abc\" <= \"abc\""},
-        {  0, NULL, "\"abc\" < \"ab\""},
-        {  0, NULL, "\"abc\" <= \"ab\""},
-
-        {  0, NULL, "\"abc\" > \"def\""},
-        {  1, NULL, "\"abc\" >= \"abc\""},
-        {  1, NULL, "\"abc\" > \"ab\""},
-        {  1, NULL, "\"abc\" >= \"ab\""},
-
-        {  1, NULL, "\"abbc\" =~ \"^a+b+c+$\""},
-        {  0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""},
-        {  1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""},
-        {  1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"},
+        { 1,  1, NULL, "1"},
+        { 1,  1, NULL, "+1"},
+        { 1, -1, NULL, "-1"},
+        { 0,  0, NULL, "!7"},
+        { 1,  1, NULL, "!0"},
+        { 1,  1, NULL, "!(!7)"},
+        { 1,  1, NULL, "!!7"},
+
+        { 1,  5, NULL, "2+3"},
+        { 1, -1, NULL, "2+-3"},
+        { 1,  6, NULL, "1+2+3"},
+        { 1,  1, NULL, "-2+3"},
+
+        { 1,  6, NULL, "2*3"},
+        { 1,  6, NULL, "1*2*3"},
+        { 0,  0, NULL, "2*0"},
+
+        { 1,  7, NULL, "(7)"},
+        { 1,  7, NULL, "((7))"},
+        { 1, 21, NULL, "(1+2)*(3+4)"},
+        { 1, 14, NULL, "(4*5)-(-2*-3)"},
+
+        { 1,  1, NULL, "(1+2)*3==9"},
+        { 1,  1, NULL, "(1+2)*3!=8"},
+        { 0,  0, NULL, "(1+2)*3!=9"},
+        { 0,  0, NULL, "(1+2)*3==8"},
+
+        { 0,  0, NULL, "1>2"},
+        { 1,  1, NULL, "1<2"},
+        { 0,  0, NULL, "3<3"},
+        { 0,  0, NULL, "3>3"},
+        { 1,  1, NULL, "9<=9"},
+        { 1,  1, NULL, "9>=9"},
+        { 1,  1, NULL, "2*4==8"},
+        { 1,  1, NULL, "16==0x10"},
+        { 1,  1, NULL, "15<0x10"},
+        { 1,  1, NULL, "17>0x10"},
+        { 0,  0, NULL, "2*4!=8"},
+        { 1,  1, NULL, "4+2<3+4"},
+        { 0,  0, NULL, "4*2<3+4"},
+        { 1,  8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4
+
+        { 1,  1, NULL, "(1<2) == (3>2)"},
+        { 1,  1, NULL, "1<2 == 3>2"},
+
+        { 1,  1, NULL, "2 && 1"},
+        { 0,  0, NULL, "2 && 0"},
+        { 0,  0, NULL, "0 && 2"},
+        { 1,  1, NULL, "2 || 1"},
+        { 1,  1, NULL, "2 || 0"},
+        { 1,  1, NULL, "0 || 2"},
+        { 1,  1, NULL, "1 || 2 && 3"},
+        { 1,  1, NULL, "2 && 3 || 1"},
+        { 1,  1, NULL, "0 && 3 || 2"},
+        { 0,  0, NULL, "0 && 3 || 0"},
+
+        { 1,  1, NULL, "3 & 1"},
+        { 1,  2, NULL, "3 & 2"},
+        { 1,  3, NULL, "1 | 2"},
+        { 1,  3, NULL, "1 | 3"},
+        { 1,  7, NULL, "1 | 6"},
+        { 1,  2, NULL, "1 ^ 3"},
+
+        { 1,  1, NULL, "(1^0)&(4^3)"},
+        { 1,  2, NULL, "1 ^(0&4)^ 3"},
+        { 1,  2, NULL, "1 ^ 0&4 ^ 3"},  // precedence, & before ^
+
+        { 1,  6, NULL, "(1|0)^(4|3)"},
+        { 1,  7, NULL, "1 |(0^4)| 3"},
+        { 1,  7, NULL, "1 | 0^4 | 3"},  // precedence, ^ before |
+
+        { 1,  1, NULL, "4 & 2 || 1"},
+        { 1,  1, NULL, "(4 & 2) || 1"},
+        { 0,  0, NULL, "4 & (2 || 1)"},
+        { 1,  1, NULL, "1 || 4 & 2"},
+        { 1,  1, NULL, "1 || (4 & 2)"},
+        { 0,  0, NULL, "(1 || 4) & 2"},
+
+        { 1,  1, NULL, " (2*3)&7  > 4"},
+        { 0,  0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv
+        { 1,  1, NULL, "((2*3)&7) > 4"},  // Python precedence equiv
+        { 1,  1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"},
+
+        { 1,  1, "plugh", "magic"},
+        { 1,  1, "",  "empty"},
+        { 1,  1, NULL, "magic == \"plugh\""},
+        { 1,  1, NULL, "magic != \"xyzzy\""},
+
+        { 1,  1, NULL, "\"abc\" < \"def\""},
+        { 1,  1, NULL, "\"abc\" <= \"abc\""},
+        { 0,  0, NULL, "\"abc\" < \"ab\""},
+        { 0,  0, NULL, "\"abc\" <= \"ab\""},
+
+        { 0,  0, NULL, "\"abc\" > \"def\""},
+        { 1,  1, NULL, "\"abc\" >= \"abc\""},
+        { 1,  1, NULL, "\"abc\" > \"ab\""},
+        { 1,  1, NULL, "\"abc\" >= \"ab\""},
+
+        { 1,  1, NULL, "\"abbc\" =~ \"^a+b+c+$\""},
+        { 0,  0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""},
+        { 1,  1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""},
+        { 1,  1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"},
+
+        { 1,  1, "",   "empty-but-true" },
+        { 1,  1, NULL, "1 && empty-but-true && 1" },
+        { 0,  0, NULL, "1 && empty-but-true && 0" },
     };
 
     int i;
@@ -195,13 +206,18 @@ int test(void) {
         }
 
         if (r.is_str && (strcmp(r.s.s, tests[i].sval) != 0
-                         || r.d != tests[i].dval)) {
-            fprintf(stderr, "Failed test: %s == %s, got %s, %f\n",
-                    tests[i].str, tests[i].sval, r.s.s, r.d);
+                         || r.d != tests[i].dval
+                         || r.is_true != tests[i].truth_val)) {
+            fprintf(stderr,
+                    "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n",
+                    tests[i].str, tests[i].sval,
+                    r.is_true ? "true" : "false", r.s.s, r.d);
             return 1;
-        } else if (!r.is_str && r.d != tests[i].dval) {
-            fprintf(stderr, "Failed test: %s == %f, got %f\n",
-                    tests[i].str, tests[i].dval, r.d);
+        } else if (!r.is_str && (r.d != tests[i].dval
+                                 || r.is_true != tests[i].truth_val)) {
+            fprintf(stderr, "Failed test: %s == %f, got %s, %f\n",
+                    tests[i].str, tests[i].dval,
+                    r.is_true ? "true" : "false", r.d);
             return 1;
         }
 

From 494eca899e5f20c71f753a7db31dad2ccf2bb07f Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 29 Jun 2022 12:26:04 +0100
Subject: [PATCH 46/79] Fix unary-not on null strings in expressions

Toggling hts_expr_val::is_true on strings could get it out of
phase with hts_expr_val::d on null strings (which are false),
which caused double-unary-not to give the wrong value.

Instead, make unary not always return false if is_true is true,
so empty-but-true works; and for strings return true for null
ones, and false for non-null.  Numbers are handled as before.
---
 hts_expr.c       |  8 +++++---
 test/test_expr.c | 29 ++++++++++++++++++++++-------
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/hts_expr.c b/hts_expr.c
index 23f7c402b..cdbcec8a3 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -296,10 +296,12 @@ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         res->is_true = res->d != 0;
     } else if (*str == '!') {
         err = unary_expr(filt, data, fn, str+1, end, res);
-        if (res->is_str) {
+        if (res->is_true) {
+            res->d = res->is_true = 0;
             res->is_str = 0;
-            res->d = 0;
-            res->is_true = !res->is_true;
+        } else if (res->is_str) {
+            res->is_str = 0;
+            res->d = res->is_true = (res->s.s == NULL);
         } else {
             res->d = !(int64_t)res->d;
             res->is_true = res->d != 0;
diff --git a/test/test_expr.c b/test/test_expr.c
index 237b4e1ce..673fdf807 100644
--- a/test/test_expr.c
+++ b/test/test_expr.c
@@ -82,6 +82,13 @@ typedef struct {
     char *str;
 } test_ev;
 
+static inline int strcmpnull(const char *a, const char *b) {
+    if (!a && !b) return  0;
+    if (!a &&  b) return -1;
+    if (a  && !b) return  1;
+    return strcmp(a, b);
+}
+
 int test(void) {
     // These are all valid expressions that should work
     test_ev tests[] = {
@@ -188,12 +195,18 @@ int test(void) {
         { 1,  1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""},
         { 1,  1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"},
 
-        { 1,  1, "",   "empty-but-true" },
+        { 1,  1, "",   "empty-but-true"   },
+        { 0,  0, NULL, "!empty-but-true"  },
+        { 1,  1, NULL, "!!empty-but-true" },
         { 1,  1, NULL, "1 && empty-but-true && 1" },
         { 0,  0, NULL, "1 && empty-but-true && 0" },
+
+        { 0,  0, NULL, "null"    },
+        { 1,  1, NULL, "!null"   },
+        { 0,  0, NULL, "!!null", },
     };
 
-    int i;
+    int i, res = 0;
     hts_expr_val_t r;
     for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
         hts_filter_t *filt = hts_filter_init(tests[i].str);
@@ -202,30 +215,32 @@ int test(void) {
         if (hts_filter_eval(filt, NULL, lookup, &r)) {
             fprintf(stderr, "Failed to parse filter string %s\n",
                     tests[i].str);
-            return 1;
+            res = 1;
+            hts_filter_free(filt);
+            continue;
         }
 
-        if (r.is_str && (strcmp(r.s.s, tests[i].sval) != 0
+        if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0
                          || r.d != tests[i].dval
                          || r.is_true != tests[i].truth_val)) {
             fprintf(stderr,
                     "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n",
                     tests[i].str, tests[i].sval,
                     r.is_true ? "true" : "false", r.s.s, r.d);
-            return 1;
+            res = 1;
         } else if (!r.is_str && (r.d != tests[i].dval
                                  || r.is_true != tests[i].truth_val)) {
             fprintf(stderr, "Failed test: %s == %f, got %s, %f\n",
                     tests[i].str, tests[i].dval,
                     r.is_true ? "true" : "false", r.d);
-            return 1;
+            res = 1;
         }
 
         hts_expr_val_free(&r);
         hts_filter_free(filt);
     }
 
-    return 0;
+    return res;
 }
 
 int main(int argc, char **argv) {

From 5e426826d0b9ad5bd3d47cff5e18ae7ab2981de1 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 29 Jun 2022 16:41:18 +0100
Subject: [PATCH 47/79] Set hts_filter_t::is_true in mul_expr() and add_expr()

Ensures that "5 - 5 && 1" and "+5 - 5 && 1" give the same answer.
The latter sets is_true in the unary +, so it has to be reset
after the subtraction.
---
 hts_expr.c       | 4 ++++
 test/test_expr.c | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/hts_expr.c b/hts_expr.c
index cdbcec8a3..b38a006bc 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -352,8 +352,10 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         else
             break;
 
+        res->is_true = res->d != 0;
         str = *end;
     }
+
     hts_expr_val_free(&val);
 
     return 0;
@@ -390,8 +392,10 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         else
             break;
 
+        res->is_true = res->d != 0;
         str = *end;
     }
+
     hts_expr_val_free(&val);
 
     return 0;
diff --git a/test/test_expr.c b/test/test_expr.c
index 673fdf807..0d99c1121 100644
--- a/test/test_expr.c
+++ b/test/test_expr.c
@@ -147,6 +147,8 @@ int test(void) {
         { 1,  1, NULL, "2 && 3 || 1"},
         { 1,  1, NULL, "0 && 3 || 2"},
         { 0,  0, NULL, "0 && 3 || 0"},
+        { 0,  0, NULL, " 5 - 5 && 1"},
+        { 0,  0, NULL, "+5 - 5 && 1"},
 
         { 1,  1, NULL, "3 & 1"},
         { 1,  2, NULL, "3 & 2"},

From 6c733244f5359064f0126e6fbfa9002aafc006fc Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 29 Jun 2022 17:23:10 +0100
Subject: [PATCH 48/79] Ensure is_true is propagated for null-but-true in
 expressions

So "null-but-true" and "null-but-true && 1" return the same value.
---
 hts_expr.c       | 13 ++++++++-----
 test/test_expr.c | 12 ++++++++++++
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/hts_expr.c b/hts_expr.c
index b38a006bc..8d1245350 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -700,12 +700,15 @@ int hts_filter_eval(hts_filter_t *filt,
     }
 
     // Strings evaluate to true.  An empty string is also true, but an
-    // absent (null) string is false.  An empty string has kstring length
-    // of zero, but a pointer as it's nul-terminated.
-    if (res->is_str)
-        res->is_true = res->d = res->s.s != NULL;
-    else
+    // absent (null) string is false, unless overriden by is_true.  An
+    // empty string has kstring length of zero, but a pointer as it's
+    // nul-terminated.
+    if (res->is_str) {
+        res->is_true |= res->s.s != NULL;
+        res->d = res->is_true;
+    } else {
         res->is_true |= res->d != 0;
+    }
 
     return 0;
 }
diff --git a/test/test_expr.c b/test/test_expr.c
index 0d99c1121..8944ca927 100644
--- a/test/test_expr.c
+++ b/test/test_expr.c
@@ -62,6 +62,11 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) {
         *end = str+5;
         res->is_str = 1;
         kputs("", ks_clear(&res->s));
+    } else if (strncmp(str, "null-but-true", 13) == 0) {
+        *end = str+13;
+        res->is_true = 1;
+        res->is_str = 1;
+        ks_clear(&res->s);
     } else if (strncmp(str, "null", 4) == 0) {
         // null string (eg aux:Z tag is absent)
         *end = str+4;
@@ -206,6 +211,13 @@ int test(void) {
         { 0,  0, NULL, "null"    },
         { 1,  1, NULL, "!null"   },
         { 0,  0, NULL, "!!null", },
+
+        { 1,  1, NULL, "null-but-true"   },
+        { 0,  0, NULL, "!null-but-true"  },
+        { 1,  1, NULL, "!!null-but-true" },
+
+        { 0,  0, NULL, "null || 0" },
+        { 1,  1, NULL, "null-but-true && 1" },
     };
 
     int i, res = 0;

From 1fba06c2c086290e21bd50ef6f8244dcde80c764 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 29 Jun 2022 17:35:46 +0100
Subject: [PATCH 49/79] Deprecate hts_filter_eval() in favour of
 hts_filter_eval2().

Due to hts_filter_eval() calling memset() on its res parameter,
it's not possible to pass in an allocated kstring_t in res->s
without leaking memory.  Historically it was also possible
to get away with passing in an uninitialised structure, so not
many assumptions can be made about the contents of res on entry.
In particular, it is not guaranteed that free(res->s.s) would
work.

To ensure the function is being used safely, check that the
string part of *res is NULL on entry and fail if not.
Also added a documentation note about calling hts_expr_val_free()
after hts_filter_eval().

Add hts_filter_eval2() and deprecate hts_filter_eval().  The new
function clears its `res` parameter properly, allowing it to
be called repeatedly a bit more easily than the original.
---
 Makefile          |  2 +-
 hts_expr.c        | 35 ++++++++++++++++++++++++++++++-----
 htslib/hts_expr.h | 22 +++++++++++++++++++++-
 sam.c             |  4 ++--
 test/test_expr.c  |  8 ++++----
 5 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 621e8a11a..88d1f56b6 100644
--- a/Makefile
+++ b/Makefile
@@ -425,7 +425,7 @@ hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h)
 hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h)
 hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(hts_time_funcs_h)
 hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h)
-hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h)
+hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h)
 hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c
 vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
 sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
diff --git a/hts_expr.c b/hts_expr.c
index 8d1245350..74fe85fce 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include <math.h>
 
 #include "htslib/hts_expr.h"
+#include "htslib/hts_log.h"
 #include "textutils_internal.h"
 
 // Could also cache hts_expr_val_t stack here for kstring reuse?
@@ -683,13 +684,11 @@ void hts_filter_free(hts_filter_t *filt) {
     free(filt);
 }
 
-int hts_filter_eval(hts_filter_t *filt,
-                    void *data, hts_expr_sym_func *fn,
-                    hts_expr_val_t *res) {
+static int hts_filter_eval_(hts_filter_t *filt,
+                            void *data, hts_expr_sym_func *fn,
+                            hts_expr_val_t *res) {
     char *end = NULL;
 
-    memset(res, 0, sizeof(*res));
-
     filt->curr_regex = 0;
     if (expression(filt, data, fn, filt->str, &end, res))
         return -1;
@@ -712,3 +711,29 @@ int hts_filter_eval(hts_filter_t *filt,
 
     return 0;
 }
+
+int hts_filter_eval(hts_filter_t *filt,
+                    void *data, hts_expr_sym_func *fn,
+                    hts_expr_val_t *res) {
+    if (res->s.l != 0 || res->s.m != 0 || res->s.s != NULL) {
+        // As *res is cleared below, it's not safe to call this function
+        // with res->s.s set, as memory would be leaked.  It's also not
+        // possible to know is res was initialised correctly, so in
+        // either case we fail.
+        hts_log_error("Results structure must be cleared before calling this function");
+        return -1;
+    }
+
+    memset(res, 0, sizeof(*res));
+
+    return hts_filter_eval_(filt, data, fn, res);
+}
+
+int hts_filter_eval2(hts_filter_t *filt,
+                     void *data, hts_expr_sym_func *fn,
+                     hts_expr_val_t *res) {
+    ks_free(&res->s);
+    memset(res, 0, sizeof(*res));
+
+    return hts_filter_eval_(filt, data, fn, res);
+}
diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h
index d66a8edd8..7e6a9ed2b 100644
--- a/htslib/hts_expr.h
+++ b/htslib/hts_expr.h
@@ -87,11 +87,31 @@ typedef int (hts_expr_sym_func)(void *data, char *str, char **end,
  *  the is_str member.  It can also be explicitly defined to be true even
  *  for a null value.  This may be used to check for the existence of
  *  something, irrespective of whether that something evaluates to zero.
+ *
+ *  @p res must be initialized using HTS_EXPR_VAL_INIT before passing it
+ *  to this function for the first time.
+ */
+HTSLIB_EXPORT
+int hts_filter_eval2(hts_filter_t *filt,
+                     void *data, hts_expr_sym_func *sym_func,
+                     hts_expr_val_t *res);
+
+/// Evaluate a filter expression (derecated API)
+/**
+ *  @copydetails hts_filter_eval2()
+ *
+ *  If calling this function more than once with the same @p res
+ *  parameter, hts_expr_val_free(res) must be used between invocations
+ *  to clear any allocated memory prior to reuse.
+ *
+ *  @deprecated This function has been replaced by hts_filter_eval2(),
+ *              which clears @p res properly itself.
  */
 HTSLIB_EXPORT
 int hts_filter_eval(hts_filter_t *filt,
                     void *data, hts_expr_sym_func *sym_func,
-                    hts_expr_val_t *res);
+                    hts_expr_val_t *res)
+    HTS_DEPRECATED("Please use hts_filter_eval2 instead");
 
 
 #endif /* HTS_EXPR_H */
diff --git a/sam.c b/sam.c
index a84ac2eaa..fccff262b 100644
--- a/sam.c
+++ b/sam.c
@@ -1454,8 +1454,8 @@ static int bam_sym_lookup(void *data, char *str, char **end,
 int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
 {
     hb_pair hb = {h, b};
-    hts_expr_val_t res;
-    if (hts_filter_eval(filt, &hb, bam_sym_lookup, &res)) {
+    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
+    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
         hts_log_error("Couldn't process filter expression");
         hts_expr_val_free(&res);
         return -1;
diff --git a/test/test_expr.c b/test/test_expr.c
index 8944ca927..15e25cf4b 100644
--- a/test/test_expr.c
+++ b/test/test_expr.c
@@ -221,12 +221,12 @@ int test(void) {
     };
 
     int i, res = 0;
-    hts_expr_val_t r;
+    hts_expr_val_t r = HTS_EXPR_VAL_INIT;
     for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
         hts_filter_t *filt = hts_filter_init(tests[i].str);
         if (!filt)
             return 1;
-        if (hts_filter_eval(filt, NULL, lookup, &r)) {
+        if (hts_filter_eval2(filt, NULL, lookup, &r)) {
             fprintf(stderr, "Failed to parse filter string %s\n",
                     tests[i].str);
             res = 1;
@@ -259,9 +259,9 @@ int test(void) {
 
 int main(int argc, char **argv) {
     if (argc > 1) {
-        hts_expr_val_t v;
+        hts_expr_val_t v = HTS_EXPR_VAL_INIT;
         hts_filter_t *filt = hts_filter_init(argv[1]);
-        if (hts_filter_eval(filt, NULL, lookup, &v))
+        if (hts_filter_eval2(filt, NULL, lookup, &v))
             return 1;
 
         if (v.is_str)

From 19c72628618eb1aaeb965ef06d81255fe0af1c17 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 18 Jul 2022 11:34:38 +0100
Subject: [PATCH 50/79] Switch to rockylinux:9

Change rockylinux docker image to rockylinux:9 following
deprecation of rockylinux:latest.

Add perl-FindBin to installation list, as it's now in its own
package.
---
 .cirrus.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 79aa2f99b..3a7b910a5 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -121,7 +121,7 @@ ubuntu_task:
 rocky_task:
   name: rockylinux-gcc
   container:
-    image: rockylinux:latest
+    image: rockylinux:9
     cpu: 2
     memory: 1G
 
@@ -133,9 +133,9 @@ rocky_task:
   # NB: we could consider building a docker image with these
   # preinstalled and specifying that instead, to speed up testing.
   install_script: |
-    yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \
-        bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \
-        diffutils git
+    yum install -y autoconf automake make gcc perl-Data-Dumper perl-FindBin \
+        zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel \
+        ncurses-devel diffutils git
 
   << : *COMPILE
   << : *TEST

From 9562aebaa39265b7c96e6a6f5cf6f78727c41221 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 19 Jul 2022 18:16:45 +0100
Subject: [PATCH 51/79] Improve embed_ref=2 and auto-enable it when missing
 external refs (PR #1449)

* Improve embed_ref=2 when no MD:Z tags are present.

When these tags are present, the reference is inferred so it's much
the same as before.

However when not present, we previously took a first-come first-serve
approach to consensus generation as the just-in-time reference
generation was in the process_one_read loop.  As we'd already updated
the reference and delta-encoded against it, if a later read comes
along with conflicts then we couldn't correct anything.

33ff2bc incorrectly made the assumption that we were being fed a bunch
of bam records and didn't have all the data available up-front.
However this isn't true.  For the sake of threading efficiency, the
encoder buffers up all the BAMs for a container before dispatching the
cram_encode_container call, so we can do a consensus generation stage
prior to the process_one_read loop.  This improves consensus accuracy
and also simplifies the MD:Z using code too as it doesn't have to
worry about inconsistencies any more.

On 10MB worth of ONT data we had the following total file and SEQ sizes:

                   Total      SEQ only
    no_ref         205899946  74416907
    embed_ref=0    135303919  27583376  use external ref
    embed_ref=1    139709759  32008185  embed external ref, from file
    embed_ref=2+MD 139710386  32007405  embed external ref, inferred
    embed_ref=2-MD 149371457  41670856  OLD: first-come first-serve cons
    embed_ref=2-MD 139598781  31898253  NEW: proper consensus

The old embed_ref=2 needed 30% more space to delta-encode seqs,
although it was still a major win over using no_ref mode.  The new
consensus is a significant improvement and it's now a better match
for the sequence (as we'd expect) than the external reference.
Obviously this can't be done when we have MD tags as otherwise we'd
need to store them verbatim which then takes more space up again.

On Illumina NovaSeq we also see a significant reduction to the
embedding overheads:

                   Total      SEQ only
    no_ref         256252015  87171589  48.4s
    embed_ref=0    207793068  38761828  34.7s
    embed_ref=1    219064424  50021186  35.3s
    embed_ref=2-MD 231012254  61929861  46.2s OLD
    embed_ref=2-MD 218014361  48935989  38.9s NEW

The code has also been refactored in a few places to speed it up.
Specifically, with the newer consensus building strategy we no longer
produce fake MD tags and call the earlier algorithm; instead we
simply add directly to the consensus histograms.

TODO: figure out how to enable this automatically, without penalty
when we do have the external reference available.

* Add code to enable embed_ref=2 when external reference isn't found,
  and when converting BAMs with no UR or M5 tags.

This choice is debatable, as it means people may get larger files than
they expect, but it also means the CRAM can be built even if
suboptimal.  A warning is given notifying the user of this action.

If a user explicitly specifies embed_ref=0 then this disables this
automatic code and will turn the warning back into a hard failure.

Also added more checks for handling things like unsorted data or
multi_seq_per_slice mode, neither of which are compatible with
embedding references (both the original =1 mode or =2).

* Work around embed_ref=2 failures when MD is inconsistent.

It's too harsh to simply bail out with invalid data.  Instead we just
nullify the portion of reference computed with this record and drop
back to the consensus method.

Also replaced hts_log_warning messages with hts_log_info, as typically
we either get no messages or a huge proportion, as MD being wrong
quite often means some large scale reprocessing happened.

Tested using soft-clip adjusted covid19 samples where almost every MD
tag was incorrect.

* Make fd->embed_ref a local variable to many functions.

This field can be changed on-the-fly by cram_encode_container when it
discovers it cannot find a reference.  It switches to embed_ref=2 in
this case.  Due to the multi-threaded nature, it is important that all
running code elsewhere is using their embed_ref parameter at the time
the container was created.

This opens up an avenue for downgrading embed_ref again if we
require (eg a reference is absent, but subsequent ones can be found),
but for now we don't support this.
---
 Makefile             |   2 +-
 cram/cram_encode.c   | 572 +++++++++++++++++++++++++------------------
 cram/cram_encode.h   |   3 +-
 cram/cram_external.c |   2 +-
 cram/cram_io.c       |  26 +-
 cram/cram_structs.h  |   1 -
 6 files changed, 361 insertions(+), 245 deletions(-)

diff --git a/Makefile b/Makefile
index 88d1f56b6..dfb666c91 100644
--- a/Makefile
+++ b/Makefile
@@ -447,7 +447,7 @@ textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstr
 
 cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h)
 cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h)
-cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h)
+cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h)
 cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h)
 cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h)
 cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h)
diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index 368f6e0b4..d3dd7a134 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "../sam_internal.h" // for nibble2base
 #include "../htslib/hts.h"
 #include "../htslib/hts_endian.h"
+#include "../textutils_internal.h"
 
 KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
 
@@ -59,7 +60,8 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
 
 static int process_one_read(cram_fd *fd, cram_container *c,
                             cram_slice *s, cram_record *cr,
-                            bam_seq_t *b, int rnum, kstring_t *MD);
+                            bam_seq_t *b, int rnum, kstring_t *MD,
+                            int embed_ref);
 
 /*
  * Returns index of val into key.
@@ -79,7 +81,8 @@ static int sub_idx(char *key, char val) {
  *         NULL on failure
  */
 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
-                                           cram_block_compression_hdr *h) {
+                                           cram_block_compression_hdr *h,
+                                           int embed_ref) {
     cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
     cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
     int i, mc, r = 0;
@@ -159,7 +162,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
                 kh_val(h->preservation_map, k).i = h->qs_seq_orient;
             }
 
-            if (fd->no_ref || fd->embed_ref) {
+            if (fd->no_ref || embed_ref>0) {
                 // Reference Required == No
                 k = kh_put(map, h->preservation_map, "RR", &r);
                 if (-1 == r) return NULL;
@@ -1076,14 +1079,12 @@ static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
  *        -1 on failure
  */
 static int cram_encode_slice(cram_fd *fd, cram_container *c,
-                             cram_block_compression_hdr *h, cram_slice *s) {
+                             cram_block_compression_hdr *h, cram_slice *s,
+                             int embed_ref) {
     int rec, r = 0;
     int64_t last_pos;
-    int embed_ref;
     enum cram_DS_ID id;
 
-    embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
-
     /*
      * Slice external blocks:
      * ID 0 => base calls (insertions, soft-clip)
@@ -1096,7 +1097,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c,
      */
 
     /* Create cram slice header */
-    s->hdr->ref_base_id = embed_ref && s->hdr->ref_seq_span > 0
+    s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
         ? DS_ref
         : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
     s->hdr->record_counter = c->num_records + c->record_counter;
@@ -1124,7 +1125,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c,
     }
 
     // Embedded reference
-    if (embed_ref) {
+    if (embed_ref>0) {
         if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
             return -1;
         s->ref_id = DS_ref; // needed?
@@ -1401,6 +1402,292 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
 // CRAM version >= 3.1
 #define CRAM_ge31(v) ((v) >= 0x301)
 
+// Returns the next non-skipped cigar op code (one of the BAM_C* codes),
+// using up one unit of its length, or -1 if no more are present.
+static inline
+int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
+                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
+    for(;;) {
+        while (*cig_len == 0) {
+            if (*cig_ind < ncigar) {
+                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
+                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
+                (*cig_ind)++;
+            } else {
+                return -1;
+            }
+        }
+        // Skipped op: discard it whole; *spos advances if type bit 0 is set
+        if (skip[*cig_op]) {
+            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
+            *cig_len = 0;
+            continue;
+        }
+        // Otherwise consume one unit of the current op's length.
+        (*cig_len)--;
+        break;
+    }
+
+    return *cig_op;
+}
+
+// Ensure ref and hist cover pos; returns 0 on success, -1 on failure.
+static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
+                             hts_pos_t ref_start, hts_pos_t *ref_end) {
+    if (pos < ref_start)
+        return -1;
+    if (pos < *ref_end)
+        return 0;
+
+    // Grow with slack; *ref_end is only updated once both reallocs succeed
+    hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
+    hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
+
+    char *tmp = realloc(*ref, new_end-ref_start);
+    if (!tmp)
+        return -1;
+    *ref = tmp;
+
+    uint32_t (*tmp5)[5] = realloc(*hist,
+                                  (new_end - ref_start)*sizeof(**hist));
+    if (!tmp5)
+        return -1;
+    *hist = tmp5;
+    *ref_end = new_end;
+
+    // Zero the newly added tail of both arrays
+    old_end -= ref_start;
+    new_end -= ref_start;
+    memset(&(*ref)[old_end],  0,  new_end-old_end);
+    memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
+
+    return 0;
+}
+
+// Walk MD + seq + cigar to fill in ref.  Returns 1 on success, -1 on error.
+static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
+                              hts_pos_t ref_start, hts_pos_t *ref_end,
+                              const uint8_t *MD) {
+    uint8_t *seq = bam_get_seq(b);
+    uint32_t *cigar = bam_get_cigar(b);
+    uint32_t ncigar = b->core.n_cigar;
+    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
+
+    int iseq = 0, next_op;
+    hts_pos_t iref = b->core.pos - ref_start;
+
+    // Skip INS, REF_SKIP, *CLIP, PAD and BACK.
+    static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
+    while (iseq < b->core.l_qseq && *MD) {
+        if (isdigit(*MD)) {
+            // match: the ref bases are copied from the read's own sequence
+            int overflow = 0;
+            int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
+            if (overflow ||
+                extend_ref(ref, hist, iref+ref_start + len,
+                           ref_start, ref_end) < 0)
+                return -1;
+            while (iseq < b->core.l_qseq && len) {
+                // rewrite to have internal loops?
+                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
+                                             &iseq, &cig_ind, &cig_op,
+                                             &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CMATCH &&
+                    next_op != BAM_CEQUAL) {
+                    hts_log_info("MD:Z and CIGAR are incompatible for "
+                                 "record %s", bam_get_qname(b));
+                    return -1;
+                }
+
+                // Short-cut loop over same cigar op for efficiency
+                cig_len++;
+                do {
+                    cig_len--;
+                    (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
+                    iseq++;
+                    len--;
+                } while (cig_len && iseq < b->core.l_qseq && len);
+            }
+            if (len > 0)
+                return -1; // MD is longer than seq
+        } else if (*MD == '^') {
+            // deletion: the ref bases are spelt out in MD after the '^'
+            MD++;
+            while (isalpha(*MD)) {
+                if (extend_ref(ref, hist, iref+ref_start, ref_start,
+                               ref_end) < 0)
+                    return -1;
+                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
+                                             &iseq, &cig_ind, &cig_op,
+                                             &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CDEL) {
+                    hts_log_info("MD:Z and CIGAR are incompatible");
+                    return -1;
+                }
+
+                (*ref)[iref++] = *MD++ & ~0x20;
+            }
+        } else {
+            // substitution: MD holds the ref base, stored with bit 0x20 cleared
+            if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
+                return -1;
+            if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
+                                         &iseq, &cig_ind, &cig_op,
+                                         &cig_len)) < 0)
+                return -1;
+
+            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
+                hts_log_info("MD:Z and CIGAR are incompatible");
+                return -1;
+            }
+
+            (*ref)[iref++] = *MD++ & ~0x20;
+            iseq++;
+        }
+    }
+
+    return 1;
+}
+
+// Append a sequence to a ref/consensus structure.
+// We maintain both an absolute reference (ACGTN where MD:Z is
+// present) and a 5-way frequency array for when no MD:Z is known.
+// We then subsequently convert the 5-way frequencies to a consensus
+// ref in a second pass.
+//
+// Returns >=0 on success,
+//         -1 on failure (eg inconsistent data)
+static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
+                           hts_pos_t ref_start, hts_pos_t *ref_end) {
+    const uint8_t *MD = bam_aux_get(b, "MD");
+    int ret = 0;
+    if (MD && *MD == 'Z') {
+        // We can use MD to directly compute the reference
+        ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
+        // (no local ret here: the fallback below tests ret < 0)
+        if (ret > 0)
+            return ret;
+    }
+
+    // Otherwise we just use SEQ+CIGAR and build a consensus which we later
+    // turn into a fake reference
+    uint32_t *cigar = bam_get_cigar(b);
+    uint32_t ncigar = b->core.n_cigar;
+    uint32_t i, j;
+    hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
+    uint8_t *seq = bam_get_seq(b);
+    for (i = 0; i < ncigar; i++) {
+        switch (bam_cigar_op(cigar[i])) {
+        case BAM_CSOFT_CLIP:
+        case BAM_CINS:
+            iseq += bam_cigar_oplen(cigar[i]);
+            break;
+
+        case BAM_CMATCH:
+        case BAM_CEQUAL:
+        case BAM_CDIFF: {
+            int len = bam_cigar_oplen(cigar[i]);
+            // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4
+            static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
+
+            if (extend_ref(ref, hist, iref+ref_start + len,
+                           ref_start, ref_end) < 0)
+                return -1;
+            if (iseq + len <= b->core.l_qseq) {
+                // Nullify failed MD:Z if appropriate
+                if (ret < 0)
+                    memset(&(*ref)[iref], 0, len);
+
+                for (j = 0; j < len; j++, iref++, iseq++)
+                    (*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
+            } else {
+                // Probably a 2ndary read with seq "*"
+                iseq += len;
+                iref += len;
+            }
+            break;
+        }
+
+        case BAM_CDEL:
+        case BAM_CREF_SKIP:
+            iref += bam_cigar_oplen(cigar[i]);
+        }
+    }
+
+    return 1;
+}
+
+// Automatically generates the reference and stashes it in c->ref, also
+// setting c->ref_start and c->ref_end.
+//
+// If we have MD:Z tags then we use them to directly infer the reference,
+// along with SEQ + CIGAR.  Otherwise we use SEQ/CIGAR only to build up
+// a consensus and then assume the reference as the majority rule.
+//
+// In this latter scenario we need to be wary of auto-generating MD and NM
+// during decode, but that's handled elsewhere via an additional aux tag.
+//
+// Returns 0 on success,
+//        -1 on failure
+static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
+    // TODO: if we can find an external reference then use it, even if the
+    // user told us to do embed_ref=2.
+    char *ref = NULL;
+    uint32_t (*hist)[5] = NULL;
+    hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
+
+    // initial allocation
+    if (extend_ref(&ref, &hist,
+                   c->bams[r1 + s->hdr->num_records-1]->core.pos +
+                   c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
+                   ref_start, &ref_end) < 0)
+        return -1;
+
+    // Add each bam record to the reference/consensus arrays
+    int r2;
+    hts_pos_t last_pos = -1;
+    for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
+        if (c->bams[r1]->core.pos < last_pos) {
+            hts_log_error("Cannot build reference with unsorted data");
+            goto err;
+        }
+        last_pos = c->bams[r1]->core.pos;
+        if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
+            goto err;
+    }
+
+    // Compute the consensus
+    hts_pos_t i;
+    for (i = 0; i < ref_end-ref_start; i++) {
+        if (!ref[i]) {
+            int max_v = 0, max_j = 4, j;
+            for (j = 0; j < 4; j++)
+                // don't call N (j==4) unless no coverage
+                if (max_v < hist[i][j])
+                    max_v = hist[i][j], max_j = j;
+            ref[i] = "ACGTN"[max_j];
+        }
+    }
+    free(hist);
+
+    // Put the reference in place so it appears to be an external
+    // ref file.
+    c->ref       = ref;
+    c->ref_start = ref_start+1;
+    c->ref_end   = ref_end+1;
+
+    return 0;
+
+ err:
+    free(ref);
+    free(hist);
+    return -1;
+}
+
 /*
  * Encodes all slices in a container into blocks.
  * Returns 0 on success
@@ -1411,7 +1698,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     cram_block_compression_hdr *h = c->comp_hdr;
     cram_block *c_hdr;
     int multi_ref = 0;
-    int r1, r2, sn, nref;
+    int r1, r2, sn, nref, embed_ref;
     spare_bams *spares;
 
     if (CRAM_MAJOR_VERS(fd->version) == 1)
@@ -1423,6 +1710,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     /* Cache references up-front if we have unsorted access patterns */
     pthread_mutex_lock(&fd->ref_lock);
     nref = fd->refs->nref;
+    embed_ref = fd->embed_ref;
     pthread_mutex_unlock(&fd->ref_lock);
 
     if (!fd->no_ref && c->refs_used) {
@@ -1439,11 +1727,23 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
             goto_err;
         bam_seq_t *b = c->bams[0];
 
-        if (fd->embed_ref <= 1) {
+        if (embed_ref <= 1) {
             char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
             if (!ref && bam_ref(b) >= 0) {
-                hts_log_error("Failed to load reference #%d", bam_ref(b));
-                return -1;
+                if (c->multi_seq || embed_ref == 0 || !c->pos_sorted) {
+                    hts_log_error("Failed to load reference #%d", bam_ref(b));
+                    return -1;
+                }
+                hts_log_warning("Failed to load reference #%d", bam_ref(b));
+                hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
+                                " reference");
+                if (embed_ref <= 0)
+                    hts_log_warning("NOTE: the CRAM file will be bigger than"
+                                    " using an external reference");
+                pthread_mutex_lock(&fd->ref_lock);
+                embed_ref = fd->embed_ref = 2;
+                pthread_mutex_unlock(&fd->ref_lock);
+                goto auto_ref;
             }
             if ((c->ref_id = bam_ref(b)) >= 0) {
                 c->ref_seq_id = c->ref_id;
@@ -1452,6 +1752,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
                 c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
             }
         } else {
+        auto_ref:
             // Auto-embed ref.
             // This starts as 'N' and is amended on-the-fly as we go
             // based on MD:Z tags.
@@ -1487,6 +1788,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
         // is done within process_one_read().
         kstring_t MD = {0};
 
+        // Embed consensus / MD-generated ref
+        if (embed_ref == 2) {
+            if (cram_generate_reference(c, s, r1) < 0) {
+                hts_log_error("Failed to build reference");
+                return -1;
+            }
+        }
+
         // Iterate through records creating the cram blocks for some
         // fields and just gathering stats for others.
         for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
@@ -1514,7 +1823,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
                 }
             }
 
-            if (process_one_read(fd, c, s, cr, b, r2, &MD) != 0) {
+            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref) != 0) {
                 free(MD.s);
                 return -1;
             }
@@ -1525,6 +1834,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
             if (last_base < cr->aend)
                 last_base = cr->aend;
         }
+
         free(MD.s);
 
         // Process_one_read doesn't add read names as it can change
@@ -1901,7 +2211,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     for (i = 0; i < c->curr_slice; i++) {
         hts_log_info("Encode slice %d", i);
 
-        if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
+        int local_embed_ref =
+            embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
+        if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
             return -1;
     }
 
@@ -1916,7 +2228,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
         h->AP_delta      = c->pos_sorted;
         memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
 
-        if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
+        if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
             return -1;
     }
 
@@ -2757,216 +3069,6 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
     return c;
 }
 
-// Returns the next cigar op code: one of the BAM_C* codes,
-// or -1 if no more are present.
-static inline
-int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos,
-                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
-    for(;;) {
-        while (*cig_len == 0) {
-            if (*cig_ind < *ncigar) {
-                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
-                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
-                (*cig_ind)++;
-            } else {
-                return -1;
-            }
-        }
-
-        if (skip[*cig_op]) {
-            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
-            *cig_len = 0;
-            continue;
-        }
-
-        (*cig_len)--;
-        break;
-    }
-
-    return *cig_op;
-}
-
-// Set a base in the computed reference.
-// As we fill this out record by record as we go, and we encode the
-// sequence against the reference we've computed so far, once we set a
-// reference is must never change.  So if the reference inferred by one
-// SEQ+MD differs to the reference inferred by another SEQ+MD, the latter
-// is warned about and the reference remains unchanged to ensure
-// round-trips.
-//
-// In order to spot N->N->G type edits, where "N" and "G" are two inferred
-// ref from two sequences, we use N->n->G and patch up the lowercase n later.
-// Similarly where the data is unvalidated (faked up MD tag) also get
-// assigned lowercase letters.  This prevents false warnings when mixing data
-// with and without MD tags.
-static inline void assign_ref(char *ref, char *set, int pos,
-                              unsigned char base, int validate) {
-    base = base & ~0x20; // fast toupper for ASCII
-#if 1
-    if (!set[pos] || ref[pos] == base) {
-        ref[pos] = base;
-        set[pos] = 1;
-    }
-#else
-    // Optional reporting.  It's 7% additional CPU cost in process_one_read,
-    // and maybe not appropriate anyway given there's nothing we can do to
-    // correct this either than ignore it.  It'd need update to explain the
-    // position too.
-    if (!set[pos] || ref[pos] == base) {
-        ref[pos] = base;
-        set[pos] = validate ? 1 : 2; // actual MD:Z or guesswork/fake
-    } else if (validate && set[pos] == 1) {
-        hts_log_warning("Incompatible MD:Z tags between records");
-    }
-#endif
-}
-
-static int cram_extend_ref(cram_container *c, bam1_t *b) {
-    hts_pos_t end = bam_endpos(b);
-
-    if (!c->ref)
-        c->ref_start = b->core.pos+1;
-        //c->ref_start = 1; // FIXME, needs to be b->core.pos, but fails
-
-    if (end >= c->ref_end) {
-        hts_pos_t old_end = c->ref ? c->ref_end : c->ref_start;
-        c->ref_end = end + 1000 + (end - c->ref_start)*1.5;
-
-        char *r = realloc(c->ref, c->ref_end+1 - c->ref_start);
-        if (!r) return -1;
-        c->ref = r;
-
-        r = realloc(c->ref_set, c->ref_end+1 - c->ref_start);
-        if (!r) return -1;
-        c->ref_set = r;
-
-        memset(c->ref + old_end - c->ref_start, 'N', c->ref_end - old_end);
-        memset(c->ref_set + old_end - c->ref_start, 0, c->ref_end - old_end);
-        c->ref_free = 1;
-    }
-
-    return 0;
-}
-
-// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference.
-// Updates ref[] array.
-//
-// Returns >0 on success,
-//          0 on no-MD found,
-//         -1 on failure (eg inconsistent data)
-static int cram_build_ref(bam1_t *b, const uint8_t *MD,
-                          char *ref, char *ref_set,
-                          hts_pos_t ref_start, hts_pos_t ref_len) {
-    uint8_t *seq = bam_get_seq(b);
-    uint32_t *cigar = bam_get_cigar(b);
-    int ncigar = b->core.n_cigar;
-    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
-    kstring_t fake_MD = KS_INITIALIZE;
-    int validate = 1;
-
-    if (!MD || *MD != 'Z') {
-        // Fake it!
-        int i, err = 0;
-        int run_len = 0;
-        for (i = 0; i < ncigar; i++) {
-            switch(cigar[i] & BAM_CIGAR_MASK) {
-            case BAM_CMATCH:
-            case BAM_CEQUAL:
-            case BAM_CDIFF:
-                run_len += cigar[i] >> BAM_CIGAR_SHIFT;
-                break;
-
-            case BAM_CDEL:
-                err |= ksprintf(&fake_MD, "%d", run_len) < 0;
-                run_len = 0;
-                err |= kputc('^', &fake_MD);
-                for (int j = 0; j < cigar[i] >> BAM_CIGAR_SHIFT; j++)
-                    err |= kputc('N', &fake_MD);
-                break;
-            }
-        }
-        if (run_len)
-            err |= ksprintf(&fake_MD, "%d", run_len) < 0;
-        MD = (uint8_t *)fake_MD.s;
-        if (err < 0)
-            return -1;
-
-        validate = 0;
-    } else {
-        MD++;
-    }
-
-    // Walk through MD + seq to generate ref
-    int iseq = 0, next_op;
-    hts_pos_t iref = b->core.pos+1 - ref_start;
-    int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
-    while (iseq < b->core.l_qseq && MD && *MD) {
-        if (isdigit(*MD)) {
-            // match
-            int len = strtol((char *)MD, (char **)&MD, 10);
-            while (iseq < b->core.l_qseq && len) {
-                if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
-                                             &iseq, &cig_ind, &cig_op,
-                                             &cig_len)) < 0)
-                    return -1;
-
-                if (next_op != BAM_CMATCH &&
-                    next_op != BAM_CEQUAL) {
-                    hts_log_warning("MD:Z and CIGAR are incompatible for "
-                                    "record %s", bam_get_qname(b));
-                    return -1;
-                }
-
-                if (iref < ref_len)
-                    assign_ref(ref, ref_set, iref,
-                               seq_nt16_str[bam_seqi(seq, iseq)], validate);
-                iseq++;
-                iref++;
-                len--;
-            }
-        } else if (*MD == '^') {
-            // deletion
-            MD++;
-            while (*MD && isalpha(*MD)) {
-                if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
-                                             &iseq, &cig_ind, &cig_op,
-                                             &cig_len)) < 0)
-                    return -1;
-
-                if (next_op != BAM_CDEL) {
-                    hts_log_warning("MD:Z and CIGAR are incompatible");
-                    return -1;
-                }
-
-                if (iref < ref_len)
-                    assign_ref(ref, ref_set, iref, toupper(*MD), validate);
-
-                MD++;
-                iref++;
-            }
-        } else {
-            // substitution
-            if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
-                                         &iseq, &cig_ind, &cig_op,
-                                         &cig_len)) < 0)
-                return -1;
-
-            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
-                hts_log_warning("MD:Z and CIGAR are incompatible");
-                return -1;
-            }
-            if (iref < ref_len)
-                assign_ref(ref, ref_set, iref, toupper(*MD), validate);
-
-            MD++;
-            iref++;
-            iseq++;
-        }
-    }
-
-    ks_free(&fake_MD);
-    return 1;
-}
 
 /*
  * Converts a single bam record into a cram record.
@@ -2977,7 +3079,8 @@ static int cram_build_ref(bam1_t *b, const uint8_t *MD,
  */
 static int process_one_read(cram_fd *fd, cram_container *c,
                             cram_slice *s, cram_record *cr,
-                            bam_seq_t *b, int rnum, kstring_t *MD) {
+                            bam_seq_t *b, int rnum, kstring_t *MD,
+                            int embed_ref) {
     int i, fake_qual = -1, NM = 0;
     char *cp, *rg;
     char *ref, *seq, *qual;
@@ -3001,10 +3104,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
         MD->l = 0;
 
     int cf_tag = 0;
-    if (/*md &&*/ fd->embed_ref == 2) {
-        // Auto-generate and embed ref
-        cram_extend_ref(c, b);
-        cram_build_ref(b, md, c->ref, c->ref_set, c->ref_start, c->ref_end);
+
+    if (embed_ref == 2) {
         cf_tag  = MD ? 0 : 1;                   // No MD
         cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
     }
@@ -3638,9 +3739,12 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
          * The multi_seq var here refers to our intention for the next slice.
          * This slice has already been encoded so we output as-is.
          */
+        pthread_mutex_lock(&fd->ref_lock);
+        int embed_ref = fd->embed_ref;
+        pthread_mutex_unlock(&fd->ref_lock);
         if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
             fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
-            !fd->embed_ref) {
+            embed_ref<=0) {
             if (!c->multi_seq)
                 hts_log_info("Multi-ref enabled for next container");
             multi_seq = 1;
@@ -3698,8 +3802,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
         c->slice_rec = c->curr_rec;
 
         // Have we seen this reference before?
-        if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
-            !fd->unsorted && multi_seq) {
+        if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
+            embed_ref<=0 && !fd->unsorted && multi_seq) {
 
             if (!c->refs_used) {
                 pthread_mutex_lock(&fd->ref_lock);
diff --git a/cram/cram_encode.h b/cram/cram_encode.h
index 7cccae9af..03b8054e8 100644
--- a/cram/cram_encode.h
+++ b/cram/cram_encode.h
@@ -74,7 +74,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b);
  *         NULL on failure
  */
 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
-                                           cram_block_compression_hdr *h);
+                                           cram_block_compression_hdr *h,
+                                           int embed_ref);
 
 /*! INTERNAL:
  * Encodes a slice compression header.
diff --git a/cram/cram_external.c b/cram/cram_external.c
index 098accde9..329f1ec63 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -331,7 +331,7 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out,
         return -1;
     if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0)
         return -1;
-    n_blk = cram_encode_compression_header(in, c, ch);
+    n_blk = cram_encode_compression_header(in, c, ch, in->embed_ref);
     cram_free_compression_header(ch);
 
     /*
diff --git a/cram/cram_io.c b/cram/cram_io.c
index e0d203469..968b7b119 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -3561,7 +3561,7 @@ int cram_load_reference(cram_fd *fd, char *fn) {
 
     if (fn) {
         fd->refs = refs_load_fai(fd->refs, fn,
-                                 !(fd->embed_ref && fd->mode == 'r'));
+                                 !(fd->embed_ref>0 && fd->mode == 'r'));
         fn = fd->refs ? fd->refs->fn : NULL;
         if (!fn)
             ret = -1;
@@ -3712,10 +3712,8 @@ void cram_free_container(cram_container *c) {
         kh_destroy(m_tagmap, c->tags_used);
     }
 
-    if (c->ref_free) {
+    if (c->ref_free)
         free(c->ref);
-        free(c->ref_set);
-    }
 
     free(c);
 }
@@ -4850,7 +4848,21 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
                 if (!(md5 = hts_md5_init()))
                     return -1;
                 ref = cram_get_ref(fd, i, 1, rlen);
-                if (NULL == ref) return -1;
+                if (NULL == ref) {
+                    if (fd->embed_ref == -1) {
+                        // auto embed-ref: switch to embed_ref=2 and stop here
+                        hts_log_warning("No M5 tags present and could not "
+                                        "find reference");
+                        hts_log_warning("Enabling embed_ref=2 option");
+                        hts_log_warning("NOTE: the CRAM file will be bigger "
+                                        "than using an external reference");
+                        pthread_mutex_lock(&fd->ref_lock);
+                        fd->embed_ref = 2;
+                        pthread_mutex_unlock(&fd->ref_lock);
+                        break;
+                    }
+                    return -1;
+                }
                 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
                 hts_md5_update(md5, ref, rlen);
                 hts_md5_final(buf, md5);
@@ -5251,7 +5263,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
     fd->seqs_per_slice = SEQS_PER_SLICE;
     fd->bases_per_slice = BASES_PER_SLICE;
     fd->slices_per_container = SLICE_PER_CNT;
-    fd->embed_ref = 0;
+    fd->embed_ref = -1; // automatic selection
     fd->no_ref = 0;
     fd->ap_delta = 0;
     fd->ignore_md5 = 0;
@@ -5398,7 +5410,7 @@ int cram_write_eof_block(cram_fd *fd) {
         //   block CRC
         cram_block_compression_hdr ch;
         memset(&ch, 0, sizeof(ch));
-        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch);
+        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
 
         c.length = c.comp_hdr_block->byte            // Landmark[0]
             + 5                                      // block struct
diff --git a/cram/cram_structs.h b/cram/cram_structs.h
index e03a34e11..16739c2c6 100644
--- a/cram/cram_structs.h
+++ b/cram/cram_structs.h
@@ -474,7 +474,6 @@ struct cram_container {
 
     uint32_t n_mapped;    // Number of mapped reads
     int ref_free;         // whether 'ref' is owned by us and must be freed.
-    char *ref_set;        // same size as ref.  Only set for auto embed_ref
 };
 
 /*

From c72eee6465147278b9dee97044474bca69f89dcb Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 19 Jul 2022 12:03:35 +0100
Subject: [PATCH 52/79] time_funcs include and dependency adjustments

Fix dependency typo in the Makefile

Remove config.h from hts_time_funcs.h as it's likely to be too
late to include it by the time that file is read.  Add config.h
include to test/test_time_funcs.c instead.

Add extra includes to hts_time_funcs.h so it stands on its own
and doesn't rely on the right headers having been included before
it is.
---
 Makefile               | 4 ++--
 hts_time_funcs.h       | 5 ++++-
 test/test_time_funcs.c | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index dfb666c91..42ad91814 100644
--- a/Makefile
+++ b/Makefile
@@ -235,7 +235,7 @@ bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h)
 header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h)
 hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h)
 hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h)
-hts_time_funcs_h = hts_time_funcs.h config.h
+hts_time_funcs_h = hts_time_funcs.h
 sam_internal_h = sam_internal.h $(htslib_sam_h)
 textutils_internal_h = textutils_internal.h $(htslib_kstring_h)
 thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h)
@@ -726,7 +726,7 @@ test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_s
 test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
 test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h)
 test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h)
-test/test_time_funcs.o: test/test_time_funcs.c $(htslib_time_funcs_h)
+test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h)
 test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h)
 test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h)
 test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h)
diff --git a/hts_time_funcs.h b/hts_time_funcs.h
index bc3de14f8..2a0508412 100644
--- a/hts_time_funcs.h
+++ b/hts_time_funcs.h
@@ -44,7 +44,10 @@ DEALINGS IN THE SOFTWARE.  */
   Non-derived code is copyright as above.
 */
 
-#include <config.h>
+#include <stdint.h>
+#include <limits.h>
+#include <errno.h>
+#include <time.h>
 
 static inline int hts_time_normalise(int *tens, int *units, int base) {
     if (*units < 0 || *units >= base) {
diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c
index e8c2600cc..9ca292f7a 100644
--- a/test/test_time_funcs.c
+++ b/test/test_time_funcs.c
@@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.  */
 
+#include <config.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>

From b7addd31513fd301bc3f44fff9bcdf87239523db Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Wed, 20 Jul 2022 13:08:36 +0100
Subject: [PATCH 53/79] Support 0 coordinate in BCF

The 0 coordinate is valid in VCF specification, but the round-trip
VCF -> BCF -> VCF turns MT:0 into MT:4294967296. Add a check to
detect this overflow.

See #1475 and https://github.com/samtools/bcftools/issues/1753
---
 vcf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vcf.c b/vcf.c
index ab2477861..012db0934 100644
--- a/vcf.c
+++ b/vcf.c
@@ -1413,6 +1413,7 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
     if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
     v->rid  = le_to_i32(x + 8);
     v->pos  = le_to_u32(x + 12);
+    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
     v->rlen = le_to_i32(x + 16);
     v->qual = le_to_float(x + 20);
     v->n_info = le_to_u16(x + 24);

From c5508d59eefb97eac2af6bb00e97b7e64f81f8da Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 21 Jul 2022 15:27:13 +0100
Subject: [PATCH 54/79] Minimal support for CRAM files with missing @RG
 headers.

The SAMtags spec states that RG:Z: lines should match an RG ID
if RG headers are present, but doesn't explicitly *require* them to be
present.  The SAM spec itself recommends that RG headers are present.
Sadly this means CRAM may need to cope with this semantically
inconsistent edge case.

Given CRAM stores RG as an integer data series as an index into the
corresponding header, in much the same way that BAM stores chromosomes
as numeric "tid" values, this makes things challenging.  However CRAM
can also store text tags, so it's possible to round-trip with missing
headers by claiming RG is -1 (unspecified) and then adding a verbatim
RG:Z string tag.  This is perhaps a bit of a CRAM spec loop hole so
it's questionable if this is the correct solution.

This works and is decodable by both htslib and htsjdk, but it'll break
things like cram_transcode_rg as used by samtools cat.  I think this
is a pretty unlikely combination of events.  Note picard's
SamFormatConverter also drops these RG fields.

This code also whinges, *once for each and every problematic alignment
record*, when RG is absent in the SAM header.  It's considerably more
work to track which ones we've warned about before and to track all
that meta-data across threads in a robust manner, plus this really
could be considered to be a poor SAM file.  Were it not for the SAM
spec explicitly permitting such things (even if recommending against
it) I'd reject it outright.  Instead brow-beating the SAM creators
into fixing the headers could be considered to be a positive outcome.

Fixes #1479
---
 cram/cram_encode.c | 47 ++++++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index d3dd7a134..1ba1988f4 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -2534,15 +2534,17 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
  * Encodes auxiliary data. Largely duplicated from above, but done so to
  * keep it simple and avoid a myriad of version ifs.
  *
- * Returns the read-group parsed out of the BAM aux fields on success
+ * Returns the RG header line pointed to by the BAM aux fields on success,
  *         NULL on failure or no rg present, also sets "*err" to non-zero
  */
-static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
-                             cram_slice *s, cram_record *cr,
-                             int verbatim_NM, int verbatim_MD,
-                             int NM, kstring_t *MD, int cf_tag,
-                             int *err) {
-    char *aux, *orig, *rg = NULL;
+static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
+                                      cram_container *c,
+                                      cram_slice *s, cram_record *cr,
+                                      int verbatim_NM, int verbatim_MD,
+                                      int NM, kstring_t *MD, int cf_tag,
+                                      int *err) {
+    char *aux, *orig;
+    sam_hrec_rg_t *brg = NULL;
     int aux_size = bam_get_l_aux(b);
     cram_block *td_b = c->comp_hdr->TD_blk;
     int TD_blk_size = BLOCK_SIZE(td_b), new;
@@ -2577,11 +2579,17 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
 
         // RG:Z
         if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
-            rg = &aux[3];
-            while (*aux++);
-            if (CRAM_MAJOR_VERS(fd->version) >= 4)
-                BLOCK_APPEND(td_b, "RG*", 3);
-            continue;
+            char *rg = &aux[3];
+            brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
+            if (brg) {
+                while (*aux++);
+                if (CRAM_MAJOR_VERS(fd->version) >= 4)
+                    BLOCK_APPEND(td_b, "RG*", 3);
+                continue;
+            } else {
+                // RG:Z tag will be stored verbatim
+                hts_log_warning("Missing @RG header for RG \"%s\"", rg);
+            }
         }
 
         // MD:Z
@@ -2938,8 +2946,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
 
     if (err) *err = 0;
 
-    // rg from within bam_aux, not rg from our aux copy.
-    return rg ? (char *)bam_aux(b) + (rg - orig) : NULL;
+    return brg;
 
  err:
  block_err:
@@ -3082,7 +3089,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
                             bam_seq_t *b, int rnum, kstring_t *MD,
                             int embed_ref) {
     int i, fake_qual = -1, NM = 0;
-    char *cp, *rg;
+    char *cp;
     char *ref, *seq, *qual;
 
     // Any places with N in seq and/or reference can lead to ambiguous
@@ -3426,15 +3433,15 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 
     cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
     int err = 0;
-    rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
-                         cf_tag, &err);
+    sam_hrec_rg_t *brg =
+        cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
+                        cf_tag, &err);
     if (err)
         goto block_err;
 
     /* Read group, identified earlier */
-    if (rg) {
-        sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
-        cr->rg = brg ? brg->id : -1;
+    if (brg) {
+        cr->rg = brg->id;
     } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
         sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
         if (!brg) goto block_err;

From 542dfb81afd6ebef5a5ea5651555f930d43d572e Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 27 Jul 2022 16:12:29 +0100
Subject: [PATCH 55/79] Fix (un)locking bug introduced in commit 9562aeba

---
 cram/cram_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cram/cram_io.c b/cram/cram_io.c
index 968b7b119..bc5fea915 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -4858,7 +4858,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
                                         "than using an external reference");
                         pthread_mutex_lock(&fd->ref_lock);
                         fd->embed_ref = 2;
-                        pthread_mutex_lock(&fd->ref_lock);
+                        pthread_mutex_unlock(&fd->ref_lock);
                         break;
                     }
                     return -1;

From 72dfa6d79fe3a42b81ea35e572031dbd3176b1bd Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 27 Jul 2022 16:54:41 +0100
Subject: [PATCH 56/79] Fix leak of MD5 context when entering embed_ref=2 mode

Delay initialising the context until it's actually needed.
---
 cram/cram_io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cram/cram_io.c b/cram/cram_io.c
index bc5fea915..5d01e1318 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -4845,8 +4845,6 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
                     return -1;
                 }
                 rlen = fd->refs->ref_id[i]->length;
-                if (!(md5 = hts_md5_init()))
-                    return -1;
                 ref = cram_get_ref(fd, i, 1, rlen);
                 if (NULL == ref) {
                     if (fd->embed_ref == -1) {
@@ -4864,6 +4862,8 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
                     return -1;
                 }
                 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
+                if (!(md5 = hts_md5_init()))
+                    return -1;
                 hts_md5_update(md5, ref, rlen);
                 hts_md5_final(buf, md5);
                 hts_md5_destroy(md5);

From 66a7fad49a8d11d6f66a6cf805d7820dece97a53 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 27 Jul 2022 16:28:32 +0100
Subject: [PATCH 57/79] Add an embed_ref=2 auto-mode test

---
 test/test.pl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/test.pl b/test/test.pl
index 514f2508a..455aef18e 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -637,6 +637,14 @@ sub test_view
             testv $opts, "./compare_sam.pl -Baux $md $sam $jsam";
         }
 
+        # embed_ref=2 mode
+        my $ersam = "ce#1000.sam";
+        my $ercram = "ce#1000_er.tmp.cram";
+        my $ersam2 = "${ercram}.sam";
+        testv $opts, "./test_view $tv_args -C -p $ercram $ersam";
+        testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
+        testv $opts, "./compare_sam.pl $ersam $ersam2";
+
         if ($test_view_failures == 0)
         {
             passed($opts, "$sam conversions");

From e92076cd1491545f822d1455c1aa7c4286635541 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 29 Jul 2022 10:26:16 +0100
Subject: [PATCH 58/79] Report HTS_CFLAGS_AVX2 et al in `make print-config`

This information (probed for by configure) may also be of use
to third parties.
---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index 42ad91814..bd020bd21 100644
--- a/Makefile
+++ b/Makefile
@@ -326,6 +326,10 @@ libhts.a: $(LIBHTS_OBJS)
 	-$(RANLIB) $@
 
 print-config:
+	@echo HTS_CFLAGS_AVX2 = $(HTS_CFLAGS_AVX2)
+	@echo HTS_CFLAGS_AVX512 = $(HTS_CFLAGS_AVX512)
+	@echo HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSE4)
+	@echo HTS_HAVE_NEON = $(HTS_HAVE_NEON)
 	@echo LDFLAGS = $(LDFLAGS)
 	@echo LIBHTS_OBJS = $(LIBHTS_OBJS)
 	@echo LIBS = $(LIBS)

From 1858eed9c959a0cbb51b71bd8468dd78ca0f82e8 Mon Sep 17 00:00:00 2001
From: Andrew Whitwham <aw7@sanger.ac.uk>
Date: Wed, 27 Jul 2022 11:31:09 +0100
Subject: [PATCH 59/79] Summer 2022 copyright corrections.

---
 configure.ac         | 2 +-
 cram/cram_external.c | 2 +-
 hfile_s3.c           | 2 +-
 hts.c                | 2 +-
 hts_expr.c           | 2 +-
 htscodecs_bundled.mk | 2 +-
 htslib-s3-plugin.7   | 2 +-
 htslib/bgzf.h        | 2 +-
 htslib/cram.h        | 2 +-
 htslib/hfile.h       | 2 +-
 htslib/hts.h         | 2 +-
 htslib/hts_expr.h    | 2 +-
 htslib/knetfile.h    | 2 +-
 htslib/kstring.h     | 2 +-
 htslib/sam.h         | 2 +-
 sam.c                | 2 +-
 test/sam.c           | 2 +-
 test/test.pl         | 2 +-
 test/test_expr.c     | 2 +-
 test/test_mod.c      | 2 +-
 version.sh           | 2 +-
 21 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/configure.ac b/configure.ac
index a53c08fe3..b848dc633 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,6 @@
 # Configure script for htslib, a C library for high-throughput sequencing data.
 #
-#    Copyright (C) 2015-2021 Genome Research Ltd.
+#    Copyright (C) 2015-2022 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
diff --git a/cram/cram_external.c b/cram/cram_external.c
index 329f1ec63..e88ff838b 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015, 2018-2020 Genome Research Ltd.
+Copyright (c) 2015, 2018-2020, 2022 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
diff --git a/hfile_s3.c b/hfile_s3.c
index c9bed1fe1..ce83875c9 100644
--- a/hfile_s3.c
+++ b/hfile_s3.c
@@ -1,6 +1,6 @@
 /*  hfile_s3.c -- Amazon S3 backend for low-level file streams.
 
-    Copyright (C) 2015-2017, 2019-2021 Genome Research Ltd.
+    Copyright (C) 2015-2017, 2019-2022 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
diff --git a/hts.c b/hts.c
index c2c4acb89..03809a11b 100644
--- a/hts.c
+++ b/hts.c
@@ -1,6 +1,6 @@
 /*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
 
-    Copyright (C) 2008, 2009, 2012-2021 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2012-2022 Genome Research Ltd.
     Copyright (C) 2012, 2013 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
diff --git a/hts_expr.c b/hts_expr.c
index 74fe85fce..21d768d67 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -1,6 +1,6 @@
 /*  hts_expr.c -- filter expression parsing and processing.
 
-    Copyright (C) 2020-2021 Genome Research Ltd.
+    Copyright (C) 2020-2022 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk
index 64192f085..91a9c39e9 100644
--- a/htscodecs_bundled.mk
+++ b/htscodecs_bundled.mk
@@ -1,6 +1,6 @@
 # Makefile fragment to add settings needed when bundling htscodecs functions
 #
-#    Copyright (C) 2021 Genome Research Ltd.
+#    Copyright (C) 2021-2022 Genome Research Ltd.
 #
 #    Author: Rob Davies <rmd@sanger.ac.uk>
 #
diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7
index 359c0fc35..d70626832 100644
--- a/htslib-s3-plugin.7
+++ b/htslib-s3-plugin.7
@@ -2,7 +2,7 @@
 .SH NAME
 s3 plugin \- htslib AWS S3 plugin
 .\"
-.\" Copyright (C) 2021 Genome Research Ltd.
+.\" Copyright (C) 2021-2022 Genome Research Ltd.
 .\"
 .\" Author: Andrew Whitwham <aw7@sanger.ac.uk>
 .\"
diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index 24d787bdf..c4ba85679 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -3,7 +3,7 @@
 /*
    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                  2011, 2012 Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019 Genome Research Ltd
+   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022 Genome Research Ltd
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
diff --git a/htslib/cram.h b/htslib/cram.h
index afeeb3711..8dc6fe1b3 100644
--- a/htslib/cram.h
+++ b/htslib/cram.h
@@ -1,7 +1,7 @@
 /// @file htslib/cram.h
 /// CRAM format-specific API functions.
 /*
-    Copyright (C) 2015, 2016, 2018-2020 Genome Research Ltd.
+    Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
diff --git a/htslib/hfile.h b/htslib/hfile.h
index 92b789acd..6e3a2a22a 100644
--- a/htslib/hfile.h
+++ b/htslib/hfile.h
@@ -1,7 +1,7 @@
 /// @file htslib/hfile.h
 /// Buffered low-level input/output streams.
 /*
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2022 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
diff --git a/htslib/hts.h b/htslib/hts.h
index d354b2e2c..5fb968fd0 100644
--- a/htslib/hts.h
+++ b/htslib/hts.h
@@ -1,7 +1,7 @@
 /// @file htslib/hts.h
 /// Format-neutral I/O, indexing, and iterator API functions.
 /*
-    Copyright (C) 2012-2021 Genome Research Ltd.
+    Copyright (C) 2012-2022 Genome Research Ltd.
     Copyright (C) 2010, 2012 Broad Institute.
     Portions copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3@live.co.uk>
 
diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h
index 7e6a9ed2b..2abdddc24 100644
--- a/htslib/hts_expr.h
+++ b/htslib/hts_expr.h
@@ -1,6 +1,6 @@
 /*  expr.c -- filter expression parsing and processing.
 
-    Copyright (C) 2020 Genome Research Ltd.
+    Copyright (C) 2020, 2022 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
diff --git a/htslib/knetfile.h b/htslib/knetfile.h
index cfddd6b67..0f2adec83 100644
--- a/htslib/knetfile.h
+++ b/htslib/knetfile.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2008, 2012, 2014, 2021 Genome Research Ltd (GRL).
+   Copyright (c) 2008, 2012, 2014, 2021-2022 Genome Research Ltd (GRL).
                  2010 by Attractive Chaos <attractor@live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
diff --git a/htslib/kstring.h b/htslib/kstring.h
index 09bc9e3d9..53a19806d 100644
--- a/htslib/kstring.h
+++ b/htslib/kstring.h
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2013-2014, 2016, 2018-2020 Genome Research Ltd.
+   Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
diff --git a/htslib/sam.h b/htslib/sam.h
index f0a191a28..5f8c0a554 100644
--- a/htslib/sam.h
+++ b/htslib/sam.h
@@ -1,7 +1,7 @@
 /// @file htslib/sam.h
 /// High-level SAM/BAM/CRAM sequence file operations.
 /*
-    Copyright (C) 2008, 2009, 2013-2021 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013-2022 Genome Research Ltd.
     Copyright (C) 2010, 2012, 2013 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
diff --git a/sam.c b/sam.c
index fccff262b..3d479e1ed 100644
--- a/sam.c
+++ b/sam.c
@@ -1,6 +1,6 @@
 /*  sam.c -- SAM and BAM file I/O and manipulation.
 
-    Copyright (C) 2008-2010, 2012-2021 Genome Research Ltd.
+    Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd.
     Copyright (C) 2010, 2012, 2013 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
diff --git a/test/sam.c b/test/sam.c
index 49d9210c3..036349f2b 100644
--- a/test/sam.c
+++ b/test/sam.c
@@ -1,6 +1,6 @@
 /*  test/sam.c -- SAM/BAM/CRAM API test cases.
 
-    Copyright (C) 2014-2020 Genome Research Ltd.
+    Copyright (C) 2014-2020, 2022 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
diff --git a/test/test.pl b/test/test.pl
index 455aef18e..d6c01786a 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/env perl
 #
-#    Copyright (C) 2012-2021 Genome Research Ltd.
+#    Copyright (C) 2012-2022 Genome Research Ltd.
 #
 #    Author: Petr Danecek <pd3@sanger.ac.uk>
 #
diff --git a/test/test_expr.c b/test/test_expr.c
index 15e25cf4b..641e89041 100644
--- a/test/test_expr.c
+++ b/test/test_expr.c
@@ -1,6 +1,6 @@
 /*  test-expr.c -- Testing: filter expression parsing and processing.
 
-    Copyright (C) 2020 Genome Research Ltd.
+    Copyright (C) 2020, 2022 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
diff --git a/test/test_mod.c b/test/test_mod.c
index f6f5b0718..3facf5dba 100644
--- a/test/test_mod.c
+++ b/test/test_mod.c
@@ -1,6 +1,6 @@
 /*  test/test_mod.c -- testing of base modification functions
 
-    Copyright (C) 2020 Genome Research Ltd.
+    Copyright (C) 2020-2021 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
diff --git a/version.sh b/version.sh
index 6e6eff016..a13c6df88 100755
--- a/version.sh
+++ b/version.sh
@@ -3,7 +3,7 @@
 #
 #     Author : James Bonfield <jkb@sanger.ac.uk>
 #
-#     Copyright (C) 2017-2018 Genome Research Ltd.
+#     Copyright (C) 2017-2018, 2021 Genome Research Ltd.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

From 3e289217be0e26a9722c51fe129dc98281214659 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 1 Aug 2022 17:02:02 +0100
Subject: [PATCH 60/79] Add a mapping of Zlib to Libdeflate compression levels
 for BGZF.

Libdeflate goes up to compression level 12, with the last 3 levels
using a much slower optimal parsing technique.  We reserve bgzip
levels 8 and 9 for two of these slow modes, and spread the remainder
out across levels libdeflate 1-9.

We map 1-9 to 1,2,3, 5,6,7,8, 10,12.
This was designed so that the files are generally smaller than their
zlib counterparts while still being faster (except for zlib levels 8
and 9 as noted above).

This is based on benchmarks (see below) for various data sets.

Hence users will find bgzf -l8 and -l9 considerably slower than
before.  Ideally we'd support bgzip -l10 to -l12, but this complicates
several tools and the htslib format string which assumes it's
level+'0' in various places (not just the library, but also the
command line tools).  This was the simpler and safer option.
Realistically no one uses level 9 unless they want maximum
compression, and now they're getting it once again.

Fixes #1477

CPU time (threaded, but total user CPU via time -f "%U") and file size
in bytes.

1. 1GB of Illumina NovaSeq BAM (NovaSeq.10m.bam)

           Libdeflate              Zlib
    0      0.99    1000474917      0.68    1000474917
    1      9.80    183521324       20.29   213827245       >1
    2      14.17   179046201       21.87   205485380       >1
    3      15.20   175877610       26.67   195469541       >1
    4      16.28   172991407       29.80   176019215       ~3
    5      19.36   169087724       38.52   169202888       ~4
    6      23.27   165900144       56.30   164719424       >7
    7      32.50   163766923       72.45   163327258       ~7
    8      57.16   161643808       148.61  160866537       ~9
    9      74.91   160953697       295.37  159689582       >10
    10     303.28  157126803
    11     477.66  155323612
    12     659.36  153756096

As an experiment I added zstd here too, with various block sizes or
none at all, at level 9, 12 and 19:
    -9 (unblocked)  146938203
    -B1048576 -b9   149801386
    -B65536 -b9     160527533 (in ~29s)
    -B5536 -b12     157251800 (best 64k blocked zstd, in ~95s)
    -B5536 -b19     144923698 (best 64k blocked zstd, in ~617s)

-----------------------------------------------------------------------------
1GB of ~4000 sample VCF
ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz

           libdeflate               zlib
    0      0.66     1000474917      0.70    1000474917
    1      4.29     23702390        6.50    31624075        >1
    2      6.15     22689795        6.96    30023229        >1
    3      6.60     22086968        8.37    28216644        >1
    4      6.89     21741126        13.88   21861705        ~4
    5      7.26     21312709        15.91   20967100        >6
    6      8.29     20575904        21.01   19966632        ~7
    7      11.67    19817402        27.06   19529414        >8
    8      20.15    19082333        59.78   18349376        >10
    9      26.82    18536813        104.85  17957219        >10
    10     104.32   17748987
    11     168.52   17297625
    12     273.45   16916786

-----------------------------------------------------------------------------
1GB of 1 sample VCF; many verbose INFO fields
HG002_GRCh38_1_22_v4.2_benchmark.vcf.gz

           libdeflate               zlib
    0      0.66     1000474917      0.73    1000474917
    1      5.17     89779592        9.36    88208451        >2
    2      7.87     78071190        9.09    76125060        >3
    3      9.16     70155649        8.91    71063897        >3
    4      8.00     61555631        17.29   67974286        >4
    5      9.26     59090268        18.28   59819372        >5
    6      11.24    56259793        20.25   55524441        >7
    7      13.33    53421543        21.65   54944394        >7
    8      19.12    51953892        26.07   53725388        >7
    9      21.17    51870998        28.17   53714621        >7
    10     140.56   52822252
    11     240.26   50724685
    12     452.39   50135214

There is some oddity here with libdeflate level 10 being poorer than
level 8 while still being much slower!  This is probably some quirk of
excessive data redundancy.

This data really shows the benefit of zstd instead.  Such highly
redundant data hugely speeds up with zstd -9 taking approx 8s to
encode (when using 64KB blocks) at a size of 49792035, so libdeflate
lvl 5 speeds at better than libdeflate size.  With block sizes of 1MB
that drops from ~50MB to ~39MB too.  (Zstd doesn't help nearly as much
on the other data sets, so this is likely an excessive redundancy
thing.)

-----------------------------------------------------------------------------
Single sample GIAB chr1 bcftools output (85MB worth); more succinct

           libdeflate               zlib
    0      0.04     85086248        0.08    85086248
    1      0.84     18635065        1.61    21071717        >1
    2      1.38     17914684        1.84    20227900        >1
    3      1.44     17581996        2.05    19477263        >1
    4      1.57     17231445        2.58    18139642        >2
    5      1.87     16065221        3.06    16855325        >5
    6      2.24     15550661        4.05    16433775        >5
    7      2.90     15265882        4.63    16084926        >5
    8      4.99     14650624        7.45    15784360        >6
    9      5.86     14615736        7.49    15778425        >6
    10     21.15    14238504
    11     29.47    14188808
    12     35.88    14180049

-----------------------------------------------------------------------------
1GB of R10 ONT fastq

           libdeflate               zlib
    0      0.60     1000474917      0.68    1000474917
    1      17.48    507151512       39.93   522788722       >1
    2      27.29    490892251       42.81   513743184       >1
    3      32.00    486330031       52.95   506652197       ~1
    4      38.66    483198216       54.37   501296019       >2
    5      43.60    479970547       78.50   498174825       >2
    6      60.20    478146566       137.41  494118811       >2
    7      87.72    476998683       196.25  493161028       >2
    8      109.78   476648496       249.70  493162311       >2
    9      110.21   476658031       249.02  493162210       >2
    10     213.13   459363967
    11     250.79   457698243
    12     287.86   457132946
---
 bgzf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bgzf.c b/bgzf.c
index e72ed566d..a969b1567 100644
--- a/bgzf.c
+++ b/bgzf.c
@@ -574,6 +574,8 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le
     } else {
         level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
         // NB levels go up to 12 here.
+        int lvl_map[] = {0,1,2,3,5,6,7,8,10,12};
+        level = lvl_map[level>9 ?9 :level];
         struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
         if (!z) return -1;
 

From ad80f8e8ee582377d998d5aea9f65975f9bb3322 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 29 Jul 2022 11:53:01 +0100
Subject: [PATCH 61/79] Catch arrays of BCF_BT_NULL in bcf_record_check()

Adds a check for INFO/FORMAT values with type 0 ("A MISSING value
without an explicit type provided") and number of elements > 0.

Normally type = 0 and number = 0 is used for a completely
missing INFO/FORMAT value.  In theory you could use number > 0
to make an array of missing values (at least the specification
doesn't explicitly disallow it), however trying to do so results
in bcf_fmt_array() reporting "Unexpected type 0" and calling
exit(1).  HTSJDK also appears to reject this encoding, and
nothing appears to write it, so it seems reasonable to say that
it's not valid.

The check for this encoding is added to bcf_record_check()
so it's caught well before the data gets near bcf_fmt_array().
It also avoids problems with bcf_type_shift[], which cannot
report the correct size of type = 0 encodings (it would need
to multiply by 0, which isn't strictly possible with a shift).

Credit to OSS-Fuzz
Fixes oss-fuzz 49091
---
 vcf.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vcf.c b/vcf.c
index 012db0934..aa7c558be 100644
--- a/vcf.c
+++ b/vcf.c
@@ -1599,7 +1599,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
             err |= BCF_ERR_TAG_UNDEF;
         }
         if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
-        if (((1 << type) & is_valid_type) == 0) {
+        if (((1 << type) & is_valid_type) == 0
+            || (type == BCF_BT_NULL && num > 0)) {
             if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
             err |= BCF_ERR_TAG_INVALID;
@@ -1623,7 +1624,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
             err |= BCF_ERR_TAG_UNDEF;
         }
         if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
-        if (((1 << type) & is_valid_type) == 0) {
+        if (((1 << type) & is_valid_type) == 0
+            || (type == BCF_BT_NULL && num > 0)) {
             bcf_record_check_err(hdr, rec, "type", &reports, type);
             err |= BCF_ERR_TAG_INVALID;
         }

From c27092616fd0b11ea6955dbc337d0f581ace8d72 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 14 Jul 2022 10:33:38 +0100
Subject: [PATCH 62/79] Sanitize the 3-state logic for hts_expr_t, plus added
 some functions.

1. We have strings, numerics, and a null/unknown state.  The latter
was previously only used as a mechanism for reporting the presence or
absence of a tag, eg "[X1]" being the value of X1 if found, or null if
not.  We also had an override of is_true=1 to permit "![X1]", although
sadly this wasn't robust (and still isn't) as it has a dual meaning of
X1 not existing or X1 existing but being zero.

Our unknown state now uses NaN semantics as defined by IEEE 754 for
comparisons and mathematics, but not for conditionals (see
below). This means unknown+2 is still unknown (and false).  Previously
"0/0+2" was true/-nan, it's now false/nan.

2. NaN semantics means <, >, == and != all return false with null/nan
values, even when comparing null to null.  Note this changes the
language results slightly.

INCOMPATIBILITY: Previously "[X1] != 0" meant X1 tag exists and is
non-zero, or X1 tag does not exist.  To avoid that second clause the man
page recommends "[X1] && [X1] != 0" to add a clause of checking the
tag for existence first.

This was illogical and almost certainly not the intended outcome.  Now
!= will be false whenever tag [X1] does not exist so the expression is
only true when the value is defined.  (The man page expression still
works, but has a redundant component.)

Similarly "![X1]" now means X1 doesn't exist, rather than the previous
interpretation of doesn't exist or is zero.

3. Fix arithmetic on non-existent aux tags.

Undefined values are now considered to be false.  They are defined as
either null strings or NaN doubles, although we use the latter
ourselves.  (The former are considered the same as the previous code
used this and possibly external methods copying our old style).  Note
for compatibility with before the empty string is not false.

Previously attempting to use an undefined value gave a warning
message, so expressions like "[X0] + [X1] > 10" would spam when X0 or
X1 were absent.

Added a hts_expr_val_exists() function to simplify testing for defined
values, hts_expr_val_existsT() for defined or undef-but-true (useful
in conditionals), and hts_expr_val_undef() to set a variable to be
undefined (used when invalidating things).

4. Added an exists() function.  "[X0]" is a synonym for
X0-exists, and "![X0]" as doesn't exist, but sadly previously "![X0]"
was interpreted as "X0-doesn't-exist or X0==0".  Given this change and
for general clarity, a less ambiguous explicit exists function has
been added.

Note "exists" means has a known value or has an explicit is_true==1.
So null-but-true is still "exists".  Hence tags XX:f:nan are
considered to exist.

5. Added a default(a,b) function where "a" is returned if defined,
otherwise "b".  As previously explained the expression "[X0] + [X1] > 10"
is false whenever X0 or X1 don't exist, but using
"default([X0],0) + default([X1],0) > 10" we can use the sum of the
values present, or a single value if one is absent.

6. Added mathematical functions of sqrt, log, exp, and pow.

7. Null and boolean operations are largely unchanged, but for
clarification they work as follow, with 0/1 also being false/true and
symmetric operations.

null           == NaN (false)
null-but-true  == NaN (true)
null && x      == 0
null || 0      == 0
null || 1      == 1
!null          == 1
!!null         == 0
!null-but-true == 0

Although we're using NaN internally in order to get the arithmetic
consistent, it's not good to assume we rigidly follow all NaN
semantics.  Specifically in C NaN is considered to be true (so "NaN &&
1" is true), but for us it is false.
---
 hts_expr.c        | 296 +++++++++++++++++++++++++++++++--------
 htslib/hts_expr.h |  47 ++++++-
 test/test_expr.c  | 344 +++++++++++++++++++++++++++++-----------------
 3 files changed, 496 insertions(+), 191 deletions(-)

diff --git a/hts_expr.c b/hts_expr.c
index 21d768d67..5e5a132ea 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -23,7 +23,6 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.  */
 
 // TODO:
-// - add maths functions.  pow, sqrt, log, ?
 // - ?: operator for conditionals?
 
 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
@@ -163,10 +162,52 @@ static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         }
         break;
 
+    case 'd':
+        if (strncmp(str, "default(", 8) == 0) {
+            if (expression(filt, data, fn, str+8, end, res)) return -1;
+            if (**end != ',')
+                return -1;
+            (*end)++;
+            hts_expr_val_t val = HTS_EXPR_VAL_INIT;
+            if (expression(filt, data, fn, ws(*end), end, &val)) return -1;
+            func_ok = 1;
+            if (!hts_expr_val_existsT(res)) {
+                kstring_t swap = res->s;
+                *res = val;
+                val.s = swap;
+                hts_expr_val_free(&val);
+            }
+        }
+        break;
+
+    case 'e':
+        if (strncmp(str, "exists(", 7) == 0) {
+            if (expression(filt, data, fn, str+7, end, res)) return -1;
+            func_ok = 1;
+            res->is_true = res->d = hts_expr_val_existsT(res);
+            res->is_str = 0;
+        } else if (strncmp(str, "exp(", 4) == 0) {
+            if (expression(filt, data, fn, str+4, end, res)) return -1;
+            func_ok = 1;
+            res->d = exp(res->d);
+            res->is_str = 0;
+            if (isnan(res->d))
+                hts_expr_val_undef(res);
+        }
+
+        break;
+
     case 'l':
         if (strncmp(str, "length(", 7) == 0) {
             if (expression(filt, data, fn, str+7, end, res)) return -1;
             func_ok = expr_func_length(res);
+        } else if (strncmp(str, "log(", 4) == 0) {
+            if (expression(filt, data, fn, str+4, end, res)) return -1;
+            func_ok = 1;
+            res->d = log(res->d);
+            res->is_str = 0;
+            if (isnan(res->d))
+                hts_expr_val_undef(res);
         }
         break;
 
@@ -179,6 +220,44 @@ static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             func_ok = expr_func_max(res);
         }
         break;
+
+    case 'p':
+        if (strncmp(str, "pow(", 4) == 0) {
+            if (expression(filt, data, fn, str+4, end, res)) return -1;
+            func_ok = 1;
+
+            if (**end != ',')
+                return -1;
+            (*end)++;
+            hts_expr_val_t val = HTS_EXPR_VAL_INIT;
+            if (expression(filt, data, fn, ws(*end), end, &val)) return -1;
+            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
+                hts_expr_val_undef(res);
+            } else if (res->is_str || val.is_str) {
+                hts_expr_val_free(&val); // arith on strings
+                return -1;
+            } else {
+                func_ok = 1;
+                res->d = pow(res->d, val.d);
+                hts_expr_val_free(&val);
+                res->is_str = 0;
+            }
+
+            if (isnan(res->d))
+                hts_expr_val_undef(res);
+        }
+        break;
+
+    case 's':
+        if (strncmp(str, "sqrt(", 5) == 0) {
+            if (expression(filt, data, fn, str+5, end, res)) return -1;
+            func_ok = 1;
+            res->d = sqrt(res->d);
+            res->is_str = 0;
+            if (isnan(res->d))
+                hts_expr_val_undef(res);
+        }
+        break;
     }
 
     if (func_ok < 0)
@@ -286,32 +365,46 @@ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                       char *str, char **end, hts_expr_val_t *res) {
     int err;
     str = ws(str);
-    if (*str == '+') {
-        err = simple_expr(filt, data, fn, str+1, end, res);
-        err |= res->is_str;
-        res->is_true = res->d != 0;
-    } else if (*str == '-') {
+    if (*str == '+' || *str == '-') {
         err = simple_expr(filt, data, fn, str+1, end, res);
-        err |= res->is_str;
-        res->d = -res->d;
-        res->is_true = res->d != 0;
+        if (!hts_expr_val_exists(res)) {
+            hts_expr_val_undef(res);
+        } else {
+            err |= res->is_str;
+            if (*str == '-')
+                res->d = -res->d;
+            res->is_true = res->d != 0;
+        }
     } else if (*str == '!') {
         err = unary_expr(filt, data, fn, str+1, end, res);
         if (res->is_true) {
+            // Any explicitly true value becomes false
             res->d = res->is_true = 0;
-            res->is_str = 0;
+        } else if (!hts_expr_val_exists(res)) {
+            // We can also still negate undef values by toggling the
+            // is_true override value.
+            res->d = res->is_true = !res->is_true;
         } else if (res->is_str) {
-            res->is_str = 0;
+            // !null = true, !"foo" = false, NOTE: !"" = false also
             res->d = res->is_true = (res->s.s == NULL);
         } else {
             res->d = !(int64_t)res->d;
             res->is_true = res->d != 0;
         }
+        res->is_str = 0;
     } else if (*str == '~') {
         err = unary_expr(filt, data, fn, str+1, end, res);
-        err |= res->is_str;
-        res->d = ~(int64_t)res->d;
-        res->is_true = res->d != 0;
+        if (!hts_expr_val_exists(res)) {
+            hts_expr_val_undef(res);
+        } else {
+            err |= res->is_str;
+            if (!hts_expr_val_exists(res)) {
+                hts_expr_val_undef(res);
+            } else {
+                res->d = ~(int64_t)res->d;
+                res->is_true = res->d != 0;
+            }
+        }
     } else {
         err = simple_expr(filt, data, fn, str, end, res);
     }
@@ -338,7 +431,9 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         str = ws(str);
         if (*str == '*' || *str == '/' || *str == '%') {
             if (unary_expr(filt, data, fn, str+1, end, &val)) return -1;
-            if (val.is_str || res->is_str) {
+            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
+                hts_expr_val_undef(res);
+            } else if (val.is_str || res->is_str) {
                 hts_expr_val_free(&val);
                 return -1; // arith on strings
             }
@@ -348,12 +443,15 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             res->d *= val.d;
         else if (*str == '/')
             res->d /= val.d;
-        else if (*str == '%')
-            res->d = (int64_t)res->d % (int64_t)val.d;
-        else
+        else if (*str == '%') {
+            if (val.d)
+                res->d = (int64_t)res->d % (int64_t)val.d;
+            else
+                hts_expr_val_undef(res);
+        } else
             break;
 
-        res->is_true = res->d != 0;
+        res->is_true = hts_expr_val_exists(res) && (res->d != 0);
         str = *end;
     }
 
@@ -378,9 +476,12 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
     hts_expr_val_t val = HTS_EXPR_VAL_INIT;
     while (*str) {
         str = ws(str);
+        int undef = 0;
         if (*str == '+' || *str == '-') {
             if (mul_expr(filt, data, fn, str+1, end, &val)) return -1;
-            if (val.is_str || res->is_str) {
+            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
+                undef = 1;
+            } else if (val.is_str || res->is_str) {
                 hts_expr_val_free(&val);
                 return -1; // arith on strings
             }
@@ -393,7 +494,11 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         else
             break;
 
-        res->is_true = res->d != 0;
+        if (undef)
+            hts_expr_val_undef(res);
+        else
+            res->is_true = res->d != 0;
+
         str = *end;
     }
 
@@ -412,11 +517,14 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
     if (add_expr(filt, data, fn, str, end, res)) return -1;
 
     hts_expr_val_t val = HTS_EXPR_VAL_INIT;
+    int undef = 0;
     for (;;) {
         str = ws(*end);
         if (*str == '&' && str[1] != '&') {
             if (add_expr(filt, data, fn, str+1, end, &val)) return -1;
-            if (res->is_str || val.is_str) {
+            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
+                undef = 1;
+            } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
             }
@@ -426,6 +534,8 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         }
     }
     hts_expr_val_free(&val);
+    if (undef)
+        hts_expr_val_undef(res);
 
     return 0;
 }
@@ -440,11 +550,14 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
     if (bitand_expr(filt, data, fn, str, end, res)) return -1;
 
     hts_expr_val_t val = HTS_EXPR_VAL_INIT;
+    int undef = 0;
     for (;;) {
         str = ws(*end);
         if (*str == '^') {
             if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1;
-            if (res->is_str || val.is_str) {
+            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
+                undef = 1;
+            } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
             }
@@ -454,6 +567,8 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         }
     }
     hts_expr_val_free(&val);
+    if (undef)
+        hts_expr_val_undef(res);
 
     return 0;
 }
@@ -468,11 +583,14 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
     if (bitxor_expr(filt, data, fn, str, end, res)) return -1;
 
     hts_expr_val_t val = HTS_EXPR_VAL_INIT;
+    int undef = 0;
     for (;;) {
         str = ws(*end);
         if (*str == '|' && str[1] != '|') {
             if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1;
-            if (res->is_str || val.is_str) {
+            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
+                undef = 1;
+            } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
             }
@@ -482,6 +600,8 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         }
     }
     hts_expr_val_free(&val);
+    if (undef)
+        hts_expr_val_undef(res);
 
     return 0;
 }
@@ -500,33 +620,60 @@ static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
 
     str = ws(*end);
     hts_expr_val_t val = HTS_EXPR_VAL_INIT;
-    int err = 0;
+    int err = 0, cmp_done = 0;
 
     if (*str == '>' && str[1] == '=') {
+        cmp_done = 1;
         err = cmp_expr(filt, data, fn, str+2, end, &val);
-        res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s
-            ? strcmp(res->s.s, val.s.s) >= 0
-            : !res->is_str && !val.is_str && res->d >= val.d;
-        res->is_str = 0;
+        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
+            hts_expr_val_undef(res);
+        } else {
+            res->is_true=res->d
+                = res->is_str && res->s.s && val.is_str && val.s.s
+                ? strcmp(res->s.s, val.s.s) >= 0
+                : !res->is_str && !val.is_str && res->d >= val.d;
+            res->is_str = 0;
+        }
     } else if (*str == '>') {
+        cmp_done = 1;
         err = cmp_expr(filt, data, fn, str+1, end, &val);
-        res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s
-            ? strcmp(res->s.s, val.s.s) > 0
-            : !res->is_str && !val.is_str && res->d > val.d;
-        res->is_str = 0;
+        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
+            hts_expr_val_undef(res);
+        } else {
+            res->is_true=res->d
+                = res->is_str && res->s.s && val.is_str && val.s.s
+                ? strcmp(res->s.s, val.s.s) > 0
+                : !res->is_str && !val.is_str && res->d > val.d;
+            res->is_str = 0;
+        }
     } else if (*str == '<' && str[1] == '=') {
+        cmp_done = 1;
         err = cmp_expr(filt, data, fn, str+2, end, &val);
-        res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s
-            ? strcmp(res->s.s, val.s.s) <= 0
-            : !res->is_str && !val.is_str && res->d <= val.d;
-        res->is_str = 0;
+        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
+            hts_expr_val_undef(res);
+        } else {
+            res->is_true=res->d
+                = res->is_str && res->s.s && val.is_str && val.s.s
+                ? strcmp(res->s.s, val.s.s) <= 0
+                : !res->is_str && !val.is_str && res->d <= val.d;
+            res->is_str = 0;
+        }
     } else if (*str == '<') {
+        cmp_done = 1;
         err = cmp_expr(filt, data, fn, str+1, end, &val);
-        res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s
-            ? strcmp(res->s.s, val.s.s) < 0
-            : !res->is_str && !val.is_str && res->d < val.d;
-        res->is_str = 0;
+        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
+            hts_expr_val_undef(res);
+        } else {
+            res->is_true=res->d
+                = res->is_str && res->s.s && val.is_str && val.s.s
+                ? strcmp(res->s.s, val.s.s) < 0
+                : !res->is_str && !val.is_str && res->d < val.d;
+            res->is_str = 0;
+        }
     }
+
+    if (cmp_done && (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)))
+        hts_expr_val_undef(res);
     hts_expr_val_free(&val);
 
     return err ? -1 : 0;
@@ -546,34 +693,45 @@ static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
 
     str = ws(*end);
 
-    int err = 0;
+    int err = 0, eq_done = 0;
     hts_expr_val_t val = HTS_EXPR_VAL_INIT;
 
     // numeric vs numeric comparison is as expected
     // string vs string comparison is as expected
     // numeric vs string is false
     if (str[0] == '=' && str[1] == '=') {
+        eq_done = 1;
         if ((err = eq_expr(filt, data, fn, str+2, end, &val))) {
             res->is_true = res->d = 0;
         } else {
-            res->is_true = res->d = res->is_str
-                ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s)==0 : 0)
-                : !res->is_str && !val.is_str && res->d == val.d;
+            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
+                hts_expr_val_undef(res);
+            } else {
+                res->is_true = res->d = res->is_str
+                    ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s)==0 :0)
+                    : !res->is_str && !val.is_str && res->d == val.d;
+            }
         }
         res->is_str = 0;
 
     } else if (str[0] == '!' && str[1] == '=') {
+        eq_done = 1;
         if ((err = eq_expr(filt, data, fn, str+2, end, &val))) {
             res->is_true = res->d = 0;
         } else {
-            res->is_true = res->d = res->is_str
-                ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s) != 0 : 1)
-                : res->is_str != val.is_str || res->d != val.d;
+            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
+                hts_expr_val_undef(res);
+            } else {
+                res->is_true = res->d = res->is_str
+                    ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s) != 0 :1)
+                    : res->is_str != val.is_str || res->d != val.d;
+            }
         }
         res->is_str = 0;
 
     } else if ((str[0] == '=' && str[1] == '~') ||
                (str[0] == '!' && str[1] == '~')) {
+        eq_done = 1;
         err = eq_expr(filt, data, fn, str+2, end, &val);
         if (!val.is_str || !res->is_str) {
             hts_expr_val_free(&val);
@@ -614,6 +772,9 @@ static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         }
         res->is_str = 0;
     }
+
+    if (eq_done && ((!hts_expr_val_exists(&val)) || !hts_expr_val_exists(res)))
+        hts_expr_val_undef(res);
     hts_expr_val_free(&val);
 
     return err ? -1 : 0;
@@ -634,16 +795,37 @@ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
         str = ws(*end);
         if (str[0] == '&' && str[1] == '&') {
             if (eq_expr(filt, data, fn, str+2, end, &val)) return -1;
-            res->is_true = res->d =
-                (res->is_true || (res->is_str && res->s.s) || res->d) &&
-                (val.is_true  || (val.is_str && val.s.s) || val.d);
-            res->is_str = 0;
+            if (!hts_expr_val_existsT(res) || !hts_expr_val_existsT(&val)) {
+                hts_expr_val_undef(res);
+                res->d = 0;
+            } else {
+                res->is_true = res->d =
+                    (res->is_true || (res->is_str && res->s.s) || res->d) &&
+                    (val.is_true  || (val.is_str && val.s.s) || val.d);
+                res->is_str = 0;
+            }
         } else if (str[0] == '|' && str[1] == '|') {
             if (eq_expr(filt, data, fn, str+2, end, &val)) return -1;
-            res->is_true = res->d =
-                res->is_true || (res->is_str && res->s.s) || res->d ||
-                val.is_true  || (val.is_str  && val.s.s ) || val.d;
-            res->is_str = 0;
+            if (!hts_expr_val_existsT(res) && !hts_expr_val_existsT(&val)) {
+                // neither defined
+                hts_expr_val_undef(res);
+                res->d = 0;
+            } else if (!hts_expr_val_existsT(res) &&
+                       !(val.is_true  || (val.is_str  && val.s.s ) || val.d)) {
+                // LHS undef and RHS false
+                hts_expr_val_undef(res);
+                res->d = 0;
+            } else if (!hts_expr_val_existsT(&val) &&
+                       !(res->is_true || (res->is_str && res->s.s) || res->d)){
+                // RHS undef and LHS false
+                hts_expr_val_undef(res);
+                res->d = 0;
+            } else {
+                res->is_true = res->d =
+                    res->is_true || (res->is_str && res->s.s) || res->d ||
+                    val.is_true  || (val.is_str  && val.s.s ) || val.d;
+                res->is_str = 0;
+            }
         } else {
             break;
         }
@@ -705,7 +887,7 @@ static int hts_filter_eval_(hts_filter_t *filt,
     if (res->is_str) {
         res->is_true |= res->s.s != NULL;
         res->d = res->is_true;
-    } else {
+    } else if (hts_expr_val_exists(res)) {
         res->is_true |= res->d != 0;
     }
 
diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h
index 2abdddc24..43da89d6a 100644
--- a/htslib/hts_expr.h
+++ b/htslib/hts_expr.h
@@ -25,18 +25,30 @@ DEALINGS IN THE SOFTWARE.  */
 #ifndef HTS_EXPR_H
 #define HTS_EXPR_H
 
+#include <math.h>
 #include "kstring.h"
 #include "hts_defs.h"
 
 /// Holds a filter variable.  This is also used to return the results.
 /**
- * Note we cope with zero-but-true in order to implement a basic
- * "exists(something)" check where "something" may even be zero.
+ * The expression language has 3-states of string, numeric, and unknown.
+ * The unknown state is either a NaN numeric or a null string, with both
+ * internally considered to have the same "unknown" meaning.
  *
- * Eg in the aux tag searching syntax, "[NM]" should return true if
- * NM tag exists even if zero.
- * Take care when negating this. "[NM] != 0" will be true when
- * [NM] is absent, thus consider "[NM] && [NM] != 0".
+ * These largely match the IEEE 754 semantics for NaN comparisons: <, >, ==,
+ * != all fail, (even NaN == NaN).  Similarly arithmetic (+,-,/,*,%) with
+ * unknown values is still unknown (and false).
+ *
+ * The departure from NaN semantics though is that our unknown/null state is
+ * considered to be false while NaN in C is true.  Similarly the false nature
+ * of our unknown state means !val becomes true, !!val is once again false,
+ * val && 1 is false, val || 0 is false, and val || 1 is true along with
+ * !val || 0 and !val && 1.
+ *
+ * Note it is possible for empty strings and zero numbers to also be true.
+ * An example of this is the aux string '[NM]' which returns true if the
+ * NM tag is found, regardless of whether it is also zero.  However the
+ * better approach added in 1.16 is 'exists([NM])'.
  */
 typedef struct hts_expr_val_t {
     char is_str;  // Use .s vs .d
@@ -45,6 +57,29 @@ typedef struct hts_expr_val_t {
     double d;     // otherwise this
 } hts_expr_val_t;
 
+/// Returns true if an hts_expr_val_t is defined.
+/* An example usage of this is in the SAM expression filter where an
+ * [X0] aux tag will be the value of X0 (string or numeric) if set, or
+ * a false null string (not the same as an empty one) when not set.
+ */
+static inline int hts_expr_val_exists(hts_expr_val_t *v) {
+    return v && !(v->is_str == 1 && v->s.s == NULL)
+             && !(v->is_str == 0 && isnan(v->d));
+}
+
+/// Returns true if an hts_expr_val_t is defined or is undef-but-true
+static inline int hts_expr_val_existsT(hts_expr_val_t *v) {
+    return (v && v->is_true) || hts_expr_val_exists(v);
+}
+
+/// Set a value to be undefined (nan).
+static inline void hts_expr_val_undef(hts_expr_val_t *v) {
+    ks_clear(&v->s);
+    v->is_true = 0;
+    v->is_str = 0;
+    v->d = NAN;
+}
+
 /// Frees a hts_expr_val_t type.
 static inline void hts_expr_val_free(hts_expr_val_t *f) {
     ks_free(&f->s);
diff --git a/test/test_expr.c b/test/test_expr.c
index 641e89041..ecd1232e4 100644
--- a/test/test_expr.c
+++ b/test/test_expr.c
@@ -62,16 +62,23 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) {
         *end = str+5;
         res->is_str = 1;
         kputs("", ks_clear(&res->s));
+    } else if (strncmp(str, "zero-but-true", 13) == 0) {
+        *end = str+13;
+        res->d = 0;
+        res->is_true = 1;
     } else if (strncmp(str, "null-but-true", 13) == 0) {
         *end = str+13;
+        hts_expr_val_undef(res);
         res->is_true = 1;
-        res->is_str = 1;
-        ks_clear(&res->s);
     } else if (strncmp(str, "null", 4) == 0) {
         // null string (eg aux:Z tag is absent)
         *end = str+4;
-        res->is_str = 1;
-        ks_clear(&res->s);
+        hts_expr_val_undef(res);
+    } else if (strncmp(str, "nan", 3) == 0) {
+        // sqrt(-1), 0/0 and similar
+        // Semantically the same operations as null.
+        *end = str+3;
+        hts_expr_val_undef(res);
 
     } else {
         return -1;
@@ -94,130 +101,200 @@ static inline int strcmpnull(const char *a, const char *b) {
     return strcmp(a, b);
 }
 
+// Compare NAN as equal, for testing we returned the correct values
+static inline int cmpfloat(double d1, double d2) {
+    // If needs be, can use DBL_EPSILON in comparisons here.
+    return d1 == d2 || (isnan(d1) && isnan(d2));
+}
+
 int test(void) {
     // These are all valid expressions that should work
     test_ev tests[] = {
-        { 1,  1, NULL, "1"},
-        { 1,  1, NULL, "+1"},
-        { 1, -1, NULL, "-1"},
-        { 0,  0, NULL, "!7"},
-        { 1,  1, NULL, "!0"},
-        { 1,  1, NULL, "!(!7)"},
-        { 1,  1, NULL, "!!7"},
-
-        { 1,  5, NULL, "2+3"},
-        { 1, -1, NULL, "2+-3"},
-        { 1,  6, NULL, "1+2+3"},
-        { 1,  1, NULL, "-2+3"},
-
-        { 1,  6, NULL, "2*3"},
-        { 1,  6, NULL, "1*2*3"},
-        { 0,  0, NULL, "2*0"},
-
-        { 1,  7, NULL, "(7)"},
-        { 1,  7, NULL, "((7))"},
-        { 1, 21, NULL, "(1+2)*(3+4)"},
-        { 1, 14, NULL, "(4*5)-(-2*-3)"},
-
-        { 1,  1, NULL, "(1+2)*3==9"},
-        { 1,  1, NULL, "(1+2)*3!=8"},
-        { 0,  0, NULL, "(1+2)*3!=9"},
-        { 0,  0, NULL, "(1+2)*3==8"},
-
-        { 0,  0, NULL, "1>2"},
-        { 1,  1, NULL, "1<2"},
-        { 0,  0, NULL, "3<3"},
-        { 0,  0, NULL, "3>3"},
-        { 1,  1, NULL, "9<=9"},
-        { 1,  1, NULL, "9>=9"},
-        { 1,  1, NULL, "2*4==8"},
-        { 1,  1, NULL, "16==0x10"},
-        { 1,  1, NULL, "15<0x10"},
-        { 1,  1, NULL, "17>0x10"},
-        { 0,  0, NULL, "2*4!=8"},
-        { 1,  1, NULL, "4+2<3+4"},
-        { 0,  0, NULL, "4*2<3+4"},
-        { 1,  8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4
-
-        { 1,  1, NULL, "(1<2) == (3>2)"},
-        { 1,  1, NULL, "1<2 == 3>2"},
-
-        { 1,  1, NULL, "2 && 1"},
-        { 0,  0, NULL, "2 && 0"},
-        { 0,  0, NULL, "0 && 2"},
-        { 1,  1, NULL, "2 || 1"},
-        { 1,  1, NULL, "2 || 0"},
-        { 1,  1, NULL, "0 || 2"},
-        { 1,  1, NULL, "1 || 2 && 3"},
-        { 1,  1, NULL, "2 && 3 || 1"},
-        { 1,  1, NULL, "0 && 3 || 2"},
-        { 0,  0, NULL, "0 && 3 || 0"},
-        { 0,  0, NULL, " 5 - 5 && 1"},
-        { 0,  0, NULL, "+5 - 5 && 1"},
-
-        { 1,  1, NULL, "3 & 1"},
-        { 1,  2, NULL, "3 & 2"},
-        { 1,  3, NULL, "1 | 2"},
-        { 1,  3, NULL, "1 | 3"},
-        { 1,  7, NULL, "1 | 6"},
-        { 1,  2, NULL, "1 ^ 3"},
-
-        { 1,  1, NULL, "(1^0)&(4^3)"},
-        { 1,  2, NULL, "1 ^(0&4)^ 3"},
-        { 1,  2, NULL, "1 ^ 0&4 ^ 3"},  // precedence, & before ^
-
-        { 1,  6, NULL, "(1|0)^(4|3)"},
-        { 1,  7, NULL, "1 |(0^4)| 3"},
-        { 1,  7, NULL, "1 | 0^4 | 3"},  // precedence, ^ before |
-
-        { 1,  1, NULL, "4 & 2 || 1"},
-        { 1,  1, NULL, "(4 & 2) || 1"},
-        { 0,  0, NULL, "4 & (2 || 1)"},
-        { 1,  1, NULL, "1 || 4 & 2"},
-        { 1,  1, NULL, "1 || (4 & 2)"},
-        { 0,  0, NULL, "(1 || 4) & 2"},
-
-        { 1,  1, NULL, " (2*3)&7  > 4"},
-        { 0,  0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv
-        { 1,  1, NULL, "((2*3)&7) > 4"},  // Python precedence equiv
-        { 1,  1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"},
-
-        { 1,  1, "plugh", "magic"},
-        { 1,  1, "",  "empty"},
-        { 1,  1, NULL, "magic == \"plugh\""},
-        { 1,  1, NULL, "magic != \"xyzzy\""},
-
-        { 1,  1, NULL, "\"abc\" < \"def\""},
-        { 1,  1, NULL, "\"abc\" <= \"abc\""},
-        { 0,  0, NULL, "\"abc\" < \"ab\""},
-        { 0,  0, NULL, "\"abc\" <= \"ab\""},
-
-        { 0,  0, NULL, "\"abc\" > \"def\""},
-        { 1,  1, NULL, "\"abc\" >= \"abc\""},
-        { 1,  1, NULL, "\"abc\" > \"ab\""},
-        { 1,  1, NULL, "\"abc\" >= \"ab\""},
-
-        { 1,  1, NULL, "\"abbc\" =~ \"^a+b+c+$\""},
-        { 0,  0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""},
-        { 1,  1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""},
-        { 1,  1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"},
-
-        { 1,  1, "",   "empty-but-true"   },
-        { 0,  0, NULL, "!empty-but-true"  },
-        { 1,  1, NULL, "!!empty-but-true" },
-        { 1,  1, NULL, "1 && empty-but-true && 1" },
-        { 0,  0, NULL, "1 && empty-but-true && 0" },
-
-        { 0,  0, NULL, "null"    },
-        { 1,  1, NULL, "!null"   },
-        { 0,  0, NULL, "!!null", },
-
-        { 1,  1, NULL, "null-but-true"   },
-        { 0,  0, NULL, "!null-but-true"  },
-        { 1,  1, NULL, "!!null-but-true" },
-
-        { 0,  0, NULL, "null || 0" },
-        { 1,  1, NULL, "null-but-true && 1" },
+        { 1,  1,   NULL, "1"},
+        { 1,  1,   NULL, "+1"},
+        { 1, -1,   NULL, "-1"},
+        { 0,  0,   NULL, "!7"},
+        { 1,  1,   NULL, "!0"},
+        { 1,  1,   NULL, "!(!7)"},
+        { 1,  1,   NULL, "!!7"},
+
+        { 1,  5,   NULL, "2+3"},
+        { 1, -1,   NULL, "2+-3"},
+        { 1,  6,   NULL, "1+2+3"},
+        { 1,  1,   NULL, "-2+3"},
+        { 0,  NAN, NULL, "1+null" },
+        { 0,  NAN, NULL, "null-1" },
+        { 0,  NAN, NULL, "-null" },
+
+        { 1,  6,   NULL, "2*3"},
+        { 1,  6,   NULL, "1*2*3"},
+        { 0,  0,   NULL, "2*0"},
+
+        { 1,  7,   NULL, "(7)"},
+        { 1,  7,   NULL, "((7))"},
+        { 1, 21,   NULL, "(1+2)*(3+4)"},
+        { 1, 14,   NULL, "(4*5)-(-2*-3)"},
+
+        { 0,  NAN, NULL, "2*null"},
+        { 0,  NAN, NULL, "null/2"},
+        { 0,  NAN, NULL, "0/0"},
+
+        { 1,  1,   NULL, "(1+2)*3==9"},
+        { 1,  1,   NULL, "(1+2)*3!=8"},
+        { 0,  0,   NULL, "(1+2)*3!=9"},
+        { 0,  0,   NULL, "(1+2)*3==8"},
+
+        { 0,  0,   NULL, "1>2"},
+        { 1,  1,   NULL, "1<2"},
+        { 0,  0,   NULL, "3<3"},
+        { 0,  0,   NULL, "3>3"},
+        { 1,  1,   NULL, "9<=9"},
+        { 1,  1,   NULL, "9>=9"},
+        { 1,  1,   NULL, "2*4==8"},
+        { 1,  1,   NULL, "16==0x10"},
+        { 1,  1,   NULL, "15<0x10"},
+        { 1,  1,   NULL, "17>0x10"},
+        { 0,  0,   NULL, "2*4!=8"},
+        { 1,  1,   NULL, "4+2<3+4"},
+        { 0,  0,   NULL, "4*2<3+4"},
+        { 1,  8,   NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4
+
+        { 1,  1,   NULL, "(1<2) == (3>2)"},
+        { 1,  1,   NULL, "1<2 == 3>2"},
+
+        { 0,  NAN, NULL, "null <= 0" },
+        { 0,  NAN, NULL, "null >= 0" },
+        { 0,  NAN, NULL, "null < 0" },
+        { 0,  NAN, NULL, "null > 0" },
+        { 0,  NAN, NULL, "null == null" },
+        { 0,  NAN, NULL, "null != null" },
+        { 0,  NAN, NULL, "null < 10" },
+        { 0,  NAN, NULL, "10 > null" },
+
+        { 1,  1,   NULL, "2 && 1"},
+        { 0,  0,   NULL, "2 && 0"},
+        { 0,  0,   NULL, "0 && 2"},
+        { 1,  1,   NULL, "2 || 1"},
+        { 1,  1,   NULL, "2 || 0"},
+        { 1,  1,   NULL, "0 || 2"},
+        { 1,  1,   NULL, "1 || 2 && 3"},
+        { 1,  1,   NULL, "2 && 3 || 1"},
+        { 1,  1,   NULL, "0 && 3 || 2"},
+        { 0,  0,   NULL, "0 && 3 || 0"},
+        { 0,  0,   NULL, " 5 - 5 && 1"},
+        { 0,  0,   NULL, "+5 - 5 && 1"},
+        { 0,  0,   NULL, "null && 1"}, // null && x == null
+        { 0,  0,   NULL, "1 && null"},
+        { 1,  1,   NULL, "!null && 1"},
+        { 1,  1,   NULL, "1 && !null"},
+        { 1,  1,   NULL, "1 && null-but-true"},
+        { 0,  0,   NULL, "null || 0"}, // null || 0 == null
+        { 0,  0,   NULL, "0 || null"},
+        { 1,  1,   NULL, "!null || 0"},
+        { 1,  1,   NULL, "0 || !null"},
+        { 1,  1,   NULL, "0 || null-but-true"},
+        { 1,  1,   NULL, "null || 1"}, // null || 1 == 1
+        { 1,  1,   NULL, "1 || null"},
+
+        { 1,  1,   NULL, "3 & 1"},
+        { 1,  2,   NULL, "3 & 2"},
+        { 1,  3,   NULL, "1 | 2"},
+        { 1,  3,   NULL, "1 | 3"},
+        { 1,  7,   NULL, "1 | 6"},
+        { 1,  2,   NULL, "1 ^ 3"},
+        { 0,  NAN, NULL, "1 | null"},
+        { 0,  NAN, NULL, "null | 1"},
+        { 0,  NAN, NULL, "1 & null"},
+        { 0,  NAN, NULL, "null & 1"},
+        { 0,  NAN, NULL, "0 ^ null"},
+        { 0,  NAN, NULL, "null ^ 0"},
+        { 0,  NAN, NULL, "1 ^ null"},
+        { 0,  NAN, NULL, "null ^ 1"},
+
+        { 1,  1,   NULL, "(1^0)&(4^3)"},
+        { 1,  2,   NULL, "1 ^(0&4)^ 3"},
+        { 1,  2,   NULL, "1 ^ 0&4 ^ 3"},  // precedence, & before ^
+
+        { 1,  6,   NULL, "(1|0)^(4|3)"},
+        { 1,  7,   NULL, "1 |(0^4)| 3"},
+        { 1,  7,   NULL, "1 | 0^4 | 3"},  // precedence, ^ before |
+
+        { 1,  1,   NULL, "4 & 2 || 1"},
+        { 1,  1,   NULL, "(4 & 2) || 1"},
+        { 0,  0,   NULL, "4 & (2 || 1)"},
+        { 1,  1,   NULL, "1 || 4 & 2"},
+        { 1,  1,   NULL, "1 || (4 & 2)"},
+        { 0,  0,   NULL, "(1 || 4) & 2"},
+
+        { 1,  1,   NULL, " (2*3)&7  > 4"},
+        { 0,  0,   NULL, " (2*3)&(7 > 4)"}, // C precedence equiv
+        { 1,  1,   NULL, "((2*3)&7) > 4"},  // Python precedence equiv
+        { 1,  1,   NULL, "((2*3)&7) > 4 && 2*2 <= 4"},
+
+        { 1,  1,   "plugh", "magic"},
+        { 1,  1,   "",   "empty"},
+        { 1,  1,   NULL, "magic == \"plugh\""},
+        { 1,  1,   NULL, "magic != \"xyzzy\""},
+
+        { 1,  1,   NULL, "\"abc\" < \"def\""},
+        { 1,  1,   NULL, "\"abc\" <= \"abc\""},
+        { 0,  0,   NULL, "\"abc\" < \"ab\""},
+        { 0,  0,   NULL, "\"abc\" <= \"ab\""},
+
+        { 0,  0,   NULL, "\"abc\" > \"def\""},
+        { 1,  1,   NULL, "\"abc\" >= \"abc\""},
+        { 1,  1,   NULL, "\"abc\" > \"ab\""},
+        { 1,  1,   NULL, "\"abc\" >= \"ab\""},
+
+        { 0,  NAN, NULL, "null == \"x\"" },
+        { 0,  NAN, NULL, "null != \"x\"" },
+        { 0,  NAN, NULL, "null < \"x\"" },
+        { 0,  NAN, NULL, "null > \"x\"" },
+
+        { 1,  1,   NULL, "\"abbc\" =~ \"^a+b+c+$\""},
+        { 0,  0,   NULL, "\"aBBc\" =~ \"^a+b+c+$\""},
+        { 1,  1,   NULL, "\"aBBc\" !~ \"^a+b+c+$\""},
+        { 1,  1,   NULL, "\"xyzzy plugh abracadabra\" =~ magic"},
+
+        { 1,  1,   "",   "empty-but-true"   },
+        { 0,  0,   NULL, "!empty-but-true"  },
+        { 1,  1,   NULL, "!!empty-but-true" },
+        { 1,  1,   NULL, "1 && empty-but-true && 1" },
+        { 0,  0,   NULL, "1 && empty-but-true && 0" },
+
+        { 0,  NAN, NULL, "null"    },
+        { 1,  1,   NULL, "!null"   },
+        { 0,  0,   NULL, "!!null", },
+        { 0,  0,   NULL, "!\"foo\""   },
+        { 1,  1,   NULL, "!!\"foo\""   },
+
+        { 1,  NAN, NULL, "null-but-true"   },
+        { 0,  0,   NULL, "!null-but-true"  },
+        { 1,  1,   NULL, "!!null-but-true" },
+        { 1,  0,   NULL, "zero-but-true"   },
+        { 0,  0,   NULL, "!zero-but-true"  },
+        { 1,  1,   NULL, "!!zero-but-true" },
+
+        { 1,  log(2), NULL, "log(2)"},
+        { 1,  exp(9), NULL, "exp(9)"},
+        { 1,  9,      NULL, "log(exp(9))"},
+        { 1,  8,      NULL, "pow(2,3)"},
+        { 1,  3,      NULL, "sqrt(9)"},
+        { 0,  NAN,    NULL, "sqrt(-9)"},
+
+        { 1,  2,      NULL, "default(2,3)"},
+        { 1,  3,      NULL, "default(null,3)"},
+        { 0,  0,      NULL, "default(null,0)"},
+        { 1,  NAN,    NULL, "default(null-but-true,0)"},
+        { 1,  NAN,    NULL, "default(null-but-true,null)"},
+        { 1,  NAN,    NULL, "default(null,null-but-true)"},
+
+        { 1,  1,      NULL, "exists(\"foo\")"},
+        { 1,  1,      NULL, "exists(12)"},
+        { 1,  1,      NULL, "exists(\"\")"},
+        { 1,  1,      NULL, "exists(0)"},
+        { 0,  0,      NULL, "exists(null)"},
+        { 1,  1,      NULL, "exists(null-but-true)"},
     };
 
     int i, res = 0;
@@ -234,15 +311,24 @@ int test(void) {
             continue;
         }
 
-        if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0
-                         || r.d != tests[i].dval
+        if (!hts_expr_val_exists(&r)) {
+            if (r.is_true != tests[i].truth_val ||
+                !cmpfloat(r.d, tests[i].dval)) {
+                fprintf(stderr,
+                        "Failed test: \"%s\" == \"%f\", got %s, \"%s\", %f\n",
+                        tests[i].str, tests[i].dval,
+                        r.is_true ? "true" : "false", r.s.s, r.d);
+                res = 1;
+            }
+        } else if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0
+                         || !cmpfloat(r.d, tests[i].dval)
                          || r.is_true != tests[i].truth_val)) {
             fprintf(stderr,
                     "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n",
                     tests[i].str, tests[i].sval,
                     r.is_true ? "true" : "false", r.s.s, r.d);
             res = 1;
-        } else if (!r.is_str && (r.d != tests[i].dval
+        } else if (!r.is_str && (!cmpfloat(r.d, tests[i].dval)
                                  || r.is_true != tests[i].truth_val)) {
             fprintf(stderr, "Failed test: %s == %f, got %s, %f\n",
                     tests[i].str, tests[i].dval,
@@ -264,6 +350,8 @@ int main(int argc, char **argv) {
         if (hts_filter_eval2(filt, NULL, lookup, &v))
             return 1;
 
+        printf("%s\t", v.is_true ? "true":"false");
+
         if (v.is_str)
             puts(v.s.s);
         else

From d77192ae04fe6a57a84b9050d1c0c976f207037d Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 5 Aug 2022 09:58:09 +0100
Subject: [PATCH 63/79] Update to htscodecs 1.3.0

 * Substantial reduction to name tokeniser memory.
 * Improve compiler SIMD support tests
 * Conditionally build SIMD object files based on configure results
 * Remove GNU make-ism
 * Merge Various speed improvements to rANS codecs (PR #53)
 * Fix compression of data blocks close to 2GB in size. (PR #55)
 * Rename NEWS to NEWS.md
 * Switch CI to rocklinux:9 instead of :latest
 * Preparation for v1.3.0 release
 * Fix big-endian rans_compress_O0_32x16() to match little-endian
 * Fix clash with overlapping buffers in 32x16 scalar O1 decoder.
 * Update README file to acknowledge rans32x16 options
 * Add a -b (benchmark) option to test/entropy
 * Add some simple entropy encoder benchmarks
 * Fix bug with RANS_ORDER_STRIPE on large blocks.
---
 htscodecs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htscodecs b/htscodecs
index 9cd552e17..3ef17f6fb 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 9cd552e173055730eb7701ebdbd13f6c579088e4
+Subproject commit 3ef17f6fb5b8b6b0ad2d4c1c562165664f0703f8

From 759f69618b4fcd05d92e6cb46ca9d58e511c0225 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Wed, 10 Aug 2022 15:14:52 +0100
Subject: [PATCH 64/79] Recognise FASTA/Q even if the file has a very long
 header line

secondline_is_bases() avoids false positives by verifying that the
second line contains only base-encoded alphabetic characters, but if
the first line is longer than hts_detect_format2()'s peek buffer, we
fail to recognise FASTA or FASTQ.

Revise (and rename to is_fastaq()) so the FASTA/Q check is:
1. Check that the (buffered part of the) first line is entirely textual.
2. Return true if we have none of the second line in our peek buffer.
3. Check that the (buffered part of the) second line contains only
   base-encoding alphabetic characters.

Fixes samtools/samtools#1689.
---
 hts.c                   | 19 ++++++++++++-------
 test/fastq/fastq.tst    |  3 +++
 test/fastq/longline.fq  |  4 ++++
 test/fastq/longline.sam |  1 +
 4 files changed, 20 insertions(+), 7 deletions(-)
 create mode 100644 test/fastq/longline.fq
 create mode 100644 test/fastq/longline.sam

diff --git a/hts.c b/hts.c
index 03809a11b..19981d83f 100644
--- a/hts.c
+++ b/hts.c
@@ -417,12 +417,17 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim)
     return 1;
 }
 
-static int
-secondline_is_bases(const unsigned char *u, const unsigned char *ulim)
+static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
 {
-    // Skip to second line, returning false if there isn't one
-    u = memchr(u, '\n', ulim - u);
-    if (u == NULL || ++u == ulim) return 0;
+    const unsigned char *eol = memchr(u, '\n', ulim - u);
+
+    // Check that the first line is entirely textual
+    if (! is_text_only(u, eol? eol : ulim)) return 0;
+
+    // If the first line is very long, consider the file to indeed be FASTA/Q
+    if (eol == NULL) return 1;
+
+    u = eol+1; // Now points to the first character of the second line
 
     // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
     while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
@@ -678,12 +683,12 @@ int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
         fmt->format = hts_crypt4gh_format;
         return 0;
     }
-    else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) {
+    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
         fmt->category = sequence_data;
         fmt->format = fasta_format;
         return 0;
     }
-    else if (len >= 1 && s[0] == '@' && secondline_is_bases(s, &s[len])) {
+    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
         fmt->category = sequence_data;
         fmt->format = fastq_format;
         return 0;
diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst
index 966f0ed8a..3b5fd9f4f 100644
--- a/test/fastq/fastq.tst
+++ b/test/fastq/fastq.tst
@@ -44,6 +44,9 @@ P minimal-q.sam $tview minimal.fa
 P multiline.sam $tview multiline.fq
 P multiline-q.sam $tview multiline.fa
 
+# FASTQ with a very long header line
+P longline.sam $tview -i fastq_aux longline.fq
+
 # Single file, unpaired data, with / without aux tags
 P single_noaux.sam $tview single.fq
 P single_noaux-q.sam $tview single.fa
diff --git a/test/fastq/longline.fq b/test/fastq/longline.fq
new file mode 100644
index 000000000..09cabd1a3
--- /dev/null
+++ b/test/fastq/longline.fq
@@ -0,0 +1,4 @@
+@readname	XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab
+ATGC
++
+qqqq
diff --git a/test/fastq/longline.sam b/test/fastq/longline.sam
new file mode 100644
index 000000000..4dc5e8215
--- /dev/null
+++ b/test/fastq/longline.sam
@@ -0,0 +1 @@
+readname	4	*	0	0	*	*	0	0	ATGC	qqqq	XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab

From a6ffbf50d55262595a094a46ea41dad3ab59d26d Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 10 Aug 2022 12:17:09 +0100
Subject: [PATCH 65/79] Fix SEGV when handling errors from sam_read1_sam
 without headers

---
 sam.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sam.c b/sam.c
index 3d479e1ed..c95d1c693 100644
--- a/sam.c
+++ b/sam.c
@@ -4101,7 +4101,7 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
         fp->line.l = 0;
         if (ret < 0) {
             hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
-            if (h->ignore_sam_err) goto err_recover;
+            if (h && h->ignore_sam_err) goto err_recover;
         }
     }
 

From 6a043b9e10d6fee9ef5c0a82e3716155b91e1fef Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Tue, 5 Jul 2022 17:09:04 +0100
Subject: [PATCH 66/79] Extend VCF API to distinguish between INS and DEL
 variant types

The change is largely backward compatible at the API and ABI level,
unless the var_type flag is queried for equality in the user program.
API alternatives for querying these flags are provided.
---
 htslib/vcf.h                     | 32 ++++++++++++++++++++++++--------
 test/test-bcf_set_variant_type.c |  6 +++---
 vcf.c                            | 32 ++++++++++++++++++++++++++++----
 3 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/htslib/vcf.h b/htslib/vcf.h
index 8f7d79fe3..b8162d7a2 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -138,13 +138,16 @@ extern uint8_t bcf_type_shift[];
 #define BCF_BT_FLOAT    5
 #define BCF_BT_CHAR     7
 
-#define VCF_REF      0
-#define VCF_SNP      1
-#define VCF_MNP      2
-#define VCF_INDEL    4
-#define VCF_OTHER    8
-#define VCF_BND     16    // breakend
-#define VCF_OVERLAP 32    // overlapping deletion, ALT=*
+#define VCF_REF         0
+#define VCF_SNP     (1<<0)
+#define VCF_MNP     (1<<1)
+#define VCF_INDEL   (1<<2)
+#define VCF_OTHER   (1<<3)
+#define VCF_BND     (1<<4)      // breakend
+#define VCF_OVERLAP (1<<5)      // overlapping deletion, ALT=*
+#define VCF_INS     (1<<6)      // implies VCF_INDEL
+#define VCF_DEL     (1<<7)      // implies VCF_INDEL
+#define VCF_ANY     (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL)       // any variant type (but not VCF_REF)
 
 typedef struct bcf_variant_t {
     int type, n;    // variant type and the number of bases affected, negative for deletions
@@ -751,7 +754,11 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line);
 
     /**
-     *  bcf_get_variant_type[s]()  - returns one of VCF_REF, VCF_SNP, etc
+     *  bcf_get_variant_type[s]()  - returns one of VCF_REF, VCF_SNP, etc. (DEPRECATED)
+     *  bcf_has_variant_type[s]()  - the preferred way to query the presence of variant types
+     *  @bitmask:   combination of VCF_* variant type above, VCF_INDEL implies VCF_INS|VCF_DEL
+     *  @mode:      `exact` for an exact match, `overlap` for at least one matching variant,
+     *              `subset` for the listed variants only
      */
     HTSLIB_EXPORT
     int bcf_get_variant_types(bcf1_t *rec);
@@ -759,6 +766,15 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     HTSLIB_EXPORT
     int bcf_get_variant_type(bcf1_t *rec, int ith_allele);
 
+    enum bcf_variant_match { exact, overlap, subset };
+
+    HTSLIB_EXPORT
+    int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode);
+
+    HTSLIB_EXPORT
+    int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode);
+
+
     HTSLIB_EXPORT
     int bcf_is_snp(bcf1_t *v);
 
diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c
index fef212dbb..e5092084e 100644
--- a/test/test-bcf_set_variant_type.c
+++ b/test/test-bcf_set_variant_type.c
@@ -46,19 +46,19 @@ static void test_bcf_set_variant_type()
     bcf_set_variant_type("A", "T", &var1);
     if ( var1.type != VCF_SNP)
     {
-        error("A -> T was not detected as a breakend");
+        error("A -> T was not detected as a SNP");
     }
 
     // Test INDEL
     bcf_variant_t var2a;
     bcf_set_variant_type("A", "AA", &var2a);
-    if ( var2a.type != VCF_INDEL)
+    if ( var2a.type != (VCF_INDEL|VCF_INS) )
     {
         error("A -> AA was not detected as an INDEL");
     }
     bcf_variant_t var2b;
     bcf_set_variant_type("AA", "A", &var2b);
-    if ( var2b.type != VCF_INDEL)
+    if ( var2b.type != (VCF_INDEL|VCF_DEL) )
     {
         error("AA -> A was not detected as a INDEL");
     }
diff --git a/vcf.c b/vcf.c
index aa7c558be..2edf68ddd 100644
--- a/vcf.c
+++ b/vcf.c
@@ -4196,12 +4196,12 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t
     {
         if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
         while ( *a ) a++;
-        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
+        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
     }
     else if ( *r && !*a )
     {
         while ( *r ) r++;
-        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
+        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
     }
     else if ( !*r && !*a )
     {
@@ -4216,13 +4216,13 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t
     {
         if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
         var->n = -(re-r);
-        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
+        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
         var->type = VCF_OTHER; return;
     }
     else if ( re==r )
     {
         var->n = ae-a;
-        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
+        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
         var->type = VCF_OTHER; return;
     }
 
@@ -4264,6 +4264,30 @@ int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
     if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
     return rec->d.var[ith_allele].type;
 }
+inline static int _has_variant_type(int type, int bitmask, enum bcf_variant_match mode)
+{
+    if ( mode==overlap ) return type & bitmask;
+
+    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
+    // ask for say `VCF_INS` or `VCF_INDEL` only
+    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
+    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
+
+    if ( mode==subset )
+    {
+        if ( ~bitmask & type ) return 0;
+        else return bitmask & type;
+    }
+    return type==bitmask ? type : 0;
+}
+int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode)
+{
+    return _has_variant_type(bcf_get_variant_type(rec, ith_allele), bitmask, mode);
+}
+int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode)
+{
+    return _has_variant_type(bcf_get_variant_types(rec), bitmask, mode);
+}
 
 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
 {

From 552219e856749acc31900e3b7a0db74e15f98d28 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Wed, 6 Jul 2022 13:03:01 +0100
Subject: [PATCH 67/79] Make values returned by bcf_get_variant_type[s]
 backward compatible

When the old bcf_get_variant_type[s] functions are used, the values
stored and returned are identical to the old interface.
---
 vcf.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/vcf.c b/vcf.c
index 2edf68ddd..5a1aa2758 100644
--- a/vcf.c
+++ b/vcf.c
@@ -4257,14 +4257,14 @@ static int bcf_set_variant_types(bcf1_t *b)
 int bcf_get_variant_types(bcf1_t *rec)
 {
     if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
-    return rec->d.var_type;
+    return rec->d.var_type & ~(VCF_INS|VCF_DEL);
 }
 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
 {
     if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
-    return rec->d.var[ith_allele].type;
+    return rec->d.var[ith_allele].type & ~(VCF_INS|VCF_DEL);
 }
-inline static int _has_variant_type(int type, int bitmask, enum bcf_variant_match mode)
+inline static int has_variant_type(int type, int bitmask, enum bcf_variant_match mode)
 {
     if ( mode==overlap ) return type & bitmask;
 
@@ -4282,11 +4282,13 @@ inline static int _has_variant_type(int type, int bitmask, enum bcf_variant_matc
 }
 int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode)
 {
-    return _has_variant_type(bcf_get_variant_type(rec, ith_allele), bitmask, mode);
+    if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
+    return has_variant_type(rec->d.var[ith_allele].type, bitmask, mode);
 }
 int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode)
 {
-    return _has_variant_type(bcf_get_variant_types(rec), bitmask, mode);
+    if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
+    return has_variant_type(rec->d.var_type, bitmask, mode);
 }
 
 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)

From 8d9193899ef3e34ea8ecc9f3fb72063e74697f69 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 2 Aug 2022 18:19:55 +0100
Subject: [PATCH 68/79] Adjust new variant type interfaces

* Remove `mode` from bcf_has_variant_type() interface, and add
  a special case for `VCF_REF`

Individual alleles only have a single variant type, so the only
useful mode is the overlap one (bitwise-and).  The exception
is VCF_REF, which is encoded as 0, so has to be tested for by
equality.

* Put `bcf_match_` prefix on enumerated values, to avoid name
  clashes

* Make `bitmask` unsigned for more predictable bitwise operations
  (the return value still has to be signed, though).

* Return -1 if bcf_set_variant_types() fails, or if the requested
  allele is not valid.

As callers using the legacy API won't be checking for a -1 return,
these unfortunately need to be made to call exit(1) on failure.
This is however an improvement on what would have happened under
the same conditions before, which would most likely have been a
NULL pointer dereference.

* Add a bcf_variant_length() function, to more easily access the
  rec->d.var[].n field.

* Be more specific on specifying the mask used to restrict the
  types the old functions return, in case more are added later.

* Improve documentation in the header.
---
 htslib/vcf.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++-----
 vcf.c        | 78 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 142 insertions(+), 26 deletions(-)

diff --git a/htslib/vcf.h b/htslib/vcf.h
index b8162d7a2..d3ef6560f 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -753,27 +753,101 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     HTSLIB_EXPORT
     int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line);
 
+    /// Get variant types in a BCF record
     /**
-     *  bcf_get_variant_type[s]()  - returns one of VCF_REF, VCF_SNP, etc. (DEPRECATED)
-     *  bcf_has_variant_type[s]()  - the preferred way to query the presence of variant types
-     *  @bitmask:   combination of VCF_* variant type above, VCF_INDEL implies VCF_INS|VCF_DEL
-     *  @mode:      `exact` for an exact match, `overlap` for at least one matching variant,
-     *              `subset` for the listed variants only
+     *  @param rec   BCF/VCF record
+     *  @return Types of variant present
+     *
+     *  The return value will be a bitwise-or of VCF_SNP, VCF_MNP,
+     *  VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP.  It will return
+     *  VCF_REF (i.e. 0) if none of the other types is present.
+     *  @deprecated Please use bcf_has_variant_types() instead
      */
     HTSLIB_EXPORT
     int bcf_get_variant_types(bcf1_t *rec);
 
+    /// Get variant type in a BCF record, for a given allele
+    /**
+     *  @param  rec        BCF/VCF record
+     *  @param  ith_allele Allele to check
+     *  @return Type of variant present
+     *
+     *  The return value will be one of VCF_REF, VCF_SNP, VCF_MNP,
+     *  VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP.
+     *  @deprecated Please use bcf_has_variant_type() instead
+     */
     HTSLIB_EXPORT
     int bcf_get_variant_type(bcf1_t *rec, int ith_allele);
 
-    enum bcf_variant_match { exact, overlap, subset };
+    /// Match mode for bcf_has_variant_types()
+    enum bcf_variant_match {
+        bcf_match_exact,   ///< Types present exactly match tested for
+        bcf_match_overlap, ///< At least one variant type in common
+        bcf_match_subset,  ///< Test set is a subset of types present
+    };
 
+    /// Check for presence of variant types in a BCF record
+    /**
+     *  @param rec      BCF/VCF record
+     *  @param bitmask  Set of variant types to test for
+     *  @param mode     Match mode
+     *  @return >0 if the variant types are present,
+     *           0 if not present,
+     *          -1 on error
+     *
+     *  @p bitmask should be the bitwise-or of the variant types (VCF_SNP,
+     *     VCF_MNP, etc.) to test for.
+     *
+     *  The return value is the bitwise-and of the set of types present
+     *  and @p bitmask.  Callers that want to check for the presence of more
+     *  than one type can avoid function call overhead by passing all the
+     *  types to be checked for in a single call to this function, in
+     *  bcf_match_overlap mode, and then check for them individually in the
+     *  returned value.
+     *
+     *  As VCF_REF is represented by 0 (i.e. the absence of other variants)
+     *  it should be tested for using
+     *    bcf_has_variant_types(rec, VCF_REF, bcf_match_exact)
+     *  which will return 1 if no other variant type is present, otherwise 0.
+     */
     HTSLIB_EXPORT
-    int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode);
+    int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, enum bcf_variant_match mode);
 
+    /// Check for presence of variant types in a BCF record, for a given allele
+    /**
+     *  @param rec         BCF/VCF record
+     *  @param ith_allele  Allele to check
+     *  @param bitmask     Set of variant types to test for
+     *  @return >0 if one of the variant types is present,
+     *           0 if not present,
+     *          -1 on error
+     *
+     *  @p bitmask should be the bitwise-or of the variant types (VCF_SNP,
+     *     VCF_MNP, etc.) to test for, or VCF_REF on its own.
+     *
+     *  The return value is the bitwise-and of the set of types present
+     *  and @p bitmask.  Callers that want to check for the presence of more
+     *  than one type can avoid function call overhead by passing all the
+     *  types to be checked for in a single call to this function, and then
+     *  check for them individually in the returned value.
+     *
+     *  As a special case, if @p bitmask is VCF_REF (i.e. 0), the function
+     *  tests for an exact match.  The return value will be 1 if the
+     *  variant type calculated for the allele is VCF_REF, otherwise if
+     *  any other type is present it will be 0.
+     */
     HTSLIB_EXPORT
-    int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode);
+    int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask);
 
+    /// Return the number of bases affected by a variant, for a given allele
+    /**
+     *  @param rec         BCF/VCF record
+     *  @param ith_allele  Allele index
+     *  @return The number of bases affected (negative for deletions),
+     *          or bcf_int32_missing on error.
+     */
+    HTSLIB_EXPORT
+    int bcf_variant_length(bcf1_t *rec, int ith_allele);
 
     HTSLIB_EXPORT
     int bcf_is_snp(bcf1_t *v);
diff --git a/vcf.c b/vcf.c
index 5a1aa2758..56af63054 100644
--- a/vcf.c
+++ b/vcf.c
@@ -4238,7 +4238,10 @@ static int bcf_set_variant_types(bcf1_t *b)
     bcf_dec_t *d = &b->d;
     if ( d->n_var < b->n_allele )
     {
-        d->var = (bcf_variant_t *) realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
+        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
+        if (!new_var)
+            return -1;
+        d->var = new_var;
         d->n_var = b->n_allele;
     }
     int i;
@@ -4254,42 +4257,81 @@ static int bcf_set_variant_types(bcf1_t *b)
     return 0;
 }
 
+// bcf_get_variant_type/bcf_get_variant_types should only return the following,
+// to be compatible with callers that are not expecting newer values
+// like VCF_INS, VCF_DEL.  The full set is available from the newer
+// bcf_has_variant_type* interfaces.
+#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
 int bcf_get_variant_types(bcf1_t *rec)
 {
-    if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
-    return rec->d.var_type & ~(VCF_INS|VCF_DEL);
+    if ( rec->d.var_type==-1 ) {
+        if (bcf_set_variant_types(rec) != 0) {
+            hts_log_error("Couldn't get variant types: %s", strerror(errno));
+            exit(1); // Due to legacy API having no way to report failures
+        }
+    }
+    return rec->d.var_type & ORIG_VAR_TYPES;
 }
+
 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
 {
-    if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
-    return rec->d.var[ith_allele].type & ~(VCF_INS|VCF_DEL);
+    if ( rec->d.var_type==-1 ) {
+        if (bcf_set_variant_types(rec) != 0) {
+            hts_log_error("Couldn't get variant types: %s", strerror(errno));
+            exit(1); // Due to legacy API having no way to report failures
+        }
+    }
+    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
+        hts_log_error("Requested allele outside valid range");
+        exit(1);
+    }
+    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
 }
-inline static int has_variant_type(int type, int bitmask, enum bcf_variant_match mode)
+#undef ORIG_VAR_TYPES
+
+int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
 {
-    if ( mode==overlap ) return type & bitmask;
+    if ( rec->d.var_type==-1 ) {
+        if (bcf_set_variant_types(rec) != 0) return -1;
+    }
+    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
+    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
+        return rec->d.var[ith_allele].type == VCF_REF;
+    }
+    return bitmask & rec->d.var[ith_allele].type;
+}
+
+int bcf_variant_length(bcf1_t *rec, int ith_allele)
+{
+    if ( rec->d.var_type==-1 ) {
+        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
+    }
+    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
+    return rec->d.var[ith_allele].n;
+}
+
+int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
+                          enum bcf_variant_match mode)
+{
+    if ( rec->d.var_type==-1 ) {
+        if (bcf_set_variant_types(rec) != 0) return -1;
+    }
+    uint32_t type = rec->d.var_type;
+    if ( mode==bcf_match_overlap ) return bitmask & type;
 
     // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
     // ask for say `VCF_INS` or `VCF_INDEL` only
     if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
     else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
 
-    if ( mode==subset )
+    if ( mode==bcf_match_subset )
     {
         if ( ~bitmask & type ) return 0;
         else return bitmask & type;
     }
+    // mode == bcf_match_exact
     return type==bitmask ? type : 0;
 }
-int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode)
-{
-    if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
-    return has_variant_type(rec->d.var[ith_allele].type, bitmask, mode);
-}
-int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode)
-{
-    if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
-    return has_variant_type(rec->d.var_type, bitmask, mode);
-}
 
 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
 {

From 2a646be681f892cebf725a471dcba222db86a43b Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 12 Aug 2022 12:26:20 +0100
Subject: [PATCH 69/79] Add trivial le_to_u8() endianness routine

This is even more trivial than le_to_i8() but can be useful for the
regular treatment of unsigned types in #define BRANCH type switches.
---
 htslib/hts_endian.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/htslib/hts_endian.h b/htslib/hts_endian.h
index 790d2d5c6..30ad8055d 100644
--- a/htslib/hts_endian.h
+++ b/htslib/hts_endian.h
@@ -113,6 +113,14 @@ typedef uint64_t uint64_u;
 #    endif
 #endif
 
+/// Get a uint8_t value from an unsigned byte array
+/** @param buf Pointer to source byte, may be unaligned
+ *  @return An 8-bit unsigned integer
+ */
+static inline uint8_t le_to_u8(const uint8_t *buf) {
+    return *buf;
+}
+
 /// Get a uint16_t value from an unsigned byte array
 /** @param buf Pointer to source byte, may be unaligned
  *  @return A 16 bit unsigned integer

From 8f140eed115bdb18257174420779757546e91c6c Mon Sep 17 00:00:00 2001
From: Andrew Whitwham <whitwham@users.noreply.github.com>
Date: Fri, 12 Aug 2022 15:13:38 +0100
Subject: [PATCH 70/79] NEWS additions for Summer 2022 Release. (PR #1465)

Co-authored-by: Rob Davies <rmd+git@sanger.ac.uk>
---
 NEWS | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 158 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 53a0d3c34..fd49755f4 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,156 @@
 Noteworthy changes in release a.b
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+* Make hfile_s3 refresh AWS credentials on expiry in order to make HTSlib work
+  better with AWS IAM credentials, which have a limited lifespan.
+  (PR#1462 and PR#1474, addresses #344)
+
+* Allow BAM headers between 2GB and 4GB in size once more.  This is not
+  permitted in the BAM specification but was allowed in an earlier version of
+  HTSlib.  There is now a warning at 2GB and a hard failure at 4GB.
+  (PR#1421, fixes #1420 and samtools#1613. Reported by John Marshall and
+  R C Mueller)
+
+* Improve error message when failing to load an index.
+  (PR#1468, example of the problem samtools#1637)
+
+* Permit MM (base modification) tags containing "." and "?" suffixes.  These
+  define implicit vs explicit coordinates.  See the SAM tags specification for
+  details.
+  (PR#1423 and PR#1426, fixes #1418.  PR#1469, fixes #1466.  Reported
+  by cjw85)
+
+* Warn if spaces instead of tabs are detected in a VCF file to prevent
+  confusion.
+  (PR#1328, fixes bcftools#1575.  Reported by ketkijoshi278)
+
+* Add an "sclen" filter expression keyword.  This is the length of a soft-clip,
+  both left and right end.  It may be combined with qlen (qlen-sclen) to obtain
+  the number of bases in the query sequence that have been aligned to the genome
+  i.e. it provides a way to compare local-alignment vs global-alignment length.
+  (PR#1441 and PR/samtools#1661, fixes #1436. Requested by Chang Y)
+
+* Improve error messages for CRAM reference mismatches.  If the user specifies
+  the wrong reference, the CRAM slice header MD5sum checks fail.  We now report
+  the SQ line M5 string too so it is possible to validate against the whole
+  chr in the ref.fa file.  The error message has also been improved to report
+  the reference name instead of #num.  Finally, we now hint at the likely cause,
+  which counters the misleading samtools supplied error of "truncated or
+  corrupt" file.
+  (PR#1427, fixes samtools#1640.  Reported by Jian-Guo Zhou)
+
+* Expose more of the CRAM API and add new functionality to extract the reference
+  from a CRAM file.
+  (PR#1429 and PR#1442)
+
+* Improvements to the implementation of embedded references in CRAM where no
+  external reference is specified.
+  (PR#1449, addresses some of the issues in #1445)
+
+* The CRAM writer now allows alignment records with RG:Z: aux tags that
+  don't have a corresponding @RG ID in the file header.  Previously these
+  tags would have been silently dropped.  HTSlib will complain whenever it
+  has to add one though, as such tags do not conform to recommended practice
+  for the SAM, BAM and CRAM formats.
+  (PR#1480, fixes #1479.  Reported by Alex Leonard)
+
+* Set tab delimiter in man page for tabix GFF3 sort.
+  (PR#1457.  Thanks to Colin Diesh)
+
+* When using libdeflate, the 1...9 scale of BGZF compression levels is
+  now remapped to the 1...12 range used by libdeflate instead of being
+  passed directly.  In particular, HTSlib levels 8 and 9 now map to
+  libdeflate levels 10 and 12, so it is possible to select the highest (but
+  slowest) compression offered by libdeflate.
+  (PR#1488, fixes #1477.  Reported by Gert Hulselmans)
+
+* The VCF variant API has been extended so that it can return separate flags
+  for INS and DEL variants as well as the existing INDEL one.  These flags
+  have not been added to the old bcf_get_variant_types() interface as
+  it could break existing users.  To access them, it is necessary to use new
+  functions bcf_has_variant_type() and bcf_has_variant_types().
+  (PR#1467)
+
+* The missing, but trivial, `le_to_u8()` function has been added to hts_endian.
+  (PR#1494, Thanks to John Marshall)
+
+Build changes
+-------------
+
+These are compiler, configuration and makefile based changes.
+
+* Update htscodecs to version 1.3.0 for new SIMD code + various fixes.
+  Updates the htscodecs submodule and adds changes necessary to make HTSlib
+  build the new SIMD codec implementations.
+  (PR#1438, PR#1489)
+
+* Fix clang builds under mingw.  Under mingw, clang requires dllexport to be
+  applied to both function declarations and function definitions.
+  (PR#1435, fixes #1433.  Reported by teepean)
+
+* Fix curl type warning with gcc 12.1 on Windows.
+  (PR#1443)
+
+* Detect ARM Neon support and only build appropriate SIMD object files.
+  (PR#1451, fixes #1450.  Thanks to John Marshall)
+
+* `make print-config` now reports extra CFLAGS that are needed to build the
+  SIMD parts of htscodecs.  These may be of use to third-party build
+  systems that don't use HTSlib's or htscodecs' build infrastructure. (PR#1485.
+  Thanks to John Marshall)
+
+Bug fixes
+---------
+
+* Fix bug when reading position -1 in BCF (0 in VCF), which is used to indicate
+  telomeric regions.  The BCF reader was incorrectly assuming the value stored
+  in the file was unsigned, so a VCF->BCF->VCF round-trip would change it
+  from 0 to 4294967296.
+  (PR#1476, fixes #1475 and bcftools#1753.  Reported by Rodrigo Martin)
+
+* Various bugs and quirks have been fixed in the filter expression engine,
+  mostly related to the handling of absent tags, and the is_true flag.
+  Note that as a result of these fixes, some filter expressions may give
+  different results:
+  - Fixed and-expressions including aux tag values which could give an invalid
+    true result depending on the order of terms.
+  - The expression `![NM]` is now true only if `NM` does not exist.  In
+    earlier versions it would also report true for tags like `NM:i:0` which
+    exist but have a value of zero.
+  - The expression `[X1] != 0` is now false when `X1` does not exist.  Earlier
+    versions would return true for this comparison when the tag was missing.
+  - NULL values due to missing tags now propagate through string, bitwise
+    and mathematical operations.  Logical operations always treat them as
+    false.
+  (PR#1463, fixes samtools#1670.  Reported by Gert Hulselmans;
+   PR#1478, fixes samtools#1677.  Reported by johnsonzcode)
+
+* Fix buffer overrun in bam_plp_insertion_mod.  Memory now grows to the proper
+  size needed for base modification data.
+  (PR#1430, fixes samtools#1652.  Reported by hd2326)
+
+* Remove limit of returned size from fai_retrieve().
+  (PR#1446, fixes samtools#1660.  Reported by Shane McCarthy)
+
+* Cap hts_getline() return value at INT_MAX.  Prevents hts_getline() from
+  returning a negative number (a fail) for very long string length values.
+  (PR#1448.  Thanks to John Marshall)
+
+* Fix breakend detection and test bcf_set_variant_type().
+  (PR#1456, fixes #1455.  Thanks to Martin Pollard)
+
+* Prevent arrays of BCF_BT_NULL values found in BCF files from causing
+  bcf_fmt_array() to call exit() as the type is unsupported.  These are
+  now tested for and caught by bcf_record_check(), which returns an
+  error code instead.  (PR#1486)
+
+* Improved detection of fasta and fastq files that have very long comments
+  following identifiers.  (PR#1491, thanks to John Marshall.
+  Fixes samtools/samtools#1689, reported by cjw85)
+
+* Fixed a SEGV triggered by giving a SAM file to `samtools import`.
+  (PR#1492)
+
 Noteworthy changes in release 1.15.1 (7th April 2022)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -435,7 +585,7 @@ These are compiler, configuration and makefile based changes.
   compiler flags. Thanks to John Marshall. (#1187)
 
 * Added 'fall through' comments to prevent warnings issued by Clang on
-  intentional fall through case statements, when building with 
+  intentional fall through case statements, when building with
   `-Wextra flag`. Thanks to John Marshall. (#1163)
 
 * Non-configure builds now define _XOPEN_SOURCE=600 to allow them to work
@@ -459,7 +609,7 @@ Bug fixes
   CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196)
 
 * Fixed a tabix bug that prevented setting the correct number of lines to be
-  skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; 
+  skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189;
   fixed #1186)
 
 * Made `bam_itr_next` an alias for `sam_itr_next`, to prevent it from crashing
@@ -1379,7 +1529,7 @@ Noteworthy changes in release 1.8 (3rd April 2018)
 Noteworthy changes in release 1.7 (26th January 2018)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-* BAM: HTSlib now supports BAMs which include CIGARs with more than 
+* BAM: HTSlib now supports BAMs which include CIGARs with more than
   65535 operations as per HTS-Specs 18th November (dab57f4 and 2f915a8).
 
 * BCF/VCF:
@@ -1397,13 +1547,13 @@ Noteworthy changes in release 1.7 (26th January 2018)
     (#651, #653; reported by Imran Haque and @egafni via pysam).
 
 * Multi-region iterator: The new structure takes a list of regions and
-  iterates over all, deduplicating reads in the process, and producing a 
-  full list of file offset intervals. This is usually much faster than 
+  iterates over all, deduplicating reads in the process, and producing a
+  full list of file offset intervals. This is usually much faster than
   repeatedly using the old single-region iterator on a series of regions.
 
 * Curl improvements:
   - Add Bearer token support via HTS_AUTH_LOCATION env (#600).
-  - Use CURL_CA_BUNDLE environment variable to override the CA (#622; 
+  - Use CURL_CA_BUNDLE environment variable to override the CA (#622;
     thanks to Garret Kelly & David Alexander).
   - Speed up (removal of excessive waiting) for both http(s) and ftp.
   - Avoid repeatedly reconnecting by removal of unnecessary seeks.
@@ -1412,7 +1562,7 @@ Noteworthy changes in release 1.7 (26th January 2018)
 * BGZF block caching, if enabled, now performs far better (#629; reported
   by Ram Yalamanchili).
 
-* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas 
+* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas
   Hickman).
 
 * Tidied up the drand48 support (intended for systems that do not
@@ -1510,7 +1660,7 @@ Release 1.4 (13 March 2017)
 
 * HTSlib now links against libbz2 and liblzma by default.  To remove these
   dependencies, run configure with options --disable-bz2 and --disable-lzma,
-  but note that this may make some CRAM files produced elsewhere unreadable. 
+  but note that this may make some CRAM files produced elsewhere unreadable.
 
 * Added a thread pool interface and replaced the bgzf multi-threading
   code to use this pool.  BAM and CRAM decoding is now multi-threaded

From b5cc0b76de4511690c938ad6a04d3cbf03bb3ab3 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 15 Aug 2022 16:17:17 +0100
Subject: [PATCH 71/79] Fix clang builds on Windows/mingw.

If we use HTSLIB_EXPORT once, it has to be used everywhere for that symbol.

Also see #1435
---
 htslib/ksort.h | 2 ++
 htslib_vars.mk | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/htslib/ksort.h b/htslib/ksort.h
index ad19fc47a..fe2933bd5 100644
--- a/htslib/ksort.h
+++ b/htslib/ksort.h
@@ -64,6 +64,7 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include "hts.h"
 
 #ifndef klib_unused
 #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
@@ -81,6 +82,7 @@ extern "C" {
 // problems on Windows.  Don't include htslib/hts_os.h for this as it
 // may not get on with older attempts to fix this in code that includes
 // this file.
+HTSLIB_EXPORT
 extern double hts_drand48(void);
 
 typedef struct {
diff --git a/htslib_vars.mk b/htslib_vars.mk
index 1f4c0905a..2a7b29450 100644
--- a/htslib_vars.mk
+++ b/htslib_vars.mk
@@ -42,7 +42,7 @@ htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h)
 htslib_klist_h = $(HTSPREFIX)htslib/klist.h
 htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h
 htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h
-htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h
+htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_h)
 htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h $(htslib_hts_defs_h) $(htslib_kroundup_h)
 htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h)
 htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) $(htslib_hts_endian_h)

From ecc3d0f419aaf9aa7009af60685445878ff1e357 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Sat, 13 Aug 2022 22:16:21 +0100
Subject: [PATCH 72/79] Implement endianness conversion in bcf_format_gt()

Add a convert() parameter to htslib/vcf.h's BRANCH-style macro, similar
to those previously added to all the BRANCH-style macros in vcf.c. Fixes
the VCF printing of records with more alleles than fit in BCF_BT_INT8.

Add a record with GT values >256 to test_bcf2vcf's VCF file, and regenerate
the corresponding BCF file via

  test/test_view -b -l0 -p test/tabix/vcf_file.bcf test/tabix/vcf_file.vcf
---
 htslib/vcf.h            |  20 +++++++++++---------
 test/tabix/vcf_file.bcf | Bin 2719 -> 4485 bytes
 test/tabix/vcf_file.vcf |   1 +
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/htslib/vcf.h b/htslib/vcf.h
index d3ef6560f..c94bea589 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -1460,21 +1460,23 @@ static inline int bcf_float_is_vector_end(float f)
 static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
 {
     uint32_t e = 0;
-    #define BRANCH(type_t, missing, vector_end) { \
-        type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \
+    #define BRANCH(type_t, convert, missing, vector_end) { \
+        uint8_t *ptr = fmt->p + isample*fmt->size; \
         int i; \
-        for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \
+        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
         { \
-            if ( i ) e |= kputc("/|"[ptr[i]&1], str) < 0; \
-            if ( !(ptr[i]>>1) ) e |= kputc('.', str) < 0; \
-            else e |= kputw((ptr[i]>>1) - 1, str) < 0; \
+            type_t val = convert(ptr); \
+            if ( val == vector_end ) break; \
+            if ( i ) e |= kputc("/|"[val&1], str) < 0; \
+            if ( !(val>>1) ) e |= kputc('.', str) < 0; \
+            else e |= kputw((val>>1) - 1, str) < 0; \
         } \
         if (i == 0) e |= kputc('.', str) < 0; \
     }
     switch (fmt->type) {
-        case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
-        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing, bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
         case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
         default: hts_log_error("Unexpected type %d", fmt->type); return -2;
     }
diff --git a/test/tabix/vcf_file.bcf b/test/tabix/vcf_file.bcf
index 75a64b38c7f62b59c2d8c1009f4cbdff6bda69c7..a4aafec47726dbcc1c335c27fe5810bf13e33923 100644
GIT binary patch
delta 1828
zcmXAqF^UyI6h-@g2bGzq1Nmlx1`{iSaoa(Gae-iL;4V6GFfeohuE9WX4~Fi*1-J!0
z=Qz-}tIn%__t$&XJzpQ)J-a;l{P1e=@9+KP^?qJHtfx2Y_dnk*@2`(v_m|~{&^@k?
zKb|iq*DWt!pO+V>^Rk>CT=vU$nU`_9Z@2rr-5Kn!_HE;uT!V}GWImZs=9BqgK9~>Y
zgZa*UXTCE>cz#pnp$<hjg>x#hDI2mO8?qtuw&8`yOgK$Ac@BY1^G)+j^KmIM^G)-C
z#(pqhXu@eimME}c!eK(5P-G??CM1C(GvP2H`4rjz-zobKxVujTMo^K>Luf*O4k9$6
z28hfgK7+_i$lqJ?ZCg6ik;Z3y#%FxSM|{Lbd>jS#0(*hIKnB#5q!cxkQ&u$t3fnlI
z0kxH6z;+t9zWmfzQhqATm=Yh|46y68Kn4_A(hexK;|wTv<JOOO1~g8d0X1SwjZ!(0
zG+ut{{`Me{0i`#b0mX0J0~t_%Nd|PFG3zN19A&)+FpX<K;o|0J95+9QC~k!gPn1Xl
zw15VT<+T8nvowuoPzyZZ@(gMLp!FJfw$$?10$+oO{GbQiIOnMWPP&>wE%0rL#M?nF
z@CA}*z>#;{25Nz46;caO^Vb42UVippAcI=Kk26s%KsDg}s~OY+-_Hn~-_LrKJSZHE
zYrwwZ=4Tu?KZh+ojshC6?E>z=mpzr&0#sfL(0B&5z)ynF3~B)-YrPxrRaVPi3;b|I
zcFsSCTH{*a$E2|j;FFrD7NGHVPz(Iz7|oy-Fl>DWwE%3MT7bH~7O-&R{B!dGdY5Xz
z{?m8{FiAV81&mv7kNpy*tY-{I<8A=oxcPnkRr8~8-aie{0vhn+dO!o;IkXm_@>+n#
xGpGd~bfX#60?+WetT*e`dcB_4tEcDX-TC+F`uz0n=f^keviNWD%ctwd{{Xyx{Wkyr

delta 43
wcmZowo-Zmc-_60o00RHn7@VA$7@D{kQ@Cb--6)>I#dhkQgMA9u<a9wf0NYs$O8@`>

diff --git a/test/tabix/vcf_file.vcf b/test/tabix/vcf_file.vcf
index de0a7c7b6..d3cf30fc8 100644
--- a/test/tabix/vcf_file.vcf
+++ b/test/tabix/vcf_file.vcf
@@ -35,3 +35,4 @@
 2	3199812	.	G	GTT,GT	82.7	PASS	AN=4;AC=2,2	GT:GQ:DP	1/2:322:26	1/2:322:26
 3	3212016	.	CTT	C,CT	79	PASS	AN=4;AC=2,2	GT:GQ:DP	1/2:91:26	1/2:91:26
 4	3258448	.	TACACACAC	T	.	PASS	AN=4;AC=2	GT:GQ:DP	0/1:325:31	0/1:325:31
+4	3258501	.	C	A,T,G,CA,CT,CG,CC,CAA,CAT,CAG,CAC,CTA,CTT,CTG,CTC,CGA,CGT,CGG,CGC,CCA,CCT,CCG,CCC,CAAA,CAAT,CAAG,CAAC,CATA,CATT,CATG,CATC,CAGA,CAGT,CAGG,CAGC,CACA,CACT,CACG,CACC,CTAA,CTAT,CTAG,CTAC,CTTA,CTTT,CTTG,CTTC,CTGA,CTGT,CTGG,CTGC,CTCA,CTCT,CTCG,CTCC,CGAA,CGAT,CGAG,CGAC,CGTA,CGTT,CGTG,CGTC,CGGA,CGGT,CGGG,CGGC,CGCA,CGCT,CGCG,CGCC,CCAA,CCAT,CCAG,CCAC,CCTA,CCTT,CCTG,CCTC,CCGA,CCGT,CCGG,CCGC,CCCA,CCCT,CCCG,CCCC,CAAAA,CAAAT,CAAAG,CAAAC,CAATA,CAATT,CAATG,CAATC,CAAGA,CAAGT,CAAGG,CAAGC,CAACA,CAACT,CAACG,CAACC,CATAA,CATAT,CATAG,CATAC,CATTA,CATTT,CATTG,CATTC,CATGA,CATGT,CATGG,CATGC,CATCA,CATCT,CATCG,CATCC,CAGAA,CAGAT,CAGAG,CAGAC,CAGTA,CAGTT,CAGTG,CAGTC,CAGGA,CAGGT,CAGGG,CAGGC,CAGCA,CAGCT,CAGCG,CAGCC,CACAA,CACAT,CACAG,CACAC,CACTA,CACTT,CACTG,CACTC,CACGA,CACGT,CACGG,CACGC,CACCA,CACCT,CACCG,CACCC,CTAAA,CTAAT,CTAAG,CTAAC,CTATA,CTATT,CTATG,CTATC,CTAGA,CTAGT,CTAGG,CTAGC,CTACA,CTACT,CTACG,CTACC,CTTAA,CTTAT,CTTAG,CTTAC,CTTTA,CTTTT,CTTTG,CTTTC,CTTGA,CTTGT,CTTGG,CTTGC,CTTCA,CTTCT,CTTCG,CTTCC,CTGAA,CTGAT,CTGAG,CTGAC,CTGTA,CTGTT,CTGTG,CTGTC,CTGGA,CTGGT,CTGGG,CTGGC,CTGCA,CTGCT,CTGCG,CTGCC,CTCAA,CTCAT,CTCAG,CTCAC,CTCTA,CTCTT,CTCTG,CTCTC,CTCGA,CTCGT,CTCGG,CTCGC,CTCCA,CTCCT,CTCCG,CTCCC,CGAAA,CGAAT,CGAAG,CGAAC,CGATA,CGATT,CGATG,CGATC,CGAGA,CGAGT,CGAGG,CGAGC,CGACA,CGACT,CGACG,CGACC,CGTAA,CGTAT,CGTAG,CGTAC,CGTTA,CGTTT,CGTTG,CGTTC,CGTGA,CGTGT,CGTGG,CGTGC,CGTCA,CGTCT,CGTCG,CGTCC,CGGAA,CGGAT,CGGAG,CGGAC,CGGTA,CGGTT,CGGTG,CGGTC,CGGGA,CGGGT,CGGGG,CGGGC,CGGCA,CGGCT,CGGCG,CGGCC,CGCAA,CGCAT,CGCAG,CGCAC,CGCTA,CGCTT,CGCTG,CGCTC,CGCGA,CGCGT,CGCGG,CGCGC,CGCCA,CGCCT,CGCCG,CGCCC,CCAAA,CCAAT,CCAAG,CCAAC,CCATA,CCATT,CCATG,CCATC,CCAGA,CCAGT,CCAGG,CCAGC,CCACA,CCACT,CCACG,CCACC,CCTAA,CCTAT,CCTAG,CCTAC,CCTTA,CCTTT,CCTTG,CCTTC,CCTGA,CCTGT	45	PASS	AN=4;AC=2	GT	0/300	240/260

From ce7e29837d9a858832d6b5de79bddb5639d22c3f Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 16 Aug 2022 11:42:20 +0100
Subject: [PATCH 73/79] Make ksort.h use hts_defs.h instead of hts.h for
 HTSLIB_EXPORT

Revised implementation of b5cc0b7, suggested by John Marshall.
See also #1497
---
 htslib/ksort.h | 2 +-
 htslib_vars.mk | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/htslib/ksort.h b/htslib/ksort.h
index fe2933bd5..7857d4c77 100644
--- a/htslib/ksort.h
+++ b/htslib/ksort.h
@@ -64,7 +64,7 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include "hts.h"
+#include "hts_defs.h"
 
 #ifndef klib_unused
 #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
diff --git a/htslib_vars.mk b/htslib_vars.mk
index 2a7b29450..6af71863c 100644
--- a/htslib_vars.mk
+++ b/htslib_vars.mk
@@ -42,7 +42,7 @@ htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h)
 htslib_klist_h = $(HTSPREFIX)htslib/klist.h
 htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h
 htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h
-htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_h)
+htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_defs_h)
 htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h $(htslib_hts_defs_h) $(htslib_kroundup_h)
 htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h)
 htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) $(htslib_hts_endian_h)

From 136e4a9b13bdcf0e2b8f2245a70f595c8c673a27 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 16 Aug 2022 16:11:38 +0100
Subject: [PATCH 74/79] Fix "make check" on MinGW / MacOS. (PR #1496)

Fixes for when "make check" is run without first doing "make".

If we haven't previously done a "make" or "make all", the .dll file
won't have been created.  The test/with-shlib.sh script will then fail
as the "cp -p ../../hts-*.dll ." command fails.  The analogous non-mingw
commands are "ln -s ../../libhts.so.* ." and similar, which don't fail in
the $? sense, but do fail in that they produce a bogus target
containing a wild-card pattern.

On MacOS, building with `--enable-plugins` has no dependency
between the plugins and libhts.dylib.  That also caused a test failure
because test/plugins-dlhts expects to find a dylib file.

- We no longer attempt to execute this test when the external plugins
  don't exist.

- "make plugins" now explicitly builds the shared library.

- "make check" and "make test" now have "all" as a dependency.
  While strictly not everything is a dependency of running the tests,
  ensuring all the code compiles can be viewed as one of the tests to
  perform (albeit only executed the first time due to makefile
  dependency checking rules).
---
 Makefile | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index bd020bd21..918eff0f7 100644
--- a/Makefile
+++ b/Makefile
@@ -317,6 +317,9 @@ endif
 
 BUILT_PLUGINS = $(PLUGIN_OBJS:.o=$(PLUGIN_EXT))
 
+ifneq "$(BUILT_PLUGINS)" ""
+plugins: lib-shared
+endif
 plugins: $(BUILT_PLUGINS)
 
 
@@ -563,7 +566,7 @@ SRC = $(srcprefix)
 #
 # If using MSYS, avoid poor shell expansion via:
 #    MSYS2_ARG_CONV_EXCL="*" make check
-check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODECS_TEST_TARGETS)
+check test: all $(HTSCODECS_TEST_TARGETS)
 	test/hts_endian
 	test/test_expr
 	test/test_kfunc
@@ -572,8 +575,12 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODEC
 	test/test_time_funcs
 	test/fieldarith test/fieldarith.sam
 	test/hfile
-	HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR)
-	HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR)
+	if test "x$(BUILT_PLUGINS)" != "x"; then \
+	    HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR); \
+	fi
+	if test "x$(BUILT_PLUGINS)" != "x"; then \
+	  HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR); \
+	fi
 	test/test_bgzf test/bgziptest.txt
 	test/test-parse-reg -t test/colons.bam
 	cd test/sam_filter && ./filter.sh filter.tst

From 9fcd248c764c74dc6a4c1e6189a9e0c3204dce5b Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 16 Aug 2022 17:58:51 +0100
Subject: [PATCH 75/79] Improve SIMD detection

Change ax_check_compile_flag to use AC_LINK_IFELSE so that it
can't cheat by making implicit declarations of the symbols
we're testing for.

Include _mm256_extract_epi64() in the AVX2 tests.  This only
exists on X86_64, so trying to build the AVX2 code on i686 didn't
work.
---
 configure.ac                | 3 ++-
 hts_probe_cc.sh             | 3 ++-
 m4/ax_check_compile_flag.m4 | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index b848dc633..7d40948a6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -110,7 +110,8 @@ AX_CHECK_COMPILE_FLAG([-mavx2], [
   ]],[[
     __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
     __m256i b = _mm256_add_epi32(a, a);
-    return *((char *) &b);
+    long long c = _mm256_extract_epi64(b, 0);
+    return (int) c;
   ]])])
 
 dnl Options for rANS32x16 avx512 version
diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh
index 905279099..37d6bae7e 100755
--- a/hts_probe_cc.sh
+++ b/hts_probe_cc.sh
@@ -71,7 +71,8 @@ cat - <<'EOF' > conftest.c
 int main(int argc, char **argv) {
     __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
     __m256i b = _mm256_add_epi32(a, a);
-    return *((char *) &b);
+    long long c = _mm256_extract_epi64(b, 0);
+    return (int) c;
 }
 EOF
 FLAGS="-mavx2"
diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4
index bd753b34d..16bb46495 100644
--- a/m4/ax_check_compile_flag.m4
+++ b/m4/ax_check_compile_flag.m4
@@ -42,7 +42,7 @@ AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
 AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
   ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
   _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
-  AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
+  AC_LINK_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
     [AS_VAR_SET(CACHEVAR,[yes])],
     [AS_VAR_SET(CACHEVAR,[no])])
   _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])

From 356c4c48a05248f440948d67cf0e792685931296 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 16 Aug 2022 18:05:34 +0100
Subject: [PATCH 76/79] Silence some overflow warnings on i686

Apparently `if (sizeof(time_t) < 8)` isn't enough to keep the
compiler from warning about the code that only runs when it's
bigger.
---
 test/test_time_funcs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c
index 9ca292f7a..0e0512988 100644
--- a/test/test_time_funcs.c
+++ b/test/test_time_funcs.c
@@ -71,7 +71,8 @@ int main(int argc, char **argv) {
     if (test_normalised(0, INT_MAX - 1000, 1000) != 0)
         return EXIT_FAILURE;
     if (sizeof(time_t) >= 8) {
-        if (test_normalised(INT_MAX - 1000, (time_t) INT_MAX * 2, 1000) != 0)
+        if (test_normalised(INT_MAX - 1000,
+                            (time_t)((int64_t) INT_MAX * 2), 1000) != 0)
             return EXIT_FAILURE;
     }
 
@@ -116,7 +117,8 @@ int main(int argc, char **argv) {
         res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) -1);
     } else {
         // 2038-01-19 03:14:08
-        res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) INT_MAX + 1);
+        res |= test_specific(2038, 1, 19, 3, 14, 8,
+                             (time_t)((int64_t) INT_MAX + 1));
     }
 
     return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE;

From b47469a6d322220d4412798ccaf80e4b5a8f6163 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 16 Aug 2022 18:39:42 +0100
Subject: [PATCH 77/79] Silence a gcc-10 format-truncation warning.

It gets upset because in theory the combination of prefix and
filename could be longer than the value snprintf() can return.
Trying to use a filename that long would be somewhat bizarre.
---
 hts.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hts.c b/hts.c
index 19981d83f..8b437f2b9 100644
--- a/hts.c
+++ b/hts.c
@@ -1354,6 +1354,8 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode,
     int ret = -1;
 
     if (fn2_len > sizeof(fn_buf)) {
+        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
+            return -1;
         fn2 = malloc(fn2_len);
         if (!fn2) return -1;
     }

From a1013a6e5eca6ba6b1a36b2a2a012952a5cf7df5 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 16 Aug 2022 16:41:02 +0100
Subject: [PATCH 78/79] Fix newer clang warning of unused variable.

---
 tbx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tbx.c b/tbx.c
index f0310a257..3af2c09fb 100644
--- a/tbx.c
+++ b/tbx.c
@@ -93,12 +93,11 @@ int tbx_name2id(tbx_t *tbx, const char *ss)
 
 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
 {
-    int i, b = 0, id = 1, ncols = 0;
+    int i, b = 0, id = 1;
     char *s;
     intv->ss = intv->se = 0; intv->beg = intv->end = -1;
     for (i = 0; i <= len; ++i) {
         if (line[i] == '\t' || line[i] == 0) {
-            ++ncols;
             if (id == conf->sc) {
                 intv->ss = line + b; intv->se = line + i;
             } else if (id == conf->bc) {

From fecbd67cb349896a1f858ee8d4aad320464a34de Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Wed, 17 Aug 2022 14:22:32 +0100
Subject: [PATCH 79/79] Add some more NEWS updates.

---
 NEWS | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/NEWS b/NEWS
index fd49755f4..3cc571f52 100644
--- a/NEWS
+++ b/NEWS
@@ -74,6 +74,9 @@ Noteworthy changes in release a.b
 * The missing, but trivial, `le_to_u8()` function has been added to hts_endian.
   (PR#1494, Thanks to John Marshall)
 
+* bcf_format_gt() now works properly on big-endian platforms.
+  (PR#1495, Thanks to John Marshall)
+
 Build changes
 -------------
 
@@ -82,11 +85,11 @@ These are compiler, configuration and makefile based changes.
 * Update htscodecs to version 1.3.0 for new SIMD code + various fixes.
   Updates the htscodecs submodule and adds changes necessary to make HTSlib
   build the new SIMD codec implementations.
-  (PR#1438, PR#1489)
+  (PR#1438, PR#1489, PR#1500)
 
 * Fix clang builds under mingw.  Under mingw, clang requires dllexport to be
   applied to both function declarations and function definitions.
-  (PR#1435, fixes #1433.  Reported by teepean)
+  (PR#1435, PR#1497, PR#1498 fixes #1433.  Reported by teepean)
 
 * Fix curl type warning with gcc 12.1 on Windows.
   (PR#1443)
@@ -99,6 +102,11 @@ These are compiler, configuration and makefile based changes.
   systems that don't use HTSlib's or htscodecs' build infrastructure. (PR#1485.
   Thanks to John Marshall)
 
+* Fixed some Makefile dependency issues for the "check"/"test" targets
+  and plugins.  In particular, "make check" will now build the "all" target,
+  if not done already, before running the tests.
+  (PR#1496)
+
 Bug fixes
 ---------