diff --git a/.cirrus.yml b/.cirrus.yml
index 3a7b910a5..dc93b071d 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -15,7 +15,8 @@ libdeflate_template: &LIBDEFLATE
       pushd "$HOME"
       git clone --depth 1 https://github.com/ebiggers/libdeflate.git
       pushd libdeflate
-      make -j 4 CFLAGS='-fPIC -O3' libdeflate.a
+      cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS='-g -O3 -fPIC'
+      cmake --build build --verbose
       popd
       popd
     fi
@@ -27,7 +28,7 @@ compile_template: &COMPILE
   compile_script: |
     git submodule update --init --recursive
     if test "x$USE_LIBDEFLATE" = "xyes"; then
-      CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate'
+      CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate/build" --with-libdeflate'
     else
       CONFIG_OPTS='--without-libdeflate'
     fi
@@ -75,6 +76,13 @@ gcc_task:
        CFLAGS: -std=c99 -pedantic -Wformat=2
        USE_LIBDEFLATE: yes
 
+  install_script: |
+    apt-get update
+    apt-get install -y --no-install-suggests --no-install-recommends     \
+        ca-certificates libc-dev make git autoconf automake              \
+        zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \
+        cmake
+
   << : *LIBDEFLATE
   << : *COMPILE
   << : *TEST
@@ -129,6 +137,7 @@ rocky_task:
     LC_ALL: C
     CIRRUS_CLONE_DEPTH: 1
     USE_CONFIG: yes
+    CFLAGS: -std=gnu90
 
   # NB: we could consider building a docker image with these
   # preinstalled and specifying that instead, to speed up testing.
@@ -172,8 +181,8 @@ arm_ubuntu_task:
 
 macosx_task:
   name: macosx + clang
-  osx_instance:
-    image: monterey-base
+  macos_instance:
+    image: ghcr.io/cirruslabs/macos-ventura-base:latest
 
   environment:
     CC: clang
@@ -187,8 +196,9 @@ macosx_task:
        USE_CONFIG: yes
        USE_LIBDEFLATE: yes
 
-  package_install_script:
-    - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git
+  package_install_script: |
+    HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git \
+        cmake
 
   << : *LIBDEFLATE
   << : *COMPILE
diff --git a/.gitattributes b/.gitattributes
index a14bb82b1..5d9850bc7 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -20,3 +20,7 @@ README.md       export-ignore
 # Remove the text attribute from index_dos.sam, so that the line separators
 # for the test file don't get converted into Unix format.
 test/index_dos.sam -text
+
+# Remove the text attribute from various faidx test files
+test/faidx/faidx*.fa* -text
+test/faidx/fastqs*.fq* -text
diff --git a/.gitignore b/.gitignore
index 6b58e8439..1dafc3615 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,6 +44,8 @@ shlib-exports-*.txt
 /bgzip
 /htsfile
 /tabix
+/test/faidx/*.tmp*
+/test/faidx/FAIL*
 /test/fieldarith
 /test/hfile
 /test/hts_endian
@@ -59,6 +61,7 @@ shlib-exports-*.txt
 /test/test-bcf_set_variant_type
 /test/test_bgzf
 /test/test_expr
+/test/test_faidx
 /test/test_index
 /test/test_introspection
 /test/test_kfunc
diff --git a/INSTALL b/INSTALL
index 2f6deacb2..dd2c3ec90 100644
--- a/INSTALL
+++ b/INSTALL
@@ -110,6 +110,7 @@ The 'make install' command installs the libraries, library header files,
 utilities, several manual pages, and a pkgconfig file to /usr/local.
 The installation location can be changed by configuring with --prefix=DIR
 or via 'make prefix=DIR install' (see Installation Locations below).
+Shared library permissions can be set via e.g. 'make install LIB_PERM=755'.
 
 
 Configuration
@@ -291,3 +292,13 @@ mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools
 mingw-w64-x86_64-tools-git
 
 (The last is only needed for building libraries compatible with MSVC.)
+
+HP-UX
+-----
+
+HP-UX requires that shared libraries have execute permission.  The
+default for HTSlib is to install with permission 644 (read-write for
+owner and read-only for group / other). This can be overridden by
+setting the LIB_PERM variable at install time with:
+
+    make install LIB_PERM=755
diff --git a/LICENSE b/LICENSE
index cf2e9f82a..925d47b40 100644
--- a/LICENSE
+++ b/LICENSE
@@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.]
 
 The MIT/Expat License
 
-Copyright (C) 2012-2022 Genome Research Ltd.
+Copyright (C) 2012-2023 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.]
 
 The Modified-BSD License
 
-Copyright (C) 2012-2022 Genome Research Ltd.
+Copyright (C) 2012-2023 Genome Research Ltd.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
diff --git a/Makefile b/Makefile
index 374141898..3e95a0bef 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile for htslib, a C library for high-throughput sequencing data formats.
 #
-#    Copyright (C) 2013-2022 Genome Research Ltd.
+#    Copyright (C) 2013-2023 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -39,6 +39,7 @@ CFLAGS   = -g -Wall -O2 -fvisibility=hidden
 EXTRA_CFLAGS_PIC = -fpic
 TARGET_CFLAGS =
 LDFLAGS  = -fvisibility=hidden
+VERSION_SCRIPT_LDFLAGS = -Wl,-version-script,$(srcprefix)htslib.map
 LIBS     = $(htslib_default_libs)
 
 prefix      = /usr/local
@@ -58,7 +59,8 @@ MKDIR_P = mkdir -p
 INSTALL = install -p
 INSTALL_DATA    = $(INSTALL) -m 644
 INSTALL_DIR     = $(MKDIR_P) -m 755
-INSTALL_LIB     = $(INSTALL_DATA)
+LIB_PERM        = 644
+INSTALL_LIB     = $(INSTALL) -m $(LIB_PERM)
 INSTALL_MAN     = $(INSTALL_DATA)
 INSTALL_PROGRAM = $(INSTALL)
 
@@ -80,6 +82,7 @@ BUILT_TEST_PROGRAMS = \
 	test/sam \
 	test/test_bgzf \
 	test/test_expr \
+	test/test_faidx \
 	test/test_kfunc \
 	test/test_kstring \
 	test/test_mod \
@@ -140,8 +143,8 @@ LIBHTS_SOVERSION = 3
 # is not strictly necessary and should be removed the next time
 # LIBHTS_SOVERSION is bumped (see #1144 and
 # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23)
-MACH_O_COMPATIBILITY_VERSION = 3.1.16
-MACH_O_CURRENT_VERSION = 3.1.16
+MACH_O_COMPATIBILITY_VERSION = 3.1.17
+MACH_O_CURRENT_VERSION = 3.1.17
 
 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string
 # even if this is a dirty or untagged Git working tree.
@@ -160,12 +163,20 @@ show-version:
 	@echo PACKAGE_VERSION = $(PACKAGE_VERSION)
 	@echo NUMERIC_VERSION = $(NUMERIC_VERSION)
 
+config_vars.h: override escape=$(subst ',\x27,$(subst ",\",$(subst \,\\,$(1))))
+config_vars.h: override hts_cc_escaped=$(call escape,$(CC))
+config_vars.h: override hts_cppflags_escaped=$(call escape,$(CPPFLAGS))
+config_vars.h: override hts_cflags_escaped=$(call escape,$(CFLAGS))
+config_vars.h: override hts_ldflags_escaped=$(call escape,$(LDFLAGS))
+config_vars.h: override hts_libs_escaped=$(call escape,$(LIBS))
+
 config_vars.h:
-	echo '#define HTS_CC "$(CC)"' > $@
-	echo '#define HTS_CPPFLAGS "$(CPPFLAGS)"' >> $@
-	echo '#define HTS_CFLAGS "$(CFLAGS)"' >> $@
-	echo '#define HTS_LDFLAGS "$(LDFLAGS)"' >> $@
-	echo '#define HTS_LIBS "$(LIBS)"' >> $@
+	printf '#define HTS_CC "%s"\n#define HTS_CPPFLAGS "%s"\n#define HTS_CFLAGS "%s"\n#define HTS_LDFLAGS "%s"\n#define HTS_LIBS "%s"\n' \
+	'$(hts_cc_escaped)' \
+	'$(hts_cppflags_escaped)' \
+	'$(hts_cflags_escaped)' \
+	'$(hts_ldflags_escaped)' \
+	'$(hts_libs_escaped)' > $@
 
 .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so
 
@@ -344,7 +355,7 @@ print-config:
 # file used at runtime (when $LD_LIBRARY_PATH includes the build directory).
 
 libhts.so: $(LIBHTS_OBJS:.o=.pico)
-	$(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread
+	$(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(VERSION_SCRIPT_LDFLAGS) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread
 	ln -sf $@ libhts.so.$(LIBHTS_SOVERSION)
 
 # Similarly this also creates libhts.NN.dylib as a byproduct, so that programs
@@ -494,7 +505,7 @@ htsfile: htsfile.o libhts.a
 tabix: tabix.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread
 
-bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h)
+bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h)
 htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
 tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h)
 
@@ -583,12 +594,13 @@ check test: all $(HTSCODECS_TEST_TARGETS)
 	fi
 	test/test_bgzf test/bgziptest.txt
 	test/test-parse-reg -t test/colons.bam
+	cd test/faidx && ./test-faidx.sh faidx.tst
 	cd test/sam_filter && ./filter.sh filter.tst
 	cd test/tabix && ./test-tabix.sh tabix.tst
 	cd test/mpileup && ./test-pileup.sh mpileup.tst
 	cd test/fastq && ./test-fastq.sh
 	cd test/base_mods && ./base-mods.sh base-mods.tst
-	REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq
+	REF_PATH=: test/sam test/ce.fa test/faidx/faidx.fa test/faidx/fastqs.fq
 	test/test-regidx
 	cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-}
 
@@ -622,6 +634,9 @@ test/test_bgzf: test/test_bgzf.o libhts.a
 test/test_expr: test/test_expr.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread
 
+test/test_faidx: test/test_faidx.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread
+
 test/test_kfunc: test/test_kfunc.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread
 
@@ -739,6 +754,7 @@ test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_reg
 test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h)
 test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h)
 test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h)
+test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h)
 test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h)
 test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h)
 test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h)
@@ -789,7 +805,7 @@ header-exports.txt: test/header_syms.pl htslib/*.h
 	test/header_syms.pl htslib/*.h | sort -u -o $@
 
 shlib-exports-so.txt: libhts.so
-	nm -D -g libhts.so | awk '$$2 == "T" { print $$3 }' | sort -u -o $@
+	nm -D -g libhts.so | awk '$$2 == "T" { sub("@.*", "", $$3); print $$3 }' | sort -u -o $@
 
 shlib-exports-dylib.txt: libhts.dylib
 	nm -Ug libhts.dylib | awk '$$2 == "T" { sub("^_", "", $$3); print $$3 }' | sort -u -o $@
@@ -797,6 +813,31 @@ shlib-exports-dylib.txt: libhts.dylib
 shlib-exports-dll.txt: hts.dll.a
 	nm -g hts.dll.a | awk '$$2 == "T" { print $$3 }' | sort -u -o $@
 
+$(srcprefix)htslib.map: libhts.so
+	LC_ALL=C ; export LC_ALL; \
+	curr_vers=`expr 'X$(PACKAGE_VERSION)' : 'X\([0-9]*\.[0-9.]*\)'` ; \
+	last_vers=`awk '/^HTSLIB_[0-9](\.[0-9]+)+/ { lv = $$1 } END { print lv }' htslib.map` ; \
+	if test "x$$curr_vers" = 'x' || test "x$$last_vers" = 'x' ; then \
+	    echo "Version check failed : $$curr_vers / $$las_vers" 1>&2 ; \
+	    exit 1 ; \
+	fi && \
+	if test "HTSLIB_$$curr_vers" = "$$last_vers" ; then \
+	    echo "Refusing to update $@ - HTSlib version not changed" 1>&2 ; \
+	    exit 1 ; \
+	fi && \
+	nm --with-symbol-versions -D -g libhts.so | awk '$$2 ~ /^[DGRT]$$/ && $$3 ~ /@@Base$$/ && $$3 !~ /^(_init|_fini|_edata)@@/ { sub(/@@Base$$/, ";", $$3); print "    " $$3 }' > $@.tmp && \
+	if [ -s $@.tmp ] ; then \
+	    cat $@ > $@.new.tmp && \
+	    printf '\n%s {\n' "HTSLIB_$$curr_vers" >> $@.new.tmp && \
+	    cat $@.tmp >> $@.new.tmp && \
+	    printf '} %s;\n' "$$last_vers" >> $@.new.tmp && \
+	        rm -f $@.tmp && \
+	        mv $@.new.tmp $@ ; \
+	    fi ; \
+	else \
+	    rm -f $@.tmp ; \
+	fi
+
 install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig
 	$(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir)
 	if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi
@@ -845,7 +886,9 @@ htslib-uninstalled.pc: htslib.pc.tmp
 
 
 testclean:
-	-rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt
+	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \
+               test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \
+               header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt
 	-rm -rf htscodecs/tests/test.out
 
 # Only remove this in git checkouts
diff --git a/NEWS b/NEWS
index 7ae851a47..f6c3de965 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,128 @@
+Noteworthy changes in release 1.17 (21st February 2023)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* A new API for iterating through a BAM record's aux field.
+  (PR#1354, addresses #1319.  Thanks to John Marshall)
+
+* Text mode for bgzip. Allows bgzip to compress lines of text with block breaks
+  at newlines.
+  (PR#1493, thanks to Mike Lin for the initial version PR#1369)
+
+* Make tabix support CSI indices with large positions.  Unlike SAM and VCF
+  files, BED files do not set a maximum reference length which hindered CSI
+  support.  This change sets an arbitrary large size of 100G to enable it to
+  work.
+  (PR#1506)
+
+* Add a fai_line_length function.  Exposes the internal line-wrap length.
+  (PR#1516)
+
+* Check for invalid barcode tags in fastq output.
+  (PR#1518, fixes samtools#1728.  Reported by Poshi)
+
+* Warn if reference found in a CRAM file is not contained in the specified
+  reference file.
+  (PR#1517 and PR#1521, adds diagnostics for #1515. Reported by Wei WeiDeng)
+
+* Add a faidx_seq_len64 function that can return sequence lengths longer than
+  INT_MAX.  At the same time limit faidx_seq_len to INT_MAX output.  Also add a
+  fai_adjust_region to ensure given ranges do not go beyond the end of the
+  requested sequence.
+  (PR#1519)
+
+* Add a bcf_strerror function to give text descriptions of BCF errors.
+  (PR#1510)
+
+* Add CRAM SQ/M5 header checking when specifying a fasta file.  This is to
+  prevent creating a CRAM that cannot be decoded again.
+  (PR#1522.  In response to samtools#1748 though not a direct fix)
+
+* Improve support for very long input lines (> 2Gbyte).  This is mostly useful
+  for tabix which does not do much interpretation of its input.
+  (PR#1542, a partial fix for #1539)
+
+* Speed up load_ref_portion.  This function has been sped up by about 7x, which
+  speeds up low-depth CRAM decoding by about 10%.
+  (PR#1551)
+
+* Expand CRAM API to cope with new samtools cram_size command.
+  (PR#1546)
+
+* Merges neighbouring I and D ops into one op within pileup. This means
+  4M1D1D1D3M is reported as 4M3D3M.   Fixing this in sam.c means not only is
+  samtools mpileup now looking better, but any tool using the mpileup API will
+  be getting consistent results.
+  (PR#1552, fixes the last remaining part of samtools#139)
+
+* Update the API documentation for bgzf_mt as it refered to a previous
+  iteration.
+  (PR#1556, fixes #1553.  Reported by Raghavendra Padmanabhan)
+
+
+Build changes
+-------------
+
+* Use POSIX grep in testing as egrep and fgrep are considered obsolete.
+  (PR#1509, thanks to David Seifert)
+
+* Switch to building libdefalte with cmake for Cirris CI.
+  (PR#1511)
+
+* Ensure strings in config_vars.h are escaped correctly.
+  (PR#1530, fixes #1527. Reported by Lucas Czech)
+
+* Easier modification of shared library permissions during install.
+  (PR#1532, fixes #1525. Reported by StephDC)
+
+* Fix build on ancient compilers.  Added -std=gnu90 to build tests so older
+  C compilers will still be happy.
+  (PR#1524, fixes #1523.  Reported by Martin Jakt)
+
+* Switch MacOS CI tests to an ARM-based image.
+  (PR#1536)
+
+* Cut down the number of embed_ref=2 tests that get run.
+  (PR#1537)
+
+* Add symbol versions to libhts.so.  This is to aid package developers.
+  (PR#1560 addresses #1505, thanks to John Marshall. Reported by Stefan Bruens)
+
+* htscodecs now updated to v1.4.0.
+  (PR#1563)
+
+* Cleaned up misleading system error reports in test_bgzf.
+  (PR#1565)
+
+Bug fixes
+---------
+
+* VCF. Fix n-squared complexity in sample line with many adjacent tabs [fuzz].
+  (PR#1503)
+
+* Improved bcftools detection and reporting of bgzf decode errors.
+  (PR#1504, thanks to Lilian Janin. PR#1529 thanks to Bergur Ragnarsson, fixes
+  #1528. PR#1554)
+
+* Prevent crash when the only FASTA entry has no sequence [fuzz].
+  (PR#1507)
+
+* Fixed typo in sam.h documentation.
+  (PR#1512, thanks to kojix2)
+
+* Fix buffer read-overrun in bam_plp_insertion_mod.
+  (PR#1520)
+
+* Fix hash keys being left behind by bcf_hdr_remove.
+  (PR#1535, fixes #1533.  Reported by Giulio Genovese in #842)
+
+* Make bcf_hdr_idinfo_exists more robust by checking id value exists.
+  (PR#1544, fixes #1538.  Reported by Giulio Genovese)
+
+* CRAM improvements. Fixed crash with multi-threaded CRAM.  Fixed a bug in the
+  codec parameter learning for CRAM 3.1 name tokeniser. Fixed Cram compression
+  container substitution matrix generation,
+  (PR#1558, PR#1559 and PR#1562) 
+
 Noteworthy changes in release 1.16 (18th August 2022)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/bgzf.c b/bgzf.c
index a969b1567..468289106 100644
--- a/bgzf.c
+++ b/bgzf.c
@@ -989,6 +989,8 @@ int bgzf_read_block(BGZF *fp)
 {
     hts_tpool_result *r;
 
+    if (fp->errcode) return -1;
+
     if (fp->mt) {
     again:
         if (fp->mt->hit_eof) {
@@ -1005,11 +1007,14 @@ int bgzf_read_block(BGZF *fp)
                 if (fp->uncompressed_block == NULL) return -1;
                 fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE;
             } // else it's already allocated with malloc, maybe even in-use.
-            if (mt_destroy(fp->mt) < 0)
+            if (mt_destroy(fp->mt) < 0) {
                 fp->errcode = BGZF_ERR_IO;
+            }
             fp->mt = NULL;
             hts_tpool_delete_result(r, 0);
-
+            if (fp->errcode) {
+                return -1;
+            }
             goto single_threaded;
         }
 
@@ -1018,6 +1023,7 @@ int bgzf_read_block(BGZF *fp)
             hts_log_error("BGZF decode jobs returned error %d "
                           "for block offset %"PRId64,
                           j->errcode, j->block_address);
+            hts_tpool_delete_result(r, 0);
             return -1;
         }
 
@@ -1479,6 +1485,8 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j)
     int64_t block_address;
     block_address = htell(fp->fp);
 
+    j->block_address = block_address;  // in case we exit with j->errcode
+
     if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
     count = hpeek(fp->fp, header, sizeof(header));
     if (count == 0) // no data read
@@ -2285,11 +2293,12 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
             fp->block_length = 0;
         }
     } while (state == 0);
+    if (state < -1) return state;
     if (str->l == 0 && state < 0) return state;
     fp->uncompressed_address += str->l + 1;
     if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--;
     str->s[str->l] = 0;
-    return str->l;
+    return str->l <= INT_MAX ? (int) str->l : INT_MAX;
 }
 
 void bgzf_index_destroy(BGZF *fp)
diff --git a/bgzip.1 b/bgzip.1
index 1ada36630..abf9fcedb 100644
--- a/bgzip.1
+++ b/bgzip.1
@@ -1,10 +1,10 @@
-.TH bgzip 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools"
+.TH bgzip 1 "21 February 2023" "htslib-1.17" "Bioinformatics tools"
 .SH NAME
 .PP
 bgzip \- Block compression/decompression utility
 .\"
 .\" Copyright (C) 2009-2011 Broad Institute.
-.\" Copyright (C) 2018, 2021 Genome Research Limited.
+.\" Copyright (C) 2018, 2021-2022 Genome Research Limited.
 .\"
 .\" Author: Heng Li <lh3@sanger.ac.uk>
 .\"
@@ -74,6 +74,17 @@ after decompression completes the input file will be removed.
 
 .SH OPTIONS
 .TP 10
+.B "--binary"
+Bgzip will attempt to ensure BGZF blocks end on a newline when the
+input is a text file.  The exception to this is where a single line is
+larger than a BGZF block (64Kb).  This can aid tools that use the
+index to perform random access on the compressed stream, as the start
+of a block is likely to also be the start of a text record.
+
+This option processes text files as if they were binary content,
+ignoring the location of newlines.  This also restores the behaviour
+for text files to bgzip version 1.15 and earlier.
+.TP
 .BI "-b, --offset " INT
 Decompress to standard output from virtual file position (0-based uncompressed
 offset).
@@ -92,7 +103,8 @@ Use \fB--force\fR twice to do both without asking.
 .TP
 .B "-g, --rebgzip"
 Try to use an existing index to create a compressed file with matching
-block offsets.
+block offsets.  The index must be specified using the \fB-I
+\fIfile.gzi\fR option.
 Note that this assumes that the same compression library and level are in use
 as when making the original file.
 Don't use it unless you know what you're doing.
@@ -181,6 +193,5 @@ The BGZF library was originally implemented by Bob Handsaker and modified
 by Heng Li for remote file access and in-memory caching.
 
 .SH SEE ALSO
-.PP
-.BR gzip (1),
-.BR tabix (1)
+.IR gzip (1),
+.IR tabix (1)
diff --git a/bgzip.c b/bgzip.c
index bd0374811..589f79f66 100644
--- a/bgzip.c
+++ b/bgzip.c
@@ -1,7 +1,7 @@
 /* bgzip.c -- Block compression/decompression utility.
 
    Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
-   Copyright (C) 2010, 2013-2019, 2021 Genome Research Ltd.
+   Copyright (C) 2010, 2013-2019, 2021-2022 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -36,13 +36,14 @@
 #include <inttypes.h>
 #include "htslib/bgzf.h"
 #include "htslib/hts.h"
+#include "htslib/hfile.h"
 
 #ifdef _WIN32
 #  define WIN32_LEAN_AND_MEAN
 #  include <windows.h>
 #endif
 
-static const int WINDOW_SIZE = 64 * 1024;
+static const int WINDOW_SIZE = BGZF_BLOCK_SIZE;
 
 static void error(const char *format, ...)
 {
@@ -121,15 +122,16 @@ static int bgzip_main_usage(FILE *fp, int status)
     fprintf(fp, "   -r, --reindex              (re)index compressed file\n");
     fprintf(fp, "   -s, --size INT             decompress INT bytes (uncompressed size)\n");
     fprintf(fp, "   -t, --test                 test integrity of compressed file\n");
+    fprintf(fp, "       --binary               Don't align blocks with text lines\n");
     fprintf(fp, "   -@, --threads INT          number of compression threads to use [1]\n");
     return status;
 }
 
 int main(int argc, char **argv)
 {
-    int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep;
+    int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep, binary;
     BGZF *fp;
-    void *buffer;
+    char *buffer;
     long start, end, size;
     char *index_fname = NULL;
     int threads = 1;
@@ -151,10 +153,11 @@ int main(int argc, char **argv)
         {"test", no_argument, NULL, 't'},
         {"version", no_argument, NULL, 1},
         {"keep", no_argument, NULL, 'k'},
+        {"binary", no_argument, NULL, 2},
         {NULL, 0, NULL, 0}
     };
 
-    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0;
+    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; binary = 0;
     while((c  = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){
         switch(c){
         case 'd': compress = 0; break;
@@ -173,8 +176,9 @@ int main(int argc, char **argv)
         case 1:
             printf(
 "bgzip (htslib) %s\n"
-"Copyright (C) 2022 Genome Research Ltd.\n", hts_version());
+"Copyright (C) 2023 Genome Research Ltd.\n", hts_version());
             return EXIT_SUCCESS;
+        case  2:  binary = 1; break;
         case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS);
         case '?': return bgzip_main_usage(stderr, EXIT_FAILURE);
         }
@@ -185,7 +189,7 @@ int main(int argc, char **argv)
         return 1;
     }
     if (compress == 1) {
-        int f_src = fileno(stdin);
+        hFILE* f_src = NULL;
         char out_mode[3] = "w\0";
         char out_mode_exclusive[4] = "wx\0";
 
@@ -198,13 +202,13 @@ int main(int argc, char **argv)
             out_mode_exclusive[2] = compress_level + '0';
         }
 
+        if (!(f_src = hopen(argc > optind ? argv[optind] : "-", "r"))) {
+            fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+            return 1;
+        }
+
         if ( argc>optind )
         {
-            if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
-                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
-                return 1;
-            }
-
             if (pstdout)
                 fp = bgzf_open("-", out_mode);
             else
@@ -241,7 +245,7 @@ int main(int argc, char **argv)
 
         if ( rebgzip && !index_fname )
         {
-            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
+            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout.  See -I option.\n");
             return 1;
         }
 
@@ -250,18 +254,103 @@ int main(int argc, char **argv)
             bgzf_mt(fp, threads, 256);
 
         buffer = malloc(WINDOW_SIZE);
-#ifdef _WIN32
-        _setmode(f_src, O_BINARY);
-#endif
+        if (!buffer)
+            return 1;
         if (rebgzip){
             if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
 
-            while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
+            while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0)
                 if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
         }
         else {
-            while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
-                if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
+            htsFormat fmt;
+            int textual = 0;
+            if (!binary
+                && hts_detect_format(f_src, &fmt) == 0
+                && fmt.compression == no_compression) {
+                switch(fmt.format) {
+                case text_format:
+                case sam:
+                case vcf:
+                case bed:
+                case fasta_format:
+                case fastq_format:
+                case fai_format:
+                case fqi_format:
+                    textual = 1;
+                    break;
+                default: break; // silence clang warnings
+                }
+            }
+
+            if (binary || !textual) {
+                // Binary data, either detected or explicit
+                while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0)
+                    if (bgzf_write(fp, buffer, c) < 0)
+                        error("Could not write %d bytes: Error %d\n",
+                              c, fp->errcode);
+            } else {
+                /* Text mode, try a flush after a newline */
+                int in_header = 1, n = 0, long_line = 0;
+                while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) {
+                    int c2 = c+n;
+                    int flush = 0;
+                    if (in_header &&
+                        (long_line || buffer[0] == '@' || buffer[0] == '#')) {
+                        // Scan forward to find the last header line.
+                        int last_start = 0;
+                        n = 0;
+                        while (n < c2) {
+                            if (buffer[n++] != '\n')
+                                continue;
+
+                            last_start = n;
+                            if (n < c2 &&
+                                !(buffer[n] == '@' || buffer[n] == '#')) {
+                                in_header = 0;
+                                break;
+                            }
+                        }
+                        if (!last_start) {
+                            n = c2;
+                            long_line = 1;
+                        } else {
+                            n = last_start;
+                            flush = 1;
+                            long_line = 0;
+                        }
+                    } else {
+                        // Scan backwards to find the last newline.
+                        n += c; // c read plus previous n overflow
+                        while (--n >= 0 && ((char *)buffer)[n] != '\n')
+                            ;
+
+                        if (n >= 0) {
+                            flush = 1;
+                            n++;
+                        } else {
+                            n = c2;
+                        }
+                    }
+
+                    // Pos n is either at the end of the buffer with flush==0,
+                    // or the first byte after a newline and a flush point.
+                    if (bgzf_write(fp, buffer, n) < 0)
+                        error("Could not write %d bytes: Error %d\n",
+                              n, fp->errcode);
+                    if (flush)
+                        if (bgzf_flush_try(fp, 65536) < 0) // force
+                            return -1;
+
+                    memmove(buffer, buffer+n, c2-n);
+                    n = c2-n;
+                }
+
+                // Trailing data.
+                if (bgzf_write(fp, buffer, n) < 0)
+                    error("Could not write %d bytes: Error %d\n",
+                          n, fp->errcode);
+            }
         }
         if ( index )
         {
@@ -270,13 +359,16 @@ int main(int argc, char **argv)
                     error("Could not write index to '%s'\n", index_fname);
             } else {
                 if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0)
-                    error("Could not write index to '%s.gz.gzi'", argv[optind]);
+                    error("Could not write index to '%s.gz.gzi'\n",
+                          argv[optind]);
             }
         }
-        if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
+        if (bgzf_close(fp) < 0)
+            error("Output close failed: Error %d\n", fp->errcode);
+        if (hclose(f_src) < 0)
+            error("Input close failed\n");
         if (argc > optind && !pstdout && !keep) unlink(argv[optind]);
         free(buffer);
-        close(f_src);
         return 0;
     }
     else if ( reindex )
diff --git a/config.mk.in b/config.mk.in
index 82af49850..7341a170d 100644
--- a/config.mk.in
+++ b/config.mk.in
@@ -1,6 +1,6 @@
 #  Optional configure Makefile overrides for htslib.
 #
-#    Copyright (C) 2015-2017, 2019 Genome Research Ltd.
+#    Copyright (C) 2015-2017, 2019, 2023 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -43,6 +43,7 @@ RANLIB = @RANLIB@
 CPPFLAGS = @CPPFLAGS@
 CFLAGS   = @CFLAGS@
 LDFLAGS  = @LDFLAGS@
+VERSION_SCRIPT_LDFLAGS = @VERSION_SCRIPT_LDFLAGS@
 LIBS     = @LIBS@
 
 PLATFORM   = @PLATFORM@
diff --git a/configure.ac b/configure.ac
index 7d40948a6..98b0a44c7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,6 @@
 # Configure script for htslib, a C library for high-throughput sequencing data.
 #
-#    Copyright (C) 2015-2022 Genome Research Ltd.
+#    Copyright (C) 2015-2023 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -155,6 +155,11 @@ static_LDFLAGS=$LDFLAGS
 static_LIBS='-lpthread -lz -lm'
 private_LIBS=$LDFLAGS
 
+AC_ARG_ENABLE([versioned-symbols],
+  [AS_HELP_STRING([--disable-versioned-symbols],
+                  [disable versioned symbols in shared library])],
+  [], [enable_versioned_symbols=yes])
+
 AC_ARG_ENABLE([bz2],
   [AS_HELP_STRING([--disable-bz2],
                   [omit support for BZ2-compressed CRAM files])],
@@ -249,6 +254,24 @@ esac
 AC_MSG_RESULT([$host_result])
 AC_SUBST([PLATFORM])
 
+dnl Check for versioned symbol support
+dnl Only try for .so shared libraries as other types won't work
+AS_IF([test x"$PLATFORM" = xdefault && test x"$enable_versioned_symbols" = xyes],
+  [AC_CACHE_CHECK([whether the linker supports versioned symbols],
+    [hts_cv_have_versioned_symbols], [
+      save_LDFLAGS=$LDFLAGS
+      LDFLAGS="-Wl,-version-script,$srcdir/htslib.map $LDFLAGS"
+      AC_LINK_IFELSE([AC_LANG_PROGRAM()],
+                     [hts_cv_have_versioned_symbols=yes],
+                     [hts_cv_have_versioned_symbols=no])
+      LDFLAGS=$save_LDFLAGS
+    ])
+   AS_IF([test "x$hts_cv_have_versioned_symbols" = xyes],[
+     VERSION_SCRIPT_LDFLAGS='-Wl,-version-script,$(srcprefix)htslib.map'
+     AC_SUBST([VERSION_SCRIPT_LDFLAGS])
+   ])
+])
+
 dnl Try to get more control over which symbols are exported in the shared
 dnl library.
 HTS_HIDE_DYNAMIC_SYMBOLS
diff --git a/cram/README b/cram/README
new file mode 100644
index 000000000..135438227
--- /dev/null
+++ b/cram/README
@@ -0,0 +1,214 @@
+CRAM encoding internals
+=======================
+
+A quick summary of functions involved.
+
+The encoder works by accumulating a bunch of BAM records (via the
+cram_put_bam_seq function), and at a certain point (eg counter of
+records, or switching reference) the array of BAM records it turned
+into a container, which in turn creates slices, holding CRAM
+data-series in blocks.  The function that turns an array of BAM
+objects into the container is below.
+
+cram_encode_container func:
+    Validate references MD5 against header, unless no_ref mode
+    If embed_ref <= 1, fetch ref
+        Switch to embed_ref=2 if failed
+
+    Foreach slice:
+        If embed_ref == 2
+	    call cram_generate_reference
+	        if failed switch to no_ref mode
+	Foreach sequence
+	    call process_one_read to append BAM onto each data series (DS)
+	        call cram_stats_add for each DS to gather metrics
+		call cram_encode_aux
+
+    # We now have cram DS, per slice
+    call cram_encoder_init, per DS (based on cram_stats_add data)
+
+    Foreach slice:
+        call cram_encode_slice to turn DS to blocks
+	    call cram_compess_slice
+
+    call cram_encode_compression_header
+
+Threading
+---------
+
+CRAM can be multi-threaded, but this brings complications.
+
+The above function is the main CPU user, so it is this bit which can
+be executed in parallel from multiple threads.  To understand this we
+need to now look at how the primary loop works when writing a CRAM:
+
+Encoding main thread:
+    repeatedly calls cram_put_bam_seq
+        calls cram_new_container on first time through to initialise
+	calls cram_next_container when current is full or we need to flush
+	    calls cram_flush_container_mt to flush last container
+        pushes BAM object onto current container
+
+If non-threaded, cram_flush_container_mt does:
+    call cram_flush_container
+        call cram_encode_container to go from BAM to CRAM data-series
+ 	call cram_flush_container2 (writes it out)
+
+If threaded, cram_flush_container_mt does:
+    Main: Dispatch cram_flush_thread job
+        Thread: call cram_encode_container to go from BAM to CRAM data-series
+    Main: Call cram_flush_result to drain queue of encoded containers
+        Main: Call cram_flush_container2 (writes it out);
+
+
+
+Decisions on when to create new containers, detection of sorted vs unsorted,
+switching to multi-seq mode, etc occur at the main thread in
+cram_put_bam_seq.
+
+We can change our mind on container parameters at any point up until
+the cram_encode_container call.  At that point these parameters get
+baked into a container compression header and all data-series
+generated need to be in sync with the parameters.
+
+It is possible that some parameter changes can get detected while
+encoding the container, as it is there where we fetch references.  Eg
+the need to enable embedded reference or switch to non-ref mode.
+
+While encoding a container, we can change the parameters for *this*
+container, and we can also set the default parameter for subsequent
+new parameters via the global cram fd to avoid spamming attempts to
+load a reference which doesn't exist, but we cannot change other
+containers that are being processed in parallel.  They'll fend for
+themselves.
+
+References
+----------
+
+To avoid spamming the reference servers, there is a shared cache of
+references being currently used by all the worker threads (leading to
+confusing terminology of reference-counting of references).  So each
+container fetches its section of reference, but the memory for that is
+handled via its own layer.
+
+The shared references and ref meta-data is held in cram_fd -> refs (a
+refs_t pointer):
+
+    // References structure.
+    struct refs_t {
+        string_alloc_t *pool;  // String pool for holding filenames and SN vals
+    
+        khash_t(refs) *h_meta; // ref_entry*, index by name
+        ref_entry **ref_id;    // ref_entry*, index by ID
+        int nref;              // number of ref_entry
+    
+        char *fn;              // current file opened
+        BGZF *fp;              // and the hFILE* to go with it.
+    
+        int count;             // how many cram_fd sharing this refs struct
+    
+        pthread_mutex_t lock;  // Mutex for multi-threaded updating
+        ref_entry *last;       // Last queried sequence
+        int last_id;           // Used in cram_ref_decr_locked to delay free
+    };
+
+Within this, ref_entry is the per-reference information:
+
+    typedef struct ref_entry {
+        char *name;
+        char *fn;
+        int64_t length;
+        int64_t offset;
+        int bases_per_line;
+        int line_length;
+        int64_t count;     // for shared references so we know to dealloc seq
+        char *seq;
+        mFILE *mf;
+        int is_md5;        // Reference comes from a raw seq found by MD5
+        int validated_md5;
+    } ref_entry;
+
+Sharing of references to track use between threads is via
+cram_ref_incr* and cram_ref_decr* (which locked and unlocked
+variants).  We free a reference when the usage count hits zero.  To
+avoid spamming discard and reload in single-thread creation of a
+pos-sorted CRAM, we keep track of the last reference in cram_fd and
+delay discard by one loop iteration.
+
+There are complexities here around whether the references come from a
+single ref.fa file, are from a local MD5sum cache with one file per
+reference (mmapped), or whether they're fetched from some remote
+REF_PATH query such as the EBI.  (This later case typically downloads
+to a local md5 based ref-cache first and mmaps from there.)
+
+The refs struct start off by being populated from the SAM header.  We
+have M5 tag and name known, maybe a filename, but length is 0 and seq
+is NULL.  This is done by cram_load_reference:
+
+cram_load_reference (cram_fd, filename):
+    if filename non-NULL
+        call refs_load_fai
+	    Populates ref_entry with filename, name, length, line-len, etc
+    	sanitise_SQ_lines
+    If no refs loaded
+        call refs_from_header
+	    populates ref_entry with name.
+	    Sets length=0 as marker for not-yet-loaded
+
+The main interface used from the code is cram_get_ref().  It takes a
+reference ID, start and end coordinate and returns a pointer to the
+relevant sub-sequence.
+
+cram_get_ref:
+    r = fd->refs->ref_id[id];    // current ref
+    call cram_populate_ref if stored length is 0 (ie ref.fa set)
+        search REF_PATH / REF_CACHE
+	call bgzf_open if local_path
+	call open_path_mfile otherwise
+	copy to local REF_CACHE if required (eg remote fetch)
+
+    If start = 1 and end = ref-length
+       If ref seq unknown
+           call cram_ref_load to load entire ref and use that
+
+    If ref seq now known, return it
+
+    // Otherwise known via .fai or we've errored by now.
+    call load_ref_portion to return a sub-seq from index fasta
+
+The encoder asks for the entire reference rather than a small portion
+of it as we're usually encoding a large amount.  The decoder may be
+dealing with small range queries, so it only asks for the relevant
+sub-section of reference as specified in the cram slice headers.
+
+
+TODO
+====
+
+- Multi-ref mode is enabled when we have too many small containers in
+  a row.
+
+  Instead of firing off new containers when we switch reference, we
+  could always make a new container after N records, separating off
+  M <= N to make the container such that all M are the same reference,
+  and shuffling any remaining N-M down as the start of the next.
+
+  This means we can detect how many new containers we would create,
+  and enable multi-ref mode straight away rather than keeping a recent
+  history of how many small containers we've emitted.
+
+- The cache of references currently being used is a better place to
+  track the global embed-ref and non-ref logic.  Better than cram_fd.
+  Cram_fd is a one-way change, as once we enable non-ref we'll stick
+  with it.
+
+  However if it was per-ref in the ref-cache then we'd probe and try
+  each reference once, and then all new containers for that ref would
+  honour the per-ref parameters.  So a single missing reference in the
+  middle of a large file wouldn't change behaviour for all subsequence
+  references.
+
+  Optionally we could still do meta-analysis on how many references
+  are failing, and switch the global cram_fd params to avoid repeated
+  testing of reference availability if it's becoming obvious that none
+  of them are known.
diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c
index 33e1b5bf8..21240c141 100644
--- a/cram/cram_codecs.c
+++ b/cram/cram_codecs.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2021 Genome Research Ltd.
+Copyright (c) 2012-2021,2023 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -447,6 +447,11 @@ cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
     return cram_get_block_by_id(slice, c->u.external.content_id);
 }
 
+int cram_external_describe(cram_codec *c, kstring_t *ks) {
+    return ksprintf(ks, "EXTERNAL(id=%d)",
+                    c->u.external.content_id) < 0 ? -1 : 0;
+}
+
 cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
                                       char *data, int size,
                                       enum cram_encoding codec,
@@ -494,6 +499,7 @@ cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
     c->free   = cram_external_decode_free;
     c->size   = cram_external_decode_size;
     c->get_block = cram_external_get_block;
+    c->describe = cram_external_describe;
 
     c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL);
 
@@ -739,6 +745,14 @@ cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
     return cram_get_block_by_id(slice, c->u.varint.content_id);
 }
 
+int cram_varint_describe(cram_codec *c, kstring_t *ks) {
+    return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)",
+                    c->u.varint.content_id,
+                    c->u.varint.offset,
+                    c->u.varint.type)
+        < 0 ? -1 : 0;
+}
+
 cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
                                     char *data, int size,
                                     enum cram_encoding codec,
@@ -774,6 +788,7 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
     c->free   = cram_varint_decode_free;
     c->size   = cram_varint_decode_size;
     c->get_block = cram_varint_get_block;
+    c->describe = cram_varint_describe;
 
     c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL);
     c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, NULL);
@@ -942,6 +957,11 @@ int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
     return 0;
 }
 
+int cram_const_describe(cram_codec *c, kstring_t *ks) {
+    return ksprintf(ks, "CONST(val=%"PRId64")",
+                    c->u.xconst.val) < 0 ? -1 : 0;
+}
+
 cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
@@ -963,6 +983,7 @@ cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
     c->free   = cram_const_decode_free;
     c->size   = cram_const_decode_size;
     c->get_block = NULL;
+    c->describe = cram_const_describe;
 
     c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL);
 
@@ -1091,6 +1112,12 @@ void cram_beta_decode_free(cram_codec *c) {
         free(c);
 }
 
+int cram_beta_describe(cram_codec *c, kstring_t *ks) {
+    return ksprintf(ks, "BETA(offset=%d, nbits=%d)",
+                    c->u.beta.offset, c->u.beta.nbits)
+        < 0 ? -1 : 0;
+}
+
 cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
                                   char *data, int size,
                                   enum cram_encoding codec,
@@ -1115,6 +1142,7 @@ cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
         return NULL;
     }
     c->free   = cram_beta_decode_free;
+    c->describe = cram_beta_describe;
 
     c->u.beta.nbits = -1;
     c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL);
@@ -1405,6 +1433,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
     c->free = cram_xpack_decode_free;
     c->size = cram_xpack_decode_size;
     c->get_block = cram_xpack_get_block;
+    c->describe = NULL;
 
     c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
     c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
@@ -1735,6 +1764,7 @@ cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
     c->free = cram_xdelta_decode_free;
     c->size = cram_xdelta_decode_size;
     c->get_block = cram_xdelta_get_block;
+    c->describe = NULL;
 
     c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
     c->u.xdelta.last = 0;
@@ -2135,6 +2165,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
     c->free   = cram_xrle_decode_free;
     c->size   = cram_xrle_decode_size;
     c->get_block = cram_xrle_get_block;
+    c->describe = NULL;
     c->u.xrle.cur_len = 0;
     c->u.xrle.cur_lit = -1;
 
@@ -2423,6 +2454,13 @@ void cram_subexp_decode_free(cram_codec *c) {
         free(c);
 }
 
+int cram_subexp_describe(cram_codec *c, kstring_t *ks) {
+    return ksprintf(ks, "SUBEXP(offset=%d,k=%d)",
+                    c->u.subexp.offset,
+                    c->u.subexp.k)
+        < 0 ? -1 : 0;
+}
+
 cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
                                     char *data, int size,
                                     enum cram_encoding codec,
@@ -2442,6 +2480,7 @@ cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
     c->codec  = E_SUBEXP;
     c->decode = cram_subexp_decode;
     c->free   = cram_subexp_decode_free;
+    c->describe = cram_subexp_describe;
     c->u.subexp.k = -1;
 
     c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL);
@@ -2489,6 +2528,11 @@ void cram_gamma_decode_free(cram_codec *c) {
         free(c);
 }
 
+int cram_gamma_describe(cram_codec *c, kstring_t *ks) {
+    return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset)
+        < 0 ? -1 : 0;
+}
+
 cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
@@ -2511,6 +2555,7 @@ cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
     c->codec  = E_GAMMA;
     c->decode = cram_gamma_decode;
     c->free   = cram_gamma_decode_free;
+    c->describe = cram_gamma_describe;
 
     c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL);
 
@@ -2703,6 +2748,22 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
     return 0;
 }
 
+int cram_huffman_describe(cram_codec *c, kstring_t *ks) {
+    int r = 0, n;
+    r |= ksprintf(ks, "HUFFMAN(codes={") < 0;
+    for (n = 0; n < c->u.huffman.ncodes; n++) {
+        r |= ksprintf(ks, "%s%"PRId64, n?",":"",
+                      c->u.huffman.codes[n].symbol);
+    }
+    r |= ksprintf(ks, "},lengths={") < 0;
+    for (n = 0; n < c->u.huffman.ncodes; n++) {
+        r |= ksprintf(ks, "%s%d", n?",":"",
+                      c->u.huffman.codes[n].len);
+    }
+    r |= ksprintf(ks, "})") < 0;
+    return r;
+}
+
 /*
  * Initialises a huffman decoder from an encoding data stream.
  */
@@ -2865,6 +2926,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
     } else {
         return NULL;
     }
+    h->describe = cram_huffman_describe;
 
     return (cram_codec *)h;
 
@@ -3293,6 +3355,22 @@ void cram_byte_array_len_decode_free(cram_codec *c) {
     free(c);
 }
 
+int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) {
+    int r = 0;
+    r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0;
+    cram_byte_array_len_decoder *l = &c->u.byte_array_len;
+    r |=  l->len_codec->describe
+        ? l->len_codec->describe(l->len_codec, ks)
+        : (ksprintf(ks, "?")<0);
+    r |= ksprintf(ks, "},val_codec={") < 0;
+    r |=  l->val_codec->describe
+        ? l->val_codec->describe(l->val_codec, ks)
+        : (ksprintf(ks, "?")<0);
+    r |= ksprintf(ks, "}") < 0;
+
+    return r;
+}
+
 cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
                                             char *data, int size,
                                             enum cram_encoding codec,
@@ -3308,6 +3386,7 @@ cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
     c->codec  = E_BYTE_ARRAY_LEN;
     c->decode = cram_byte_array_len_decode;
     c->free   = cram_byte_array_len_decode_free;
+    c->describe = cram_byte_array_len_describe;
     c->u.byte_array_len.len_codec = NULL;
     c->u.byte_array_len.val_codec = NULL;
 
@@ -3532,6 +3611,13 @@ void cram_byte_array_stop_decode_free(cram_codec *c) {
     free(c);
 }
 
+int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) {
+    return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)",
+                    c->u.byte_array_stop.stop,
+                    c->u.byte_array_stop.content_id)
+        < 0 ? -1 : 0;
+}
+
 cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
                                              char *data, int size,
                                              enum cram_encoding codec,
@@ -3561,6 +3647,7 @@ cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
         return NULL;
     }
     c->free   = cram_byte_array_stop_decode_free;
+    c->describe = cram_byte_array_stop_describe;
 
     c->u.byte_array_stop.stop = *cp++;
     if (CRAM_MAJOR_VERS(version) == 1) {
@@ -4032,3 +4119,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
 
     return 0;
 }
+
+int cram_codec_describe(cram_codec *c, kstring_t *ks) {
+    if (c && c->describe)
+        return c->describe(c, ks);
+    else
+        return ksprintf(ks, "?");
+}
diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h
index 56b065255..d93d9955c 100644
--- a/cram/cram_codecs.h
+++ b/cram/cram_codecs.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2015, 2018, 2020 Genome Research Ltd.
+Copyright (c) 2012-2015, 2018, 2020, 2023 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -160,7 +160,7 @@ typedef struct {
 /*
  * A generic codec structure.
  */
-typedef struct cram_codec {
+struct cram_codec {
     enum cram_encoding codec;
     cram_block *out;
     varint_vec *vv;
@@ -175,6 +175,7 @@ typedef struct cram_codec {
     int (*size)(cram_slice *slice, struct cram_codec *codec);
     int (*flush)(struct cram_codec *codec);
     cram_block *(*get_block)(cram_slice *slice, struct cram_codec *codec);
+    int (*describe)(struct cram_codec *codec, kstring_t *ks);
 
     union {
         cram_huffman_decoder         huffman;
@@ -201,7 +202,7 @@ typedef struct cram_codec {
         cram_const_codec             e_xconst;
         cram_varint_decoder          e_varint;
     } u;
-} cram_codec;
+};
 
 const char *cram_encoding2str(enum cram_encoding t);
 
diff --git a/cram/cram_decode.c b/cram/cram_decode.c
index 1f8d60f12..73f567106 100644
--- a/cram/cram_decode.c
+++ b/cram/cram_decode.c
@@ -2438,8 +2438,9 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
             }
         }
 
-        if ((!s->ref && s->hdr->ref_base_id < 0)
-            || memcmp(digest, s->hdr->md5, 16) != 0) {
+        if (!c->comp_hdr->no_ref &&
+            ((!s->ref && s->hdr->ref_base_id < 0)
+             || memcmp(digest, s->hdr->md5, 16) != 0)) {
             char M[33];
             const char *rname = sam_hdr_tid2name(sh, ref_id);
             if (!rname) rname="?"; // cannot happen normally
diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index 1ba1988f4..5b56aedd5 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2020, 2022 Genome Research Ltd.
+Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -61,7 +61,7 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
 static int process_one_read(cram_fd *fd, cram_container *c,
                             cram_slice *s, cram_record *cr,
                             bam_seq_t *b, int rnum, kstring_t *MD,
-                            int embed_ref);
+                            int embed_ref, int no_ref);
 
 /*
  * Returns index of val into key.
@@ -70,7 +70,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 static int sub_idx(char *key, char val) {
     int i;
 
-    for (i = 0; *key && *key++ != val; i++);
+    for (i = 0; i < 4 && *key++ != val; i++);
     return i;
 }
 
@@ -87,6 +87,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
     cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
     int i, mc, r = 0;
 
+    int no_ref = c->no_ref;
+
     if (!cb || !map)
         return NULL;
 
@@ -162,7 +164,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
                 kh_val(h->preservation_map, k).i = h->qs_seq_orient;
             }
 
-            if (fd->no_ref || embed_ref>0) {
+            if (no_ref || embed_ref>0) {
                 // Reference Required == No
                 k = kh_put(map, h->preservation_map, "RR", &r);
                 if (-1 == r) return NULL;
@@ -203,31 +205,38 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
 
             case CRAM_KEY('S','M'): {
                 char smat[5], *mp = smat;
+                // Output format is for order ACGTN (minus ref base)
+                // to store the code value 0-3 for each symbol.
+                //
+                // Note this is different to storing the symbols in order
+                // that the codes occur from 0-3, which is what we used to
+                // do.  (It didn't matter as we always had a fixed table in
+                // the order.)
                 *mp++ =
-                    (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
-                    (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
-                    (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
-                    (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
+                    (sub_idx(h->substitution_matrix[0], 'C') << 6) |
+                    (sub_idx(h->substitution_matrix[0], 'G') << 4) |
+                    (sub_idx(h->substitution_matrix[0], 'T') << 2) |
+                    (sub_idx(h->substitution_matrix[0], 'N') << 0);
                 *mp++ =
-                    (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
-                    (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
-                    (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
-                    (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
+                    (sub_idx(h->substitution_matrix[1], 'A') << 6) |
+                    (sub_idx(h->substitution_matrix[1], 'G') << 4) |
+                    (sub_idx(h->substitution_matrix[1], 'T') << 2) |
+                    (sub_idx(h->substitution_matrix[1], 'N') << 0);
                 *mp++ =
-                    (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
-                    (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
-                    (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
-                    (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
+                    (sub_idx(h->substitution_matrix[2], 'A') << 6) |
+                    (sub_idx(h->substitution_matrix[2], 'C') << 4) |
+                    (sub_idx(h->substitution_matrix[2], 'T') << 2) |
+                    (sub_idx(h->substitution_matrix[2], 'N') << 0);
                 *mp++ =
-                    (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
-                    (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
-                    (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
-                    (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
+                    (sub_idx(h->substitution_matrix[3], 'A') << 6) |
+                    (sub_idx(h->substitution_matrix[3], 'C') << 4) |
+                    (sub_idx(h->substitution_matrix[3], 'G') << 2) |
+                    (sub_idx(h->substitution_matrix[3], 'N') << 0);
                 *mp++ =
-                    (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
-                    (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
-                    (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
-                    (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
+                    (sub_idx(h->substitution_matrix[4], 'A') << 6) |
+                    (sub_idx(h->substitution_matrix[4], 'C') << 4) |
+                    (sub_idx(h->substitution_matrix[4], 'G') << 2) |
+                    (sub_idx(h->substitution_matrix[4], 'T') << 0);
                 BLOCK_APPEND(map, smat, 5);
                 break;
             }
@@ -1688,6 +1697,54 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
     return -1;
 }
 
+// Check if the SQ M5 tag matches the reference we've loaded.
+static int validate_md5(cram_fd *fd, int ref_id) {
+    if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
+        return 0;
+
+    // Have we already checked this ref?
+    if (fd->refs->ref_id[ref_id]->validated_md5)
+        return 0;
+
+    // Check if we have the MD5 known.
+    // We should, but maybe we're using embedded references?
+    sam_hrecs_t *hrecs = fd->header->hrecs;
+    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
+                                                 hrecs->ref[ref_id].name);
+    if (!ty)
+        return 0;
+
+    sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
+    if (!m5tag)
+        return 0;
+
+    // It's known, so compute md5 on the loaded reference sequence.
+    char *ref = fd->refs->ref_id[ref_id]->seq;
+    int64_t len = fd->refs->ref_id[ref_id]->length;
+    hts_md5_context *md5;
+    char unsigned buf[16];
+    char buf2[33];
+
+    if (!(md5 = hts_md5_init()))
+        return -1;
+    hts_md5_update(md5, ref, len);
+    hts_md5_final(buf, md5);
+    hts_md5_destroy(md5);
+    hts_md5_hex(buf2, buf);
+
+    // Compare it to header @SQ M5 tag
+    if (strcmp(m5tag->str+3, buf2)) {
+        hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
+                      hrecs->ref[ref_id].name);
+        hts_log_error("Please use the correct reference, or "
+                      "consider using embed_ref=2");
+        return -1;
+    }
+    fd->refs->ref_id[ref_id]->validated_md5 = 1;
+
+    return 0;
+}
+
 /*
  * Encodes all slices in a container into blocks.
  * Returns 0 on success
@@ -1698,7 +1755,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     cram_block_compression_hdr *h = c->comp_hdr;
     cram_block *c_hdr;
     int multi_ref = 0;
-    int r1, r2, sn, nref, embed_ref;
+    int r1, r2, sn, nref, embed_ref, no_ref;
     spare_bams *spares;
 
     if (CRAM_MAJOR_VERS(fd->version) == 1)
@@ -1707,22 +1764,17 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
 //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
 #define goto_err goto err
 
+ restart:
     /* Cache references up-front if we have unsorted access patterns */
     pthread_mutex_lock(&fd->ref_lock);
     nref = fd->refs->nref;
-    embed_ref = fd->embed_ref;
     pthread_mutex_unlock(&fd->ref_lock);
-
-    if (!fd->no_ref && c->refs_used) {
-        for (i = 0; i < nref; i++) {
-            if (c->refs_used[i])
-                cram_get_ref(fd, i, 1, 0);
-        }
-    }
+    embed_ref = c->embed_ref;
+    no_ref = c->no_ref;
 
     /* To create M5 strings */
     /* Fetch reference sequence */
-    if (!fd->no_ref) {
+    if (!no_ref) {
         if (!c->bams || !c->bams[0])
             goto_err;
         bam_seq_t *b = c->bams[0];
@@ -1730,7 +1782,20 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
         if (embed_ref <= 1) {
             char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
             if (!ref && bam_ref(b) >= 0) {
-                if (c->multi_seq || embed_ref == 0 || !c->pos_sorted) {
+                if (!c->pos_sorted) {
+                    // TODO: maybe also check fd->no_ref?
+                    hts_log_warning("Failed to load reference #%d",
+                                    bam_ref(b));
+                    hts_log_warning("Switching to non-ref mode");
+
+                    pthread_mutex_lock(&fd->ref_lock);
+                    c->embed_ref = fd->embed_ref = 0;
+                    c->no_ref = fd->no_ref = 1;
+                    pthread_mutex_unlock(&fd->ref_lock);
+                    goto restart;
+                }
+
+                if (c->multi_seq || embed_ref == 0) {
                     hts_log_error("Failed to load reference #%d", bam_ref(b));
                     return -1;
                 }
@@ -1741,9 +1806,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
                     hts_log_warning("NOTE: the CRAM file will be bigger than"
                                     " using an external reference");
                 pthread_mutex_lock(&fd->ref_lock);
-                embed_ref = fd->embed_ref = 2;
+                embed_ref = c->embed_ref = fd->embed_ref = 2;
                 pthread_mutex_unlock(&fd->ref_lock);
                 goto auto_ref;
+            } else if (ref) {
+                if (validate_md5(fd, c->ref_seq_id) < 0)
+                    goto_err;
             }
             if ((c->ref_id = bam_ref(b)) >= 0) {
                 c->ref_seq_id = c->ref_id;
@@ -1768,6 +1836,21 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
         c->ref_seq_id = c->ref_id;
     }
 
+    if (!no_ref && c->refs_used) {
+        for (i = 0; i < nref; i++) {
+            if (c->refs_used[i]) {
+                if (cram_get_ref(fd, i, 1, 0)) {
+                    if (validate_md5(fd, i) < 0)
+                        goto_err;
+                } else {
+                    hts_log_warning("Failed to find reference, "
+                                    "switching to non-ref mode");
+                    no_ref = c->no_ref = 1;
+                }
+            }
+        }
+    }
+
     /* Turn bams into cram_records and gather basic stats */
     for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
         cram_slice *s = c->slices[sn];
@@ -1791,8 +1874,27 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
         // Embed consensus / MD-generated ref
         if (embed_ref == 2) {
             if (cram_generate_reference(c, s, r1) < 0) {
-                hts_log_error("Failed to build reference");
-                return -1;
+                // Should this be a permanent thing via fd->no_ref?
+                // Doing so means we cannot easily switch back again should
+                // things fix themselves later on.  This is likely not a
+                // concern though as failure to generate a reference implies
+                // unsorted data which is rarely recovered from.
+
+                // Only if sn == 0.  We're hosed if we're on the 2nd slice and
+                // the first worked, as no-ref is a container global param.
+                if (sn > 0) {
+                    hts_log_error("Failed to build reference, "
+                                  "switching to non-ref mode");
+                    return -1;
+                } else {
+                    hts_log_warning("Failed to build reference, "
+                                    "switching to non-ref mode");
+                }
+                pthread_mutex_lock(&fd->ref_lock);
+                c->embed_ref = fd->embed_ref = 0;
+                c->no_ref = fd->no_ref = 1;
+                pthread_mutex_unlock(&fd->ref_lock);
+                goto restart;
             }
         }
 
@@ -1803,7 +1905,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
             bam_seq_t *b = c->bams[r1];
 
             /* If multi-ref we need to cope with changing reference per seq */
-            if (c->multi_seq && !fd->no_ref) {
+            if (c->multi_seq && !no_ref) {
                 if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
                     if (c->ref_seq_id >= 0)
                         cram_ref_decr(fd->refs, c->ref_seq_id);
@@ -1813,6 +1915,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
                         free(MD.s);
                         return -1;
                     }
+                    if (validate_md5(fd, bam_ref(b)) < 0)
+                        return -1;
 
                     c->ref_seq_id = bam_ref(b); // overwritten later by -2
                     if (!fd->refs->ref_id[c->ref_seq_id]->seq)
@@ -1823,7 +1927,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
                 }
             }
 
-            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref) != 0) {
+            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
+                                 no_ref) != 0) {
                 free(MD.s);
                 return -1;
             }
@@ -1890,7 +1995,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
         }
     }
 
-    if (c->multi_seq && !fd->no_ref) {
+    if (c->multi_seq && !no_ref) {
         if (c->ref_seq_id >= 0)
             cram_ref_decr(fd->refs, c->ref_seq_id);
     }
@@ -1922,12 +2027,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
 
 
     /* Compute MD5s */
+    no_ref = c->no_ref;
     int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
+
     for (i = 0; i < c->curr_slice; i++) {
         cram_slice *s = c->slices[i];
 
         if (CRAM_MAJOR_VERS(fd->version) != 1) {
-            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
+            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
                 hts_md5_context *md5 = hts_md5_init();
                 if (!md5)
                     return -1;
@@ -2304,7 +2411,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     }
 
     /* Cache references up-front if we have unsorted access patterns */
-    if (!fd->no_ref && c->refs_used) {
+    if (!no_ref && c->refs_used) {
         for (i = 0; i < fd->refs->nref; i++) {
             if (c->refs_used[i])
                 cram_ref_decr(fd->refs, i);
@@ -2542,7 +2649,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
                                       cram_slice *s, cram_record *cr,
                                       int verbatim_NM, int verbatim_MD,
                                       int NM, kstring_t *MD, int cf_tag,
-                                      int *err) {
+                                      int no_ref, int *err) {
     char *aux, *orig;
     sam_hrec_rg_t *brg = NULL;
     int aux_size = bam_get_l_aux(b);
@@ -2594,7 +2701,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
 
         // MD:Z
         if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
-            if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
+            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
                 if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
                     while (*aux++);
                     if (CRAM_MAJOR_VERS(fd->version) >= 4)
@@ -2606,7 +2713,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
 
         // NM:i
         if (aux[0] == 'N' && aux[1] == 'M') {
-            if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
+            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
                 int NM_ = bam_aux2i((uint8_t *)aux+2);
                 if (NM_ == NM) {
                     switch(aux[2]) {
@@ -3040,7 +3147,12 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
                                          fd->slices_per_container);
         if (!c)
             return NULL;
+
+        pthread_mutex_lock(&fd->ref_lock);
+        c->no_ref = fd->no_ref;
+        c->embed_ref = fd->embed_ref;
         c->record_counter = fd->record_counter;
+        pthread_mutex_unlock(&fd->ref_lock);
         c->curr_ref = bam_ref(b);
     }
 
@@ -3087,7 +3199,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
 static int process_one_read(cram_fd *fd, cram_container *c,
                             cram_slice *s, cram_record *cr,
                             bam_seq_t *b, int rnum, kstring_t *MD,
-                            int embed_ref) {
+                            int embed_ref, int no_ref) {
     int i, fake_qual = -1, NM = 0;
     char *cp;
     char *ref, *seq, *qual;
@@ -3128,7 +3240,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 
     // Non reference based encoding means storing the bases verbatim as features, which in
     // turn means every base also has a quality already stored.
-    if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
+    if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
         cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
 
     if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
@@ -3206,7 +3318,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
                 //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
                 //      cig_len, &ref[apos], cig_len, &seq[spos]);
                 l = 0;
-                if (!fd->no_ref && cr->len) {
+                if (!no_ref && cr->len) {
                     int end = cig_len+apos < c->ref_end
                         ? cig_len : c->ref_end - apos;
                     char *sp = &seq[spos];
@@ -3301,7 +3413,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
                 }
 
                 if (l < cig_len && cr->len) {
-                    if (fd->no_ref) {
+                    if (no_ref) {
                         if (CRAM_MAJOR_VERS(fd->version) == 3) {
                             if (cram_add_bases(fd, c, s, cr, spos,
                                                cig_len-l, &seq[spos]))
@@ -3360,7 +3472,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
                 if (cram_add_insertion(c, s, cr, spos, cig_len,
                                        cr->len ? &seq[spos] : NULL))
                     return -1;
-                if (fd->no_ref && cr->len) {
+                if (no_ref && cr->len) {
                     for (l = 0; l < cig_len; l++, spos++) {
                         cram_add_quality(fd, c, s, cr, spos, qual[spos]);
                     }
@@ -3376,7 +3488,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
                                       fd->version))
                     return -1;
 
-                if (fd->no_ref &&
+                if (no_ref &&
                     !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
                     if (cr->len) {
                         for (l = 0; l < cig_len; l++, spos++) {
@@ -3412,7 +3524,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
             return -1;
         }
         fake_qual = spos;
-        cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end);
+        cr->aend = no_ref ? apos : MIN(apos, c->ref_end);
         if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
             goto block_err;
 
@@ -3435,7 +3547,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
     int err = 0;
     sam_hrec_rg_t *brg =
         cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
-                        cf_tag, &err);
+                        cf_tag, no_ref, &err);
     if (err)
         goto block_err;
 
@@ -3728,9 +3840,16 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
         if (!fd->ctr)
             return -1;
         fd->ctr->record_counter = fd->record_counter;
+
+        pthread_mutex_lock(&fd->ref_lock);
+        fd->ctr->no_ref = fd->no_ref;
+        fd->ctr->embed_ref = fd->embed_ref;
+        pthread_mutex_unlock(&fd->ref_lock);
     }
     c = fd->ctr;
 
+    int embed_ref = c->embed_ref;
+
     if (!c->slice || c->curr_rec == c->max_rec ||
         (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
         (c->s_num_bases >= fd->bases_per_slice)) {
@@ -3746,9 +3865,6 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
          * The multi_seq var here refers to our intention for the next slice.
          * This slice has already been encoded so we output as-is.
          */
-        pthread_mutex_lock(&fd->ref_lock);
-        int embed_ref = fd->embed_ref;
-        pthread_mutex_unlock(&fd->ref_lock);
         if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
             fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
             embed_ref<=0) {
@@ -3773,7 +3889,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
             if (NULL == (c = cram_next_container(fd, b))) {
                 if (fd->ctr) {
                     // prevent cram_close attempting to flush
-                    cram_free_container(fd->ctr);
+                    fd->ctr_mt = fd->ctr; // delay free when threading
                     fd->ctr = NULL;
                 }
                 return -1;
@@ -3796,6 +3912,27 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
             c->multi_seq = 1;
             c->pos_sorted = 0;
 
+            // Cram_next_container may end up flushing an existing one and
+            // triggering fd->embed_ref=2 if no reference is found.
+            // Embedded refs are incompatible with multi-seq, so we bail
+            // out and switch to no_ref in this scenario.  We do this
+            // within the container only, as multi_seq may be temporary
+            // and we switch back away from it again.
+            pthread_mutex_lock(&fd->ref_lock);
+            if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
+                hts_log_warning("Changing from embed_ref to no_ref mode");
+                // Should we update fd->embed_ref and no_ref here too?
+                // Doing so means if we go into multi-seq and back out
+                // again, eg due a cluster of tiny refs in the middle of
+                // much larger ones, then we bake in no-ref mode.
+                //
+                // However for unsorted data we're realistically not
+                // going to switch back.
+                c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
+                c->no_ref = fd->no_ref = 1;
+            }
+            pthread_mutex_unlock(&fd->ref_lock);
+
             if (!c->refs_used) {
                 pthread_mutex_lock(&fd->ref_lock);
                 c->refs_used = calloc(fd->refs->nref, sizeof(int));
@@ -3821,8 +3958,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
             } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
                 pthread_mutex_lock(&fd->ref_lock);
                 fd->unsorted = 1;
-                pthread_mutex_unlock(&fd->ref_lock);
                 fd->multi_seq = 1;
+                pthread_mutex_unlock(&fd->ref_lock);
             }
         }
 
diff --git a/cram/cram_external.c b/cram/cram_external.c
index e88ff838b..26ef3d7d3 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015, 2018-2020, 2022 Genome Research Ltd.
+Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
 #include <config.h>
+#include <stdint.h>
+
+#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
+#include <htscodecs/rANS_static4x16.h.h>
+#else
+#include "../htscodecs/htscodecs/rANS_static4x16.h"
+#endif
 
 #include "../htslib/hfile.h"
 #include "cram.h"
@@ -82,6 +89,14 @@ void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) {
     c->num_blocks = num_blocks;
 }
 
+int32_t cram_container_get_num_records(cram_container *c) {
+    return c->num_records;
+}
+
+int64_t cram_container_get_num_bases(cram_container *c) {
+    return c->num_bases;
+}
+
 
 /* Returns the landmarks[] array and the number of elements
  * in num_landmarks.
@@ -180,6 +195,294 @@ int cram_block_compression_hdr_decoder2encoder(cram_fd *fd,
     return 0;
 }
 
+typedef struct {
+    cram_block_compression_hdr *hdr;
+    cram_map *curr_map;
+    int idx;
+    int is_tag; // phase 2 using tag_encoding_map
+} cram_codec_iter;
+
+static void cram_codec_iter_init(cram_block_compression_hdr *hdr,
+                                 cram_codec_iter *iter) {
+    iter->hdr = hdr;
+    iter->curr_map = NULL;
+    iter->idx = 0;
+    iter->is_tag = 0;
+}
+
+// See enum cram_DS_ID in cram/cram_structs
+static int cram_ds_to_key(enum cram_DS_ID ds) {
+    switch(ds) {
+    case DS_RN: return 256*'R'+'N';
+    case DS_QS: return 256*'Q'+'S';
+    case DS_IN: return 256*'I'+'N';
+    case DS_SC: return 256*'S'+'C';
+    case DS_BF: return 256*'B'+'F';
+    case DS_CF: return 256*'C'+'F';
+    case DS_AP: return 256*'A'+'P';
+    case DS_RG: return 256*'R'+'G';
+    case DS_MQ: return 256*'M'+'Q';
+    case DS_NS: return 256*'N'+'S';
+    case DS_MF: return 256*'M'+'F';
+    case DS_TS: return 256*'T'+'S';
+    case DS_NP: return 256*'N'+'P';
+    case DS_NF: return 256*'N'+'F';
+    case DS_RL: return 256*'R'+'L';
+    case DS_FN: return 256*'F'+'N';
+    case DS_FC: return 256*'F'+'C';
+    case DS_FP: return 256*'F'+'P';
+    case DS_DL: return 256*'D'+'L';
+    case DS_BA: return 256*'B'+'A';
+    case DS_BS: return 256*'B'+'S';
+    case DS_TL: return 256*'T'+'L';
+    case DS_RI: return 256*'R'+'I';
+    case DS_RS: return 256*'R'+'S';
+    case DS_PD: return 256*'P'+'D';
+    case DS_HC: return 256*'H'+'C';
+    case DS_BB: return 256*'B'+'B';
+    case DS_QQ: return 256*'Q'+'Q';
+    case DS_TN: return 256*'T'+'N';
+    case DS_TC: return 256*'T'+'C';
+    case DS_TM: return 256*'T'+'M';
+    case DS_TV: return 256*'T'+'V';
+    default: break;
+    }
+
+    return -1; // unknown
+}
+
+static cram_codec *cram_codec_iter_next(cram_codec_iter *iter,
+                                        int *key) {
+    cram_codec *cc = NULL;
+    cram_block_compression_hdr *hdr = iter->hdr;
+
+    if (!iter->is_tag) {
+        // 1: Iterating through main data-series
+        do {
+            cc = hdr->codecs[iter->idx++];
+        } while(!cc && iter->idx < DS_END);
+        if (cc) {
+            *key = cram_ds_to_key(iter->idx-1);
+            return cc;
+        }
+
+        // Reset index for phase 2
+        iter->idx = 0;
+        iter->is_tag = 1;
+    }
+
+    do {
+        if (!iter->curr_map)
+            iter->curr_map = hdr->tag_encoding_map[iter->idx++];
+
+        cc = iter->curr_map ? iter->curr_map->codec : NULL;
+        if (cc) {
+            *key = iter->curr_map->key;
+            iter->curr_map = iter->curr_map->next;
+            return cc;
+        }
+    } while (iter->idx <= CRAM_MAP_HASH);
+
+    // End of codecs
+    return NULL;
+}
+
+/*
+ * A list of data-series, used to create a linked list threaded through
+ * a single array.
+ */
+typedef struct ds_list {
+    int data_series;
+    int next;
+} ds_list;
+
+KHASH_MAP_INIT_INT(cid, int64_t)
+
+// Opaque struct for the CRAM block content-id -> data-series map.
+struct cram_cid2ds_t {
+    ds_list *ds;        // array of data-series with linked lists threading through it
+    int ds_size;
+    int ds_idx;
+    khash_t(cid) *hash; // key=content_id,  value=index to ds array
+    int *ds_a;          // serialised array of data-series returned by queries.
+};
+
+void cram_cid2ds_free(cram_cid2ds_t *cid2ds) {
+    if (cid2ds) {
+        if (cid2ds->hash)
+            kh_destroy(cid, cid2ds->hash);
+        free(cid2ds->ds);
+        free(cid2ds->ds_a);
+        free(cid2ds);
+    }
+}
+
+/*
+ * Map cram block numbers to data-series.  It's normally a 1:1 mapping,
+ * but in rare cases it can be 1:many (or even many:many).
+ * The key is the block number and the value is an index into the data-series
+ * array, which we iterate over until reaching a negative value.
+ *
+ * Provide cid2ds as NULL to allocate a new map or pass in an existing one
+ * to append to this map.  The new (or existing) map is returned.
+ *
+ * Returns the cid2ds (newly allocated or as provided) on success,
+ *         NULL on failure.
+ */
+cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr,
+                                      cram_cid2ds_t *cid2ds) {
+    cram_cid2ds_t *c2d = cid2ds;
+    if (!c2d) {
+        c2d = calloc(1, sizeof(*c2d));
+        if (!c2d)
+            return NULL;
+
+        c2d->hash = kh_init(cid);
+        if (!c2d->hash)
+            goto err;
+    }
+
+    // Iterate through codecs.  Initially primary two-left ones in
+    // rec_encoding_map, and then the three letter in tag_encoding_map.
+    cram_codec_iter citer;
+    cram_codec_iter_init(hdr, &citer);
+    cram_codec *codec;
+    int key;
+
+    while ((codec = cram_codec_iter_next(&citer, &key))) {
+        // Having got a codec, we can then use cram_codec_to_id to get
+        // the block IDs utilised by that codec.  This is then our
+        // map for allocating data blocks to data series, but for shared
+        // blocks we can't separate out how much is used by each DS.
+        int bnum[2];
+        cram_codec_get_content_ids(codec, bnum);
+
+        khiter_t k;
+        int ret, i;
+        for (i = 0; i < 2; i++) {
+            if (bnum[i] > -2) {
+                k = kh_put(cid, c2d->hash, bnum[i], &ret);
+                if (ret < 0)
+                    goto err;
+
+                if (c2d->ds_idx >= c2d->ds_size) {
+                    c2d->ds_size += 100;
+                    c2d->ds_size *= 2;
+                    ds_list *ds_new = realloc(c2d->ds,
+                                              c2d->ds_size * sizeof(*ds_new));
+                    if (!ds_new)
+                        goto err;
+                    c2d->ds = ds_new;
+                }
+
+                if (ret == 0) {
+                    // Shared content_id, so add to list of DS
+
+                    // Maybe data-series should be part of the hash key?
+                    //
+                    // So top-32 bit is content-id, bot-32 bit is key.
+                    // Sort hash by key and then can group all the data-series
+                    // known together. ??
+                    //
+                    // Brute force for now, scan to see if recorded.
+                    // Typically this is minimal effort as we almost always
+                    // have 1 data-series per block content-id, so the list to
+                    // search is of size 1.
+                    int dsi = kh_value(c2d->hash, k);
+                    while (dsi >= 0) {
+                        if (c2d->ds[dsi].data_series == key)
+                            break;
+                        dsi = c2d->ds[dsi].next;
+                    }
+
+                    if (dsi == -1) {
+                        // Block content_id seen before, but not with this DS
+                        c2d->ds[c2d->ds_idx].data_series = key;
+                        c2d->ds[c2d->ds_idx].next = kh_value(c2d->hash, k);
+                        kh_value(c2d->hash, k) = c2d->ds_idx;
+                        c2d->ds_idx++;
+                    }
+                } else {
+                    // First time this content id has been used
+                    c2d->ds[c2d->ds_idx].data_series = key;
+                    c2d->ds[c2d->ds_idx].next = -1;
+                    kh_value(c2d->hash, k) = c2d->ds_idx;
+                    c2d->ds_idx++;
+                }
+            }
+        }
+    }
+
+    return c2d;
+
+ err:
+    if (c2d != cid2ds)
+        cram_cid2ds_free(c2d);
+    return NULL;
+}
+
+/*
+ * Return a list of data series observed as belonging to a block with
+ * the specified content_id.  *n is the number of data series
+ * returned, or 0 if block is unused.
+ * Block content_id of -1 is used to indicate the CORE block.
+ *
+ * The pointer returned is owned by the cram_cid2ds state and should
+ * not be freed by the caller.
+ */
+int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n) {
+    *n = 0;
+    if (!c2d || !c2d->hash)
+        return NULL;
+
+    khiter_t k = kh_get(cid, c2d->hash, content_id);
+    if (k == kh_end(c2d->hash))
+        return NULL;
+
+    if (!c2d->ds_a) {
+        c2d->ds_a = malloc(c2d->ds_idx * sizeof(int));
+        if (!c2d->ds_a)
+            return NULL;
+    }
+
+    int dsi = kh_value(c2d->hash, k); // initial ds array index from hash
+    int idx = 0;
+    while (dsi >= 0) {
+        c2d->ds_a[idx++] = c2d->ds[dsi].data_series;
+        dsi = c2d->ds[dsi].next;      // iterate over list within ds array
+    }
+
+    *n = idx;
+    return c2d->ds_a;
+}
+
+/*
+ * Produces a description of the record and tag encodings held within
+ * a compression header and appends to 'ks'.
+ *
+ * Returns 0 on success,
+ *        <0 on failure.
+ */
+int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks) {
+    cram_codec_iter citer;
+    cram_codec_iter_init(hdr, &citer);
+    cram_codec *codec;
+    int key, r = 0;
+
+    while ((codec = cram_codec_iter_next(&citer, &key))) {
+        char key_s[4] = {0};
+        int key_i = 0;
+        if (key>>16) key_s[key_i++] = key>>16;
+        key_s[key_i++] = (key>>8)&0xff;
+        key_s[key_i++] = key&0xff;
+        r |= ksprintf(ks, "\t%s\t", key_s) < 0;
+        r |= cram_codec_describe(codec, ks) < 0;
+        r |= kputc('\n', ks) < 0;
+    }
+
+    return r ? -1 : 0;
+}
+
 /*
  *-----------------------------------------------------------------------------
  * cram_slice
@@ -206,12 +509,17 @@ void cram_slice_hdr_get_coords(cram_block_slice_hdr *h,
  *-----------------------------------------------------------------------------
  * cram_block
  */
-int32_t cram_block_get_content_id(cram_block *b)  { return b->content_id; }
+int32_t cram_block_get_content_id(cram_block *b)  {
+    return b->content_type == CORE ? -1 : b->content_id;
+}
 int32_t cram_block_get_comp_size(cram_block *b)   { return b->comp_size; }
 int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; }
 int32_t cram_block_get_crc32(cram_block *b)       { return b->crc32; }
 void *  cram_block_get_data(cram_block *b)        { return BLOCK_DATA(b); }
 int32_t cram_block_get_size(cram_block *b)        { return BLOCK_SIZE(b); }
+enum cram_block_method cram_block_get_method(cram_block *b) {
+    return (enum cram_block_method)b->orig_method;
+}
 enum cram_content_type cram_block_get_content_type(cram_block *b) {
     return b->content_type;
 }
@@ -236,6 +544,122 @@ void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); }
 size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); }
 void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; }
 
+/*
+ * Given a compressed block of data in a specified compression method,
+ * fill out the 'cm' field with meta-data gleaned from the compressed
+ * block.
+ *
+ * If comp is CRAM_COMP_UNKNOWN, we attempt to auto-detect the compression
+ * format, but this doesn't work for all methods.
+ *
+ * Retuns the detected or specified comp method, and fills out *cm
+ * if non-NULL.
+ */
+cram_method_details *cram_expand_method(uint8_t *data, int32_t size,
+                                        enum cram_block_method comp) {
+    cram_method_details *cm = calloc(1, sizeof(*cm));
+    if (!cm)
+        return NULL;
+
+    const char *xz_header = "\xFD""7zXZ"; // including nul
+
+    if (comp == CRAM_COMP_UNKNOWN) {
+        // Auto-detect
+        if (size > 1 && data[0] == 0x1f && data[1] == 0x8b)
+            comp = CRAM_COMP_GZIP;
+        else if (size > 3 && data[1] == 'B' && data[2] == 'Z'
+                 && data[3] == 'h')
+            comp = CRAM_COMP_BZIP2;
+        else if (size > 6 && memcmp(xz_header, data, 6) == 0)
+            comp = CRAM_COMP_LZMA;
+        else
+            comp = CRAM_COMP_UNKNOWN;
+    }
+    cm->method = comp;
+
+    // Interrogate the compressed data stream to fill out additional fields.
+    switch (comp) {
+    case CRAM_COMP_GZIP:
+        if (size > 8) {
+            if (data[8] == 4)
+                cm->level = 1;
+            else if (data[8] == 2)
+                cm->level = 9;
+            else
+                cm->level = 5;
+        }
+        break;
+
+    case CRAM_COMP_BZIP2:
+        if (size > 3 && data[3] >= '1' && data[3] <= '9')
+            cm->level = data[3]-'0';
+        break;
+
+    case CRAM_COMP_RANS4x8:
+        cm->Nway = 4;
+        if (size > 0 && data[0] == 1)
+            cm->order = 1;
+        else
+            cm->order = 0;
+        break;
+
+    case CRAM_COMP_RANSNx16:
+        if (size > 0) {
+            cm->order  = data[0] & 1;
+            cm->Nway   = data[0] & RANS_ORDER_X32    ? 32 : 4;
+            cm->rle    = data[0] & RANS_ORDER_RLE    ?  1 : 0;
+            cm->pack   = data[0] & RANS_ORDER_PACK   ?  1 : 0;
+            cm->cat    = data[0] & RANS_ORDER_CAT    ?  1 : 0;
+            cm->stripe = data[0] & RANS_ORDER_STRIPE ?  1 : 0;
+            cm->nosz   = data[0] & RANS_ORDER_NOSZ   ?  1 : 0;
+        }
+        break;
+
+    case CRAM_COMP_ARITH:
+        if (size > 0) {
+            // Not in a public header, but the same transforms as rANSNx16
+            cm->order  = data[0] & 3;
+            cm->rle    = data[0] & RANS_ORDER_RLE    ?  1 : 0;
+            cm->pack   = data[0] & RANS_ORDER_PACK   ?  1 : 0;
+            cm->cat    = data[0] & RANS_ORDER_CAT    ?  1 : 0;
+            cm->stripe = data[0] & RANS_ORDER_STRIPE ?  1 : 0;
+            cm->nosz   = data[0] & RANS_ORDER_NOSZ   ?  1 : 0;
+            cm->ext    = data[0] & 4 /*external*/    ?  1 : 0;
+        }
+        break;
+
+    case CRAM_COMP_TOK3:
+        if (size > 8) {
+            if (data[8] == 1)
+                cm->level = 11;
+            else if (data[8] == 0)
+                cm->level = 1;
+        }
+        break;
+
+    default:
+        break;
+    }
+
+    return cm;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_codecs
+ */
+
+// -2 is unused.
+// -1 is CORE
+// >= 0 is the block with that Content ID
+void cram_codec_get_content_ids(cram_codec *c, int ids[2]) {
+    ids[0] = cram_codec_to_id(c, &ids[1]);
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Utility functions
+ */
 
 /*
  * Copies the blocks representing the next num_slice slices from a
diff --git a/cram/cram_io.c b/cram/cram_io.c
index 5d01e1318..d3c39e47a 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2022 Genome Research Ltd.
+Copyright (c) 2012-2023 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -2183,6 +2183,8 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s,
                 case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
                 case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
                 case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
+                case TOK3:     strat = 0; break;
+                case TOKA:     strat = 1; break;
                 default:       strat = 0;
                 }
                 metrics->strat  = strat;
@@ -2586,6 +2588,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
         e->seq = NULL;
         e->mf = NULL;
         e->is_md5 = 0;
+        e->validated_md5 = 0;
 
         k = kh_put(refs, r->h_meta, e->name, &n);
         if (-1 == n)  {
@@ -3022,6 +3025,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
             fd->refs->fp = fp;
             fd->refs->fn = r->fn;
             r->is_md5 = 1;
+            r->validated_md5 = 1;
 
             // Fall back to cram_get_ref() where it'll do the actual
             // reading of the file.
@@ -3043,6 +3047,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
         }
         r->length = sz;
         r->is_md5 = 1;
+        r->validated_md5 = 1;
     } else {
         refs_t *refs;
         const char *fn;
@@ -3198,7 +3203,7 @@ void cram_ref_decr(refs_t *r, int id) {
 }
 
 /*
- * Used by cram_ref_load and cram_ref_get. The file handle will have
+ * Used by cram_ref_load and cram_get_ref. The file handle will have
  * already been opened, so we can catch it. The ref_entry *e informs us
  * of whether this is a multi-line fasta file or a raw MD5 style file.
  * Either way we create a single contiguous sequence.
@@ -3216,6 +3221,10 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
     /*
      * Compute locations in file. This is trivial for the MD5 files, but
      * is still necessary for the fasta variants.
+     *
+     * Note the offset here, as with faidx, has the assumption that white-
+     * space (the diff between line_length and bases_per_line) only occurs
+     * at the end of a line of text.
      */
     offset = e->line_length
         ? e->offset + (start-1)/e->bases_per_line * e->line_length +
@@ -3244,14 +3253,34 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
 
     /* Strip white-space if required. */
     if (len != end-start+1) {
-        int i, j;
+        hts_pos_t i, j;
         char *cp = seq;
         char *cp_to;
 
+        // Copy up to the first white-space, and then repeatedly just copy
+        // bases_per_line verbatim, and use the slow method to end again.
+        //
+        // This may seem excessive, but this code can be a significant
+        // portion of total CRAM decode CPU time for shallow data sets.
         for (i = j = 0; i < len; i++) {
-            if (cp[i] >= '!' && cp[i] <= '~')
-                cp[j++] = toupper_c(cp[i]);
+            if (!isspace_c(cp[i]))
+                cp[j++] = cp[i] & ~0x20;
+            else
+                break;
+        }
+        while (i < len && isspace_c(cp[i]))
+            i++;
+        while (i < len - e->line_length) {
+            hts_pos_t j_end = j + e->bases_per_line;
+            while (j < j_end)
+                cp[j++] = cp[i++] & ~0x20; // toupper equiv
+            i += e->line_length - e->bases_per_line;
         }
+        for (; i < len; i++) {
+            if (!isspace_c(cp[i]))
+                cp[j++] = cp[i] & ~0x20;
+        }
+
         cp_to = cp+j;
 
         if (cp_to - seq != end-start+1) {
@@ -3422,8 +3451,11 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
      */
     pthread_mutex_lock(&fd->refs->lock);
     if (r->length == 0) {
+        if (fd->ref_fn)
+            hts_log_warning("Reference file given, but ref '%s' not present",
+                            r->name);
         if (cram_populate_ref(fd, id, r) == -1) {
-            hts_log_error("Failed to populate reference for id %d", id);
+            hts_log_warning("Failed to populate reference for id %d", id);
             pthread_mutex_unlock(&fd->refs->lock);
             pthread_mutex_unlock(&fd->ref_lock);
             return NULL;
@@ -3620,6 +3652,8 @@ cram_container *cram_new_container(int nrec, int nslice) {
     c->max_apos   = 0;
     c->multi_seq  = 0;
     c->qs_seq_orient = 1;
+    c->no_ref = 0;
+    c->embed_ref = -1; // automatic selection
 
     c->bams = NULL;
 
@@ -4823,6 +4857,11 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
                 return -1;
     }
 
+    if (-1 == refs_from_header(fd))
+        return -1;
+    if (-1 == refs2id(fd->refs, fd->header))
+        return -1;
+
     /* Fix M5 strings */
     if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
         int i;
@@ -4870,6 +4909,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
                 cram_ref_decr(fd->refs, i);
 
                 hts_md5_hex(buf2, buf);
+                fd->refs->ref_id[i]->validated_md5 = 1;
                 if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
                     return -1;
             }
@@ -4997,11 +5037,6 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
         cram_free_container(c);
     }
 
-    if (-1 == refs_from_header(fd))
-        return -1;
-    if (-1 == refs2id(fd->refs, fd->header))
-        return -1;
-
     if (0 != hflush(fd->fp))
         return -1;
 
diff --git a/cram/cram_structs.h b/cram/cram_structs.h
index 16739c2c6..1ee4b9e85 100644
--- a/cram/cram_structs.h
+++ b/cram/cram_structs.h
@@ -88,7 +88,7 @@ struct hFILE;
 #define BASES_PER_SLICE (SEQS_PER_SLICE*500)
 #define SLICE_PER_CNT  1
 
-#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
+#define CRAM_SUBST_MATRIX "CGTNGTANCATNGCANACGT"
 
 #define MAX_STAT_VAL 1024
 //#define MAX_STAT_VAL 16
@@ -457,6 +457,8 @@ struct cram_container {
     /* Copied from fd before encoding, to allow multi-threading */
     int ref_start, first_base, last_base, ref_id, ref_end;
     char *ref;
+    int embed_ref;               // 1 if embedding ref, 2 if embedding cons
+    int no_ref;                  // true if referenceless
     //struct ref_entry *ref;
 
     /* For multi-threading */
@@ -675,6 +677,7 @@ typedef struct ref_entry {
     char *seq;
     mFILE *mf;
     int is_md5;            // Reference comes from a raw seq found by MD5
+    int validated_md5;
 } ref_entry;
 
 KHASH_MAP_INIT_STR(refs, ref_entry*)
@@ -792,14 +795,14 @@ struct cram_fd {
     cram_container *ctr_mt;
 
     // positions for encoding or decoding
-    int first_base, last_base;
+    int first_base, last_base; // copied to container
 
     // cached reference portion
     refs_t *refs;              // ref meta-data structure
     char *ref, *ref_free;      // current portion held in memory
-    int   ref_id;
-    int   ref_start;
-    int   ref_end;
+    int   ref_id;              // copied to container
+    int   ref_start;           // copied to container
+    int   ref_end;             // copied to container
     char *ref_fn;   // reference fasta filename
 
     // compression level and metrics
@@ -812,8 +815,8 @@ struct cram_fd {
     int seqs_per_slice;
     int bases_per_slice;
     int slices_per_container;
-    int embed_ref;
-    int no_ref;
+    int embed_ref; // copied to container
+    int no_ref;    // copied to container
     int ignore_md5;
     int use_bz2;
     int use_rans;
diff --git a/faidx.c b/faidx.c
index f3be5e57c..2eb0f3edc 100644
--- a/faidx.c
+++ b/faidx.c
@@ -1,6 +1,6 @@
 /*  faidx.c -- FASTA and FASTQ random access.
 
-    Copyright (C) 2008, 2009, 2013-2020 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -702,6 +702,12 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
         return NULL;
     }
 
+    if (val->line_blen <= 0) {
+        hts_log_error("Invalid line length in index: %d", val->line_blen);
+        *len = -1;
+        return NULL;
+    }
+
     ret = bgzf_useek(fai->bgzf,
                      offset
                      + beg / val->line_blen * val->line_len
@@ -766,6 +772,22 @@ static int fai_get_val(const faidx_t *fai, const char *str,
     return 0;
 }
 
+/*
+ *  The internal still has line_blen as uint32_t, but our references
+ *  can be longer, so for future proofing we use hts_pos_t.  We also needed
+ *  a signed value so we can return negatives as an error.
+ */
+hts_pos_t fai_line_length(const faidx_t *fai, const char *str)
+{
+    faidx1_t val;
+    int64_t beg, end;
+    hts_pos_t len;
+
+    if (fai_get_val(fai, str, &len, &val, &beg, &end))
+        return -1;
+    else
+        return val.line_blen;
+}
 
 char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len)
 {
@@ -822,26 +844,40 @@ const char *faidx_iseq(const faidx_t *fai, int i)
     return fai->name[i];
 }
 
-int faidx_seq_len(const faidx_t *fai, const char *seq)
+hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq)
 {
     khint_t k = kh_get(s, fai->hash, seq);
     if ( k == kh_end(fai->hash) ) return -1;
     return kh_val(fai->hash, k).len;
 }
 
-static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, hts_pos_t *p_beg_i, hts_pos_t *p_end_i, hts_pos_t *len) {
+int faidx_seq_len(const faidx_t *fai, const char *seq)
+{
+    hts_pos_t len = faidx_seq_len64(fai, seq);
+    return len < INT_MAX ? len : INT_MAX;
+}
+
+static int faidx_adjust_position(const faidx_t *fai, int end_adjust,
+                                 faidx1_t *val_out, const char *c_name,
+                                 hts_pos_t *p_beg_i, hts_pos_t *p_end_i,
+                                 hts_pos_t *len) {
     khiter_t iter;
+    faidx1_t *val;
 
     // Adjust position
     iter = kh_get(s, fai->hash, c_name);
 
     if (iter == kh_end(fai->hash)) {
-        *len = -2;
+        if (len)
+            *len = -2;
         hts_log_error("The sequence \"%s\" was not found", c_name);
         return 1;
     }
 
-    *val = kh_value(fai->hash, iter);
+    val = &kh_value(fai->hash, iter);
+
+    if (val_out)
+        *val_out = *val;
 
     if(*p_end_i < *p_beg_i)
         *p_beg_i = *p_end_i;
@@ -849,22 +885,42 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *
     if(*p_beg_i < 0)
         *p_beg_i = 0;
     else if(val->len <= *p_beg_i)
-        *p_beg_i = val->len - 1;
+        *p_beg_i = val->len;
 
     if(*p_end_i < 0)
         *p_end_i = 0;
     else if(val->len <= *p_end_i)
-        *p_end_i = val->len - 1;
+        *p_end_i = val->len - end_adjust;
 
     return 0;
 }
 
+int fai_adjust_region(const faidx_t *fai, int tid,
+                      hts_pos_t *beg, hts_pos_t *end)
+{
+    hts_pos_t orig_beg, orig_end;
+
+    if (!fai || !beg || !end || tid < 0 || tid >= fai->n)
+        return -1;
+
+    orig_beg = *beg;
+    orig_end = *end;
+    if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) {
+        hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"",
+                      fai->name[tid]);
+        return -1;
+    }
+
+    return ((orig_beg != *beg ? 1 : 0) |
+            (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0));
+}
+
 char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
 {
     faidx1_t val;
 
     // Adjust position
-    if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) {
+    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
         return NULL;
     }
 
@@ -885,7 +941,7 @@ char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg
     faidx1_t val;
 
     // Adjust position
-    if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) {
+    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
         return NULL;
     }
 
diff --git a/hts.c b/hts.c
index 8b437f2b9..cead9d537 100644
--- a/hts.c
+++ b/hts.c
@@ -1,6 +1,6 @@
 /*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
 
-    Copyright (C) 2008, 2009, 2012-2022 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2012-2023 Genome Research Ltd.
     Copyright (C) 2012, 2013 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -1934,8 +1934,9 @@ char **hts_readlist(const char *string, int is_file, int *_n)
         if ( !fp ) return NULL;
 
         kstring_t str;
+        int ret;
         str.s = 0; str.l = str.m = 0;
-        while (bgzf_getline(fp, '\n', &str) >= 0)
+        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
         {
             if (str.l == 0) continue;
             if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
@@ -1945,6 +1946,8 @@ char **hts_readlist(const char *string, int is_file, int *_n)
                 goto err;
             n++;
         }
+        if (ret < -1) // Read error
+            goto err;
         bgzf_close(fp);
         free(str.s);
     }
@@ -1991,8 +1994,9 @@ char **hts_readlines(const char *fn, int *_n)
     BGZF *fp = bgzf_open(fn, "r");
     if ( fp ) { // read from file
         kstring_t str;
+        int ret;
         str.s = 0; str.l = str.m = 0;
-        while (bgzf_getline(fp, '\n', &str) >= 0) {
+        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
             if (str.l == 0) continue;
             if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
                 goto err;
@@ -2001,6 +2005,8 @@ char **hts_readlines(const char *fn, int *_n)
                 goto err;
             n++;
         }
+        if (ret < -1) // Read error
+            goto err;
         bgzf_close(fp);
         free(str.s);
     } else if (*fn == ':') { // read from string
@@ -2354,9 +2360,9 @@ int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
         return 0;
 
     if (idx->fmt == HTS_FMT_CSI) {
-        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
-                      " cannot be stored in a csi index. "
-                      "Please check headers match the data",
+        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
+                      "cannot be stored in a csi index with these parameters. "
+                      "Please use a larger min_shift or depth",
                       beg, end);
     } else {
         hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
@@ -3077,16 +3083,14 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
                 free(iter);
                 iter = NULL;
             }
+        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
+            iter->finished = 1;
         } else {
             if (beg < 0) beg = 0;
             if (end < beg) {
               free(iter);
               return NULL;
             }
-            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
-              free(iter);
-              return NULL;
-            }
 
             k = kh_get(bin, bidx, META_BIN(idx));
             if (k != kh_end(bidx))
diff --git a/htscodecs b/htscodecs
index 3ef17f6fb..cd0737fff 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 3ef17f6fb5b8b6b0ad2d4c1c562165664f0703f8
+Subproject commit cd0737fff5893b0842b047da5aa3209e5f65442c
diff --git a/htsfile.1 b/htsfile.1
index cf87bef3d..9e0b0b603 100644
--- a/htsfile.1
+++ b/htsfile.1
@@ -1,4 +1,4 @@
-.TH htsfile 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools"
+.TH htsfile 1 "21 February 2023" "htslib-1.17" "Bioinformatics tools"
 .SH NAME
 htsfile \- identify high-throughput sequencing data files
 .\"
diff --git a/htsfile.c b/htsfile.c
index 0391a98fa..9f7bf4531 100644
--- a/htsfile.c
+++ b/htsfile.c
@@ -258,7 +258,7 @@ int main(int argc, char **argv)
         case 1:
             printf(
 "htsfile (htslib) %s\n"
-"Copyright (C) 2022 Genome Research Ltd.\n",
+"Copyright (C) 2023 Genome Research Ltd.\n",
                    hts_version());
             exit(EXIT_SUCCESS);
             break;
diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7
index 9f77cb585..b37eacac3 100644
--- a/htslib-s3-plugin.7
+++ b/htslib-s3-plugin.7
@@ -1,4 +1,4 @@
-.TH htslib-s3-plugin 7 "18 August 2022" "htslib-1.16" "Bioinformatics tools"
+.TH htslib-s3-plugin 7 "21 February 2023" "htslib-1.17" "Bioinformatics tools"
 .SH NAME
 s3 plugin \- htslib AWS S3 plugin
 .\"
@@ -207,8 +207,8 @@ addressing_style or host_bucket.  The first two can be set to \fBpath\fR while
 host_bucket must \fBnot\fR include the \fB%(bucket).s\fR string.
 
 .SH "SEE ALSO"
-.BR htsfile (1)
-.BR samtools (1)
+.IR htsfile (1)
+.IR samtools (1)
 .PP
 RFC 3339: <https://www.rfc-editor.org/rfc/rfc3339#section-5.6>
 .PP
diff --git a/htslib.map b/htslib.map
new file mode 100644
index 000000000..11411a6e3
--- /dev/null
+++ b/htslib.map
@@ -0,0 +1,632 @@
+HTSLIB_1.0 {
+    bam_aux2A;
+    bam_aux2Z;
+    bam_aux2f;
+    bam_aux2i;
+    bam_aux_append;
+    bam_aux_del;
+    bam_aux_get;
+    bam_cigar2qlen;
+    bam_cigar2rlen;
+    bam_copy1;
+    bam_destroy1;
+    bam_dup1;
+    bam_endpos;
+    bam_flag2str;
+    bam_hdr_read;
+    bam_hdr_write;
+    bam_init1;
+    bam_mplp_auto;
+    bam_mplp_destroy;
+    bam_mplp_init;
+    bam_mplp_init_overlaps;
+    bam_mplp_set_maxcnt;
+    bam_plp_auto;
+    bam_plp_destroy;
+    bam_plp_init;
+    bam_plp_next;
+    bam_plp_push;
+    bam_plp_reset;
+    bam_plp_set_maxcnt;
+    bam_read1;
+    bam_str2flag;
+    bam_write1;
+    bcf_add_filter;
+    bcf_calc_ac;
+    bcf_clear;
+    bcf_destroy;
+    bcf_dup;
+    bcf_enc_vchar;
+    bcf_enc_vfloat;
+    bcf_enc_vint;
+    bcf_float_missing;
+    bcf_float_vector_end;
+    bcf_fmt_array;
+    bcf_fmt_sized_array;
+    bcf_get_fmt;
+    bcf_get_format_string;
+    bcf_get_format_values;
+    bcf_get_info;
+    bcf_get_info_values;
+    bcf_get_variant_type;
+    bcf_get_variant_types;
+    bcf_gt_type;
+    bcf_has_filter;
+    bcf_hdr_add_hrec;
+    bcf_hdr_add_sample;
+    bcf_hdr_append;
+    bcf_hdr_combine;
+    bcf_hdr_destroy;
+    bcf_hdr_dup;
+    bcf_hdr_fmt_text;
+    bcf_hdr_get_hrec;
+    bcf_hdr_get_version;
+    bcf_hdr_id2int;
+    bcf_hdr_init;
+    bcf_hdr_parse;
+    bcf_hdr_parse_line;
+    bcf_hdr_printf;
+    bcf_hdr_read;
+    bcf_hdr_remove;
+    bcf_hdr_seqnames;
+    bcf_hdr_set;
+    bcf_hdr_set_samples;
+    bcf_hdr_set_version;
+    bcf_hdr_subset;
+    bcf_hdr_sync;
+    bcf_hdr_write;
+    bcf_hrec_add_key;
+    bcf_hrec_destroy;
+    bcf_hrec_dup;
+    bcf_hrec_find_key;
+    bcf_hrec_format;
+    bcf_hrec_set_val;
+    bcf_index_build;
+    bcf_init;
+    bcf_is_snp;
+    bcf_read;
+    bcf_readrec;
+    bcf_remove_alleles;
+    bcf_remove_filter;
+    bcf_sr_add_reader;
+    bcf_sr_destroy;
+    bcf_sr_init;
+    bcf_sr_next_line;
+    bcf_sr_regions_destroy;
+    bcf_sr_regions_flush;
+    bcf_sr_regions_init;
+    bcf_sr_regions_next;
+    bcf_sr_regions_overlap;
+    bcf_sr_regions_seek;
+    bcf_sr_remove_reader;
+    bcf_sr_seek;
+    bcf_sr_set_regions;
+    bcf_sr_set_samples;
+    bcf_sr_set_targets;
+    bcf_subset;
+    bcf_subset_format;
+    bcf_sweep_bwd;
+    bcf_sweep_destroy;
+    bcf_sweep_fwd;
+    bcf_sweep_hdr;
+    bcf_sweep_init;
+    bcf_translate;
+    bcf_trim_alleles;
+    bcf_type_shift;
+    bcf_unpack;
+    bcf_update_alleles;
+    bcf_update_alleles_str;
+    bcf_update_filter;
+    bcf_update_format;
+    bcf_update_format_string;
+    bcf_update_id;
+    bcf_update_info;
+    bcf_write;
+    bgzf_check_EOF;
+    bgzf_close;
+    bgzf_dopen;
+    bgzf_flush;
+    bgzf_flush_try;
+    bgzf_getc;
+    bgzf_getline;
+    bgzf_hopen;
+    bgzf_index_build_init;
+    bgzf_index_dump;
+    bgzf_index_load;
+    bgzf_is_bgzf;
+    bgzf_mt;
+    bgzf_open;
+    bgzf_raw_read;
+    bgzf_raw_write;
+    bgzf_read;
+    bgzf_read_block;
+    bgzf_seek;
+    bgzf_set_cache_size;
+    bgzf_useek;
+    bgzf_utell;
+    bgzf_write;
+    cram_close;
+    cram_compress_block;
+    cram_dopen;
+    cram_eof;
+    cram_flush;
+    cram_free_block;
+    cram_free_container;
+    cram_new_block;
+    cram_new_container;
+    cram_open;
+    cram_read_block;
+    cram_read_container;
+    cram_seek;
+    cram_set_header;
+    cram_set_option;
+    cram_set_voption;
+    cram_uncompress_block;
+    cram_write_block;
+    cram_write_container;
+    fai_build;
+    fai_destroy;
+    fai_fetch;
+    fai_load;
+    faidx_fetch_nseq;
+    faidx_fetch_seq;
+    faidx_has_seq;
+    hclose;
+    hclose_abruptly;
+    hdopen;
+    hfile_destroy;
+    hfile_init;
+    hfile_oflags;
+    hflush;
+    hgetc2;
+    hopen;
+    hpeek;
+    hputc2;
+    hputs2;
+    hread2;
+    hrec_add_idx;
+    hseek;
+    hts_close;
+    hts_file_type;
+    hts_get_bgzfp;
+    hts_getline;
+    hts_idx_destroy;
+    hts_idx_finish;
+    hts_idx_get_meta;
+    hts_idx_get_n_no_coor;
+    hts_idx_get_stat;
+    hts_idx_init;
+    hts_idx_load;
+    hts_idx_push;
+    hts_idx_save;
+    hts_idx_seqnames;
+    hts_idx_set_meta;
+    hts_itr_destroy;
+    hts_itr_next;
+    hts_itr_query;
+    hts_itr_querys;
+    hts_open;
+    hts_parse_reg;
+    hts_readlines;
+    hts_readlist;
+    hts_set_fai_filename;
+    hts_set_threads;
+    hts_verbose;
+    hts_version;
+    hwrite2;
+    kf_betai;
+    kf_erfc;
+    kf_gammap;
+    kf_gammaq;
+    kf_lgamma;
+    kmemmem;
+    knet_close;
+    knet_dopen;
+    knet_open;
+    knet_read;
+    knet_seek;
+    ksplit_core;
+    ksprintf;
+    kstrnstr;
+    kstrstr;
+    kstrtok;
+    kt_fisher_exact;
+    kvsprintf;
+    sam_format1;
+    sam_hdr_add_lines;
+    sam_hdr_dup;
+    sam_hdr_incr_ref;
+    sam_hdr_length;
+    sam_hdr_parse;
+    sam_hdr_read;
+    sam_hdr_str;
+    sam_hdr_write;
+    sam_index_load;
+    sam_itr_queryi;
+    sam_itr_querys;
+    sam_open_mode;
+    sam_parse1;
+    sam_read1;
+    sam_write1;
+    seq_nt16_str;
+    seq_nt16_table;
+    stringify_argv;
+    tbx_conf_bed;
+    tbx_conf_gff;
+    tbx_conf_psltbl;
+    tbx_conf_sam;
+    tbx_conf_vcf;
+    tbx_destroy;
+    tbx_index;
+    tbx_index_build;
+    tbx_index_load;
+    tbx_name2id;
+    tbx_readrec;
+    tbx_seqnames;
+    vcf_format;
+    vcf_hdr_read;
+    vcf_hdr_write;
+    vcf_parse;
+    vcf_read;
+    vcf_write;
+    vcf_write_line;
+};
+
+HTSLIB_1.1 {
+    bcf_get_fmt_id;
+    bcf_get_info_id;
+    faidx_iseq;
+    faidx_nseq;
+    faidx_seq_len;
+} HTSLIB_1.0;
+
+
+HTSLIB_1.2.1 {
+    bcf_copy;
+    bcf_sr_strerror;
+    hisremote;
+    hts_detect_format;
+    hts_format_description;
+    hts_get_format;
+    hts_hopen;
+    hts_set_opt;
+    regidx_destroy;
+    regidx_init;
+    regidx_insert;
+    regidx_nregs;
+    regidx_overlap;
+    regidx_parse_bed;
+    regidx_parse_tab;
+    regidx_seq_names;
+    regidx_seq_nregs;
+    seq_nt16_int;
+} HTSLIB_1.1;
+
+HTSLIB_1.3 {
+    bcf_add_id;
+    bcf_empty;
+    bcf_hdr_merge;
+    bcf_index_build2;
+    bcf_index_load2;
+    bcf_remove_allele_set;
+    bgzf_compress;
+    cram_block_append;
+    cram_block_get_comp_size;
+    cram_block_get_content_id;
+    cram_block_get_content_type;
+    cram_block_get_crc32;
+    cram_block_get_data;
+    cram_block_get_offset;
+    cram_block_get_uncomp_size;
+    cram_block_set_comp_size;
+    cram_block_set_content_id;
+    cram_block_set_crc32;
+    cram_block_set_data;
+    cram_block_set_offset;
+    cram_block_set_uncomp_size;
+    cram_block_size;
+    cram_block_update_size;
+    cram_container_get_landmarks;
+    cram_container_get_length;
+    cram_container_get_num_blocks;
+    cram_container_is_empty;
+    cram_container_set_landmarks;
+    cram_container_set_length;
+    cram_container_set_num_blocks;
+    cram_container_size;
+    cram_copy_slice;
+    cram_fd_get_fp;
+    cram_fd_get_header;
+    cram_fd_get_version;
+    cram_fd_set_fp;
+    cram_fd_set_header;
+    cram_fd_set_version;
+    cram_major_vers;
+    cram_minor_vers;
+    cram_store_container;
+    cram_transcode_rg;
+    hfile_add_scheme_handler;
+    hfile_always_local;
+    hfile_always_remote;
+    hts_format_file_extension;
+    hts_idx_load2;
+    hts_idx_save_as;
+    hts_md5_destroy;
+    hts_md5_final;
+    hts_md5_hex;
+    hts_md5_init;
+    hts_md5_reset;
+    hts_md5_update;
+    hts_open_format;
+    hts_opt_add;
+    hts_opt_apply;
+    hts_opt_free;
+    hts_parse_decimal;
+    hts_parse_format;
+    hts_parse_opt_list;
+    int32_put_blk;
+    kgetline;
+    sam_index_build;
+    sam_index_build2;
+    sam_index_load2;
+    sam_open_mode_opts;
+    tbx_index_build2;
+    tbx_index_load2;
+} HTSLIB_1.2.1;
+
+HTSLIB_1.4 {
+    bam_auxB2f;
+    bam_auxB2i;
+    bam_auxB_len;
+    bam_aux_update_str;
+    bam_mplp_constructor;
+    bam_mplp_destructor;
+    bam_mplp_reset;
+    bam_plp_constructor;
+    bam_plp_destructor;
+    bcf_hdr_format;
+    bcf_index_build3;
+    bcf_sr_destroy_threads;
+    bcf_sr_set_opt;
+    bcf_sr_set_threads;
+    bgzf_block_write;
+    bgzf_compression;
+    bgzf_index_dump_hfile;
+    bgzf_index_load_hfile;
+    bgzf_thread_pool;
+    cram_check_EOF;
+    cram_get_refs;
+    errmod_cal;
+    errmod_destroy;
+    errmod_init;
+    fai_build3;
+    fai_load3;
+    hgetdelim;
+    hgets;
+    hts_check_EOF;
+    hts_json_fnext;
+    hts_json_fskip_value;
+    hts_json_snext;
+    hts_json_sskip_value;
+    hts_realloc_or_die;
+    hts_set_cache_size;
+    hts_set_thread_pool;
+    hts_tpool_delete_result;
+    hts_tpool_destroy;
+    hts_tpool_dispatch;
+    hts_tpool_dispatch2;
+    hts_tpool_init;
+    hts_tpool_kill;
+    hts_tpool_next_result;
+    hts_tpool_next_result_wait;
+    hts_tpool_process_attach;
+    hts_tpool_process_destroy;
+    hts_tpool_process_detach;
+    hts_tpool_process_empty;
+    hts_tpool_process_flush;
+    hts_tpool_process_init;
+    hts_tpool_process_len;
+    hts_tpool_process_qsize;
+    hts_tpool_process_ref_decr;
+    hts_tpool_process_ref_incr;
+    hts_tpool_process_reset;
+    hts_tpool_process_shutdown;
+    hts_tpool_process_sz;
+    hts_tpool_result_data;
+    hts_tpool_size;
+    hts_tpool_wake_dispatch;
+    kputd;
+    probaln_glocal;
+    sam_cap_mapq;
+    sam_index_build3;
+    sam_prob_realn;
+    tbx_index_build3;
+} HTSLIB_1.3;
+
+HTSLIB_1.5 {
+    hfile_set_blksize;
+    hts_get_log_level;
+    hts_log;
+    hts_set_log_level;
+} HTSLIB_1.4;
+
+HTSLIB_1.6 {
+    hts_drand48;
+    hts_erand48;
+    hts_lrand48;
+    hts_srand48;
+} HTSLIB_1.5;
+
+HTSLIB_1.7 {
+    hfile_mem_get_buffer;
+    hfile_mem_steal_buffer;
+    hts_itr_multi_bam;
+    hts_itr_multi_cram;
+    hts_itr_multi_next;
+    hts_itr_regions;
+    hts_json_alloc_token;
+    hts_json_free_token;
+    hts_json_token_str;
+    hts_json_token_type;
+    hts_reglist_free;
+    sam_hdr_change_HD;
+    sam_itr_regions;
+} HTSLIB_1.6;
+
+HTSLIB_1.9 {
+    bam_aux_update_array;
+    bam_aux_update_float;
+    bam_aux_update_int;
+    fai_fetchqual;
+    fai_load3_format;
+    fai_load_format;
+    faidx_fetch_qual;
+} HTSLIB_1.7;
+
+HTSLIB_1.10 {
+    bam_cigar_table;
+    bam_mplp64_auto;
+    bam_plp64_auto;
+    bam_plp64_next;
+    bam_plp_insertion;
+    bam_set_qname;
+    bcf_idx_init;
+    bcf_idx_save;
+    bcf_index_load3;
+    bgzf_peek;
+    fai_fetch64;
+    fai_fetchqual64;
+    fai_parse_region;
+    fai_set_cache_size;
+    faidx_fetch_qual64;
+    faidx_fetch_seq64;
+    haddextension;
+    hts_free;
+    hts_idx_fmt;
+    hts_idx_load3;
+    hts_idx_tbi_name;
+    hts_parse_reg64;
+    hts_parse_region;
+    hts_reglist_create;
+    hts_resize_array_;
+    hts_tpool_dispatch3;
+    kgetline2;
+    regidx_init_string;
+    regidx_insert_list;
+    regidx_parse_reg;
+    regidx_parse_vcf;
+    regidx_push;
+    regitr_copy;
+    regitr_destroy;
+    regitr_init;
+    regitr_loop;
+    regitr_overlap;
+    regitr_reset;
+    sam_hdr_add_line;
+    sam_hdr_add_pg;
+    sam_hdr_count_lines;
+    sam_hdr_destroy;
+    sam_hdr_find_line_id;
+    sam_hdr_find_line_pos;
+    sam_hdr_find_tag_id;
+    sam_hdr_find_tag_pos;
+    sam_hdr_init;
+    sam_hdr_line_index;
+    sam_hdr_line_name;
+    sam_hdr_name2tid;
+    sam_hdr_nref;
+    sam_hdr_pg_id;
+    sam_hdr_remove_except;
+    sam_hdr_remove_line_id;
+    sam_hdr_remove_line_pos;
+    sam_hdr_remove_lines;
+    sam_hdr_remove_tag_id;
+    sam_hdr_tid2len;
+    sam_hdr_tid2name;
+    sam_hdr_update_line;
+    sam_idx_init;
+    sam_idx_save;
+    sam_index_load3;
+    sam_itr_regarray;
+    sam_parse_region;
+    tbx_index_load3;
+} HTSLIB_1.9;
+
+HTSLIB_1.11 {
+    fai_path;
+    hts_lib_shutdown;
+    hts_tpool_process_is_shutdown;
+    vcf_open_mode;
+} HTSLIB_1.10;
+
+HTSLIB_1.12 {
+    bam_parse_cigar;
+    bam_set1;
+    hfile_has_plugin;
+    hfile_list_plugins;
+    hfile_list_schemes;
+    hts_feature_string;
+    hts_features;
+    hts_filter_eval;
+    hts_filter_free;
+    hts_filter_init;
+    hts_set_filter_expression;
+    hts_test_feature;
+    sam_parse_cigar;
+    sam_passes_filter;
+} HTSLIB_1.11;
+
+HTSLIB_1.13 {
+    hts_idx_nseq;
+} HTSLIB_1.12;
+
+HTSLIB_1.14 {
+    bam_mods_at_next_pos;
+    bam_mods_at_qpos;
+    bam_next_basemod;
+    bam_parse_basemod;
+    bam_plp_insertion_mod;
+    hts_base_mod_state_alloc;
+    hts_base_mod_state_free;
+    hts_flush;
+} HTSLIB_1.13;
+
+HTSLIB_1.15 {
+    hts_detect_format2;
+} HTSLIB_1.14;
+
+HTSLIB_1.16 {
+    bam_mods_query_type;
+    bam_mods_recorded;
+    bcf_has_variant_type;
+    bcf_has_variant_types;
+    bcf_variant_length;
+    cram_decode_slice_header;
+    cram_free_slice_header;
+    cram_slice_hdr_get_coords;
+    cram_slice_hdr_get_embed_ref_id;
+    cram_slice_hdr_get_num_blocks;
+    hts_filter_eval2;
+} HTSLIB_1.15;
+
+HTSLIB_1.17 {
+    bam_aux_first;
+    bam_aux_next;
+    bam_aux_remove;
+    bcf_strerror;
+    cram_block_get_method;
+    cram_cid2ds_free;
+    cram_cid2ds_query;
+    cram_codec_describe;
+    cram_codec_get_content_ids;
+    cram_container_get_num_bases;
+    cram_container_get_num_records;
+    cram_decode_compression_header;
+    cram_describe_encodings;
+    cram_expand_method;
+    cram_free_compression_header;
+    cram_update_cid2ds_map;
+    fai_adjust_region;
+    fai_line_length;
+    faidx_seq_len64;
+} HTSLIB_1.16;
diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index c4ba85679..cb789ad53 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -303,7 +303,8 @@ typedef struct BGZF BGZF;
      * @param fp     BGZF file handler
      * @param delim  delimiter
      * @param str    string to write to; must be initialized
-     * @return       length of the string; -1 on end-of-file; <= -2 on error
+     * @return       length of the string (capped at INT_MAX);
+     *               -1 on end-of-file; <= -2 on error
      */
     HTSLIB_EXPORT
     int bgzf_getline(BGZF *fp, int delim, struct kstring_t *str);
@@ -315,23 +316,22 @@ typedef struct BGZF BGZF;
     int bgzf_read_block(BGZF *fp) HTS_RESULT_USED;
 
     /**
-     * Enable multi-threading (when compiled with -DBGZF_MT) via a shared
-     * thread pool.  This means both encoder and decoder can balance
-     * usage across a single pool of worker jobs.
+     * Enable multi-threading via a shared thread pool.  This means
+     * both encoder and decoder can balance usage across a single pool
+     * of worker jobs.
      *
-     * @param fp          BGZF file handler; must be opened for writing
+     * @param fp          BGZF file handler
      * @param pool        The thread pool (see hts_create_threads)
      */
     HTSLIB_EXPORT
     int bgzf_thread_pool(BGZF *fp, struct hts_tpool *pool, int qsize);
 
     /**
-     * Enable multi-threading (only effective when the library was compiled
-     * with -DBGZF_MT)
+     * Enable multi-threading
      *
-     * @param fp          BGZF file handler; must be opened for writing
-     * @param n_threads   #threads used for writing
-     * @param n_sub_blks  #blocks processed by each thread; a value 64-256 is recommended
+     * @param fp          BGZF file handler
+     * @param n_threads   #threads used for reading / writing
+     * @param n_sub_blks  Unused (was #blocks processed by each thread)
      */
     HTSLIB_EXPORT
     int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
diff --git a/htslib/cram.h b/htslib/cram.h
index 8dc6fe1b3..e0b51839c 100644
--- a/htslib/cram.h
+++ b/htslib/cram.h
@@ -1,7 +1,7 @@
 /// @file htslib/cram.h
 /// CRAM format-specific API functions.
 /*
-    Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd.
+    Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -76,8 +76,59 @@ enum cram_block_method {
     RANS1    = 10,
     GZIP_RLE = 11,
 };
+#else
+
+// Values as defined in the CRAM specifications.
+// See cram/cram_structs.h cram_block_method_int for an expanded version of
+// this with local specialisations assigned to codes.
+enum cram_block_method {
+    CRAM_COMP_UNKNOWN = -1,
+
+    // CRAM 2.x and 3.0
+    CRAM_COMP_RAW      = 0,
+    CRAM_COMP_GZIP     = 1,
+    CRAM_COMP_BZIP2    = 2,
+
+    // CRAM 3.0
+    CRAM_COMP_LZMA     = 3,
+    CRAM_COMP_RANS4x8  = 4, // 4-way interleaving, 8-bit renormalisation
+
+    // CRAM 3.1
+    CRAM_COMP_RANSNx16 = 5, // both 4x16 and 32x16 variants, plus transforms
+    CRAM_COMP_ARITH    = 6, // aka Range coding
+    CRAM_COMP_FQZ      = 7, // FQZComp
+    CRAM_COMP_TOK3     = 8, // Name tokeniser
+};
 #endif
 
+/* NOTE this structure may be expanded in future releases by appending
+ * additional fields.
+ *
+ * Do not assume the size is fixed and avoid using arrays of this struct.
+ */
+typedef struct {
+    enum cram_block_method method;
+
+    // Generic compression level if known (0 if not).
+    // 1 or 9 for gzip min/max flag (else 5).  1-9 for bzip2
+    // 1 or 11 for for tok3 (rans/arith encoder).
+    int level;
+
+    // For rans* and arith codecs
+    int order;
+
+    // ransNx16/arith specific
+    int rle;
+    int pack;
+    int stripe;
+    int cat;
+    int nosz;
+    int Nway;
+
+    // Arithmetic coder only
+    int ext; // external: use gz, xz or bzip2
+} cram_method_details;
+
 enum cram_content_type {
     CT_ERROR           = -1,
     FILE_HEADER        = 0,
@@ -97,6 +148,7 @@ typedef struct cram_slice cram_slice;
 typedef struct cram_metrics cram_metrics;
 typedef struct cram_block_slice_hdr cram_block_slice_hdr;
 typedef struct cram_block_compression_hdr cram_block_compression_hdr;
+typedef struct cram_codec cram_codec;
 typedef struct refs_t refs_t;
 
 struct hFILE;
@@ -147,6 +199,10 @@ int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks)
 HTSLIB_EXPORT
 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
                                   int32_t *landmarks);
+HTSLIB_EXPORT
+int32_t cram_container_get_num_records(cram_container *c);
+HTSLIB_EXPORT
+int64_t cram_container_get_num_bases(cram_container *c);
 
 /* Returns true if the container is empty (EOF marker) */
 HTSLIB_EXPORT
@@ -167,9 +223,14 @@ HTSLIB_EXPORT
 int32_t cram_block_get_crc32(cram_block *b);
 HTSLIB_EXPORT
 void *  cram_block_get_data(cram_block *b);
-
 HTSLIB_EXPORT
 enum cram_content_type cram_block_get_content_type(cram_block *b);
+HTSLIB_EXPORT
+enum cram_block_method cram_block_get_method(cram_block *b);
+
+HTSLIB_EXPORT
+cram_method_details *cram_expand_method(uint8_t *data, int32_t size,
+                                        enum cram_block_method comp);
 
 HTSLIB_EXPORT
 void cram_block_set_content_id(cram_block *b, int32_t id);
@@ -200,6 +261,27 @@ void cram_block_set_offset(cram_block *b, size_t offset);
 HTSLIB_EXPORT
 uint32_t cram_block_size(cram_block *b);
 
+/*
+ * Returns the Block Content ID values referred to by a cram_codec in
+ * ids[2].
+ *
+ * -2 is unused.
+ * -1 is CORE
+ * >= 0 is the block with that Content ID
+ */
+HTSLIB_EXPORT
+void cram_codec_get_content_ids(cram_codec *c, int ids[2]);
+
+/*
+ * Produces a human readable description of the codec parameters.
+ * This is appended to an existing kstring 'ks'.
+ *
+ * Returns 0 on succes,
+ *        <0 on failure
+ */
+HTSLIB_EXPORT
+int cram_codec_describe(cram_codec *c, kstring_t *ks);
+
 /*
  * Renumbers RG numbers in a cram compression header.
  *
@@ -247,6 +329,66 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out,
 HTSLIB_EXPORT
 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice);
 
+/*
+ * Decodes a CRAM block compression header.
+ * Returns header ptr on success
+ *         NULL on failure
+ */
+HTSLIB_EXPORT
+cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
+                                                           cram_block *b);
+/*
+ * Frees a cram_block_compression_hdr structure.
+ */
+HTSLIB_EXPORT
+void cram_free_compression_header(cram_block_compression_hdr *hdr);
+
+typedef struct cram_cid2ds_t cram_cid2ds_t;
+
+/*
+ * Map cram block numbers to data-series.  It's normally a 1:1 mapping,
+ * but in rare cases it can be 1:many (or even many:many).
+ * The key is the block number and the value is an index into the data-series
+ * array, which we iterate over until reaching a negative value.
+ *
+ * Provide cid2ds as NULL to allocate a new map or pass in an existing one
+ * to append to this map.  The new (or existing) map is returned.
+ *
+ * Returns the cid2ds (newly allocated or as provided) on success,
+ *         NULL on failure.
+ */
+HTSLIB_EXPORT
+cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr,
+                                      cram_cid2ds_t *cid2ds);
+
+/*
+ * Return a list of data series observed as belonging to a block with
+ * the specified content_id.  *n is the number of data series
+ * returned, or 0 if block is unused.
+ * Block content_id of -1 is used to indicate the CORE block.
+ *
+ * The pointer returned is owned by the cram_cid2ds state and should
+ * not be freed by the caller.
+ */
+HTSLIB_EXPORT
+int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n);
+
+/*
+ * Frees a cram_cid2ds_t allocated by cram_update_cid2ds_map
+ */
+HTSLIB_EXPORT
+void cram_cid2ds_free(cram_cid2ds_t *cid2ds);
+
+/*
+ * Produces a description of the record and tag encodings held within
+ * a compression header and appends to 'ks'.
+ *
+ * Returns 0 on success,
+ *        <0 on failure.
+ */
+HTSLIB_EXPORT
+int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks);
+
 /*
  *-----------------------------------------------------------------------------
  * cram slice interrogation
diff --git a/htslib/faidx.h b/htslib/faidx.h
index 149cebd2e..c1b3090a5 100644
--- a/htslib/faidx.h
+++ b/htslib/faidx.h
@@ -1,7 +1,7 @@
 /// @file htslib/faidx.h
 /// FASTA random access.
 /*
-   Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020 Genome Research Ltd.
+   Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022 Genome Research Ltd.
 
    Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -188,6 +188,15 @@ char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
 HTSLIB_EXPORT
 char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len);
 
+/// Query the line-wrap length for a chromosome specified as part of a region
+/** @param  fai  Pointer to the faidx_t struct
+    @param  reg  Region in the format "chr2:20,000-30,000"
+    @return      The line length (excluding newline),
+                 negative on error.
+*/
+HTSLIB_EXPORT
+hts_pos_t fai_line_length(const faidx_t *fai, const char *reg);
+
 /// Fetch the quality string for a region for FASTQ files
 /** @param  fai  Pointer to the faidx_t struct
     @param  reg  Region in the format "chr2:20,000-30,000"
@@ -283,7 +292,22 @@ int faidx_nseq(const faidx_t *fai);
 HTSLIB_EXPORT
 const char *faidx_iseq(const faidx_t *fai, int i);
 
-/// Return sequence length, -1 if not present
+/// Return sequence length
+/** @param  fai  Pointer to the faidx_t struct
+    @param  seq  Name of the sequence
+    @return Sequence length, or -1 if not present
+*/
+HTSLIB_EXPORT
+hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq);
+
+/// Return sequence length
+/** @param  fai  Pointer to the faidx_t struct
+    @param  seq  Name of the sequence
+    @return Sequence length, or -1 if not present
+
+    @deprecated This funtion cannot handle very long sequences.
+                Use faidx_seq_len64() instead.
+*/
 HTSLIB_EXPORT
 int faidx_seq_len(const faidx_t *fai, const char *seq);
 
@@ -305,6 +329,27 @@ const char *fai_parse_region(const faidx_t *fai, const char *s,
                              int *tid, hts_pos_t *beg, hts_pos_t *end,
                              int flags);
 
+/// Adjust region to the actual sequence length
+/** @param  fai   Pointer to the faidx_t struct
+    @param  tid   Sequence index, as returned by fai_parse_region()
+    @param  beg[in,out]   The start of the region (0 based)
+    @param  end[in,out]   One past end of the region (0 based)
+    @return 1, 2, or 3 if @p beg, @p end, or both are adjusted,
+            0 if @p beg and @p end are unchanged
+            -1 on error
+
+    Looks up the length of @p tid, and then adjusts the values of @p beg
+    and @p end if they fall outside the boundaries of the sequence.
+
+    If @p beg > @p end, it will be set to @p end.
+
+    The return value indicates which, if any, of the inputs have been
+    adjusted.  -1 will be returned if @p tid is not a valid sequence index.
+*/
+HTSLIB_EXPORT
+int fai_adjust_region(const faidx_t *fai, int tid,
+                      hts_pos_t *beg, hts_pos_t *end);
+
 /// Sets the cache size of the underlying BGZF compressed file
 /** @param  fai         Pointer to the faidx_t struct
  *  @param  cache_size  Selected cache size in bytes
diff --git a/htslib/hts.h b/htslib/hts.h
index 38246cd9c..403152d0c 100644
--- a/htslib/hts.h
+++ b/htslib/hts.h
@@ -489,7 +489,7 @@ const char *hts_version(void);
 // Immediately after release, bump ZZ to 90 to distinguish in-development
 // Git repository builds from the release; you may wish to increment this
 // further when significant features are merged.
-#define HTS_VERSION 101600
+#define HTS_VERSION 101700
 
 /*! @abstract Introspection on the features enabled in htslib
  *
diff --git a/htslib/sam.h b/htslib/sam.h
index 5f8c0a554..514a6be04 100644
--- a/htslib/sam.h
+++ b/htslib/sam.h
@@ -189,7 +189,7 @@ extern const int8_t bam_cigar_table[256];
  * Mate position and insert size also need to be 64-bit, but
  * we won't accept more than 32-bit for tid.
  *
- * The bam_core_t structure is the *in memory* layout and not
+ * The bam1_core_t structure is the *in memory* layout and not
  * the same as the on-disk format.  64-bit changes here permit
  * SAM to work with very long chromosomes and permit BAM and CRAM
  * to seamlessly update in the future without further API/ABI
@@ -1438,7 +1438,6 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b,
 
 /// Converts a BAM aux tag to SAM format
 /*
- * @param b    Pointer to the bam record
  * @param key  Two letter tag key
  * @param type Single letter type code: ACcSsIifHZB.
  * @param tag  Tag data pointer, in BAM format
@@ -1628,6 +1627,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
     return NULL;
 }
 
+/// Return a pointer to a BAM record's first aux field
+/** @param b   Pointer to the BAM record
+    @return    Aux field pointer, or NULL if the record has none
+
+When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers"
+point to the TYPE byte within the auxiliary data for that field; but in general
+it is unnecessary for user code to be aware of this.)
+ */
+HTSLIB_EXPORT
+uint8_t *bam_aux_first(const bam1_t *b);
+
+/// Return a pointer to a BAM record's next aux field
+/** @param b   Pointer to the BAM record
+    @param s   Aux field pointer, as returned by bam_aux_first()/_next()/_get()
+    @return    Pointer to the next aux field, or NULL if no next field or error
+
+Whenever NULL is returned, errno will also be set: ENOENT if @p s was the
+record's last aux field; otherwise EINVAL, indicating that the BAM record's
+aux data is corrupt.
+ */
+HTSLIB_EXPORT
+uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s);
+
 /// Return a pointer to an aux record
 /** @param b   Pointer to the bam record
     @param tag Desired aux tag
@@ -1640,6 +1662,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
 HTSLIB_EXPORT
 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
 
+/// Return the aux field's 2-character tag
+/** @param s   Aux field pointer, as returned by bam_aux_first()/_next()/_get()
+    @return    Pointer to the tag characters, NOT NUL-terminated
+ */
+static inline
+const char *bam_aux_tag(const uint8_t *s) { return (const char *) (s-2); }
+
+/// Return the aux field's type character
+/** @param s   Aux field pointer, as returned by bam_aux_first()/_next()/_get()
+    @return    The type character: one of cCsSiI/fd/A/Z/H/B
+ */
+static inline char bam_aux_type(const uint8_t *s) { return *s; }
+
 /// Return a SAM formatting string containing a BAM tag
 /** @param b   Pointer to the bam record
     @param tag Desired aux tag
@@ -1742,15 +1777,33 @@ HTSLIB_EXPORT
 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data);
 
 /// Delete tag data from a bam record
-/* @param b The bam record to update
-   @param s Pointer to the tag to delete, as returned by bam_aux_get().
-   @return 0 on success; -1 on failure
-   If the bam record's aux data is corrupt, errno is set to EINVAL and this
-   function returns -1;
+/** @param b   The BAM record to update
+    @param s   Pointer to the aux field to delete, as returned by bam_aux_get()
+               Must not be NULL
+    @return    0 on success; -1 on failure
+
+If the BAM record's aux data is corrupt, errno is set to EINVAL and this
+function returns -1.
 */
 HTSLIB_EXPORT
 int bam_aux_del(bam1_t *b, uint8_t *s);
 
+/// Delete an aux field from a BAM record
+/** @param b   The BAM record to update
+    @param s   Pointer to the aux field to delete, as returned by
+               bam_aux_first()/_next()/_get(); must not be NULL
+    @return    Pointer to the following aux field, or NULL if none or on error
+
+Identical to @c bam_aux_del() apart from the return value, which is an
+aux iterator suitable for use with @c bam_aux_next()/etc.
+
+Whenever NULL is returned, errno will also be set: ENOENT if the aux field
+deleted was the record's last one; otherwise EINVAL, indicating that the
+BAM record's aux data is corrupt.
+ */
+HTSLIB_EXPORT
+uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s);
+
 /// Update or add a string-type tag
 /* @param b    The bam record to update
    @param tag  Tag identifier
diff --git a/htslib/vcf.h b/htslib/vcf.h
index c94bea589..0d9f812ce 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -2,7 +2,7 @@
 /// High-level VCF/BCF variant calling file operations.
 /*
     Copyright (C) 2012, 2013 Broad Institute.
-    Copyright (C) 2012-2020 Genome Research Ltd.
+    Copyright (C) 2012-2020, 2022 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -205,6 +205,22 @@ typedef struct bcf_dec_t {
 #define BCF_ERR_CTG_INVALID   32
 #define BCF_ERR_TAG_INVALID   64
 
+/// Get error description for bcf error code
+/** @param errorcode  The error code which is to be described
+    @param buffer     The buffer in which description to be added
+    @param maxbuffer  The size of buffer passed
+    @return NULL on invalid buffer; buffer on other cases
+
+The buffer will be an empty string when @p errorcode is 0.
+Description of errors present in code will be appended to @p buffer with ',' separation.
+The buffer has to be at least 4 characters long. NULL will be returned if it is smaller or when buffer is NULL.
+
+'...' will be appended if the description doesn't fit in the given buffer.
+ */
+
+HTSLIB_EXPORT
+const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer);
+
 /*
     The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
     is slower because the string is first to be parsed, packed into BCF line
@@ -620,7 +636,11 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     HTSLIB_EXPORT
     bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap);
 
-    /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */
+    /**
+     *  Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names).
+     *  NB: sequence name indexes returned by bcf_hdr_seqnames() may not correspond to bcf1_t.rid, use
+     *  bcf_hdr_id2name() or bcf_seqname() instead.
+     */
     HTSLIB_EXPORT
     const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs);
 
@@ -1210,7 +1230,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     #define bcf_hdr_id2number(hdr,type,int_id)  ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12)
     #define bcf_hdr_id2type(hdr,type,int_id)    (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf)
     #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf)
-    #define bcf_hdr_idinfo_exists(hdr,type,int_id)  ((int_id)>=0 && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf)
+    #define bcf_hdr_idinfo_exists(hdr,type,int_id)  ((int_id)>=0 && (int_id)<(hdr)->n[BCF_DT_ID] && (hdr)->id[BCF_DT_ID][int_id].val && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf)
     #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id)    ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)])
     /// Convert BCF FORMAT data to string form
     /**
diff --git a/sam.c b/sam.c
index c95d1c693..e2e539b2d 100644
--- a/sam.c
+++ b/sam.c
@@ -1,6 +1,6 @@
 /*  sam.c -- SAM and BAM file I/O and manipulation.
 
-    Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd.
+    Copyright (C) 2008-2010, 2012-2023 Genome Research Ltd.
     Copyright (C) 2010, 2012, 2013 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -3853,7 +3853,6 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
     }
 
     // Name
-
     if (*x->name.s != x->nprefix)
         return -2;
 
@@ -3893,8 +3892,8 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
         if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
             if (fp->format.format == fastq_format || ret < -1)
                 return -2;
-        if (*fp->line.s == (fp->format.format == fastq_format ? '+' : '>')
-            || ret == -1)
+        if (ret == -1 ||
+            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
             break;
         if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
             return -2;
@@ -4286,13 +4285,24 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
                      bc ? (char *)bc+1 : "0") < 0)
             return -1;
 
+        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
+            hts_log_warning("BC tag starts with non-sequence base; using '0'");
+            str->l -= strlen((char *)bc)-2; // limit to 1 char
+            str->s[str->l-1] = '0';
+            str->s[str->l] = 0;
+            bc = NULL;
+        }
+
         // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
         if (bc) {
             int l = strlen((char *)bc+1);
             char *c = (char *)str->s + str->l - l;
-            for (i = 0; i < l; i++)
+            for (i = 0; i < l; i++) {
                 if (!isalpha_c(c[i]))
                     c[i] = '+';
+                else if (islower_c(c[i]))
+                    c[i] = toupper_c(c[i]);
+            }
         }
     }
 
@@ -4614,31 +4624,42 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
     }
 }
 
+uint8_t *bam_aux_first(const bam1_t *b)
+{
+    uint8_t *s = bam_get_aux(b);
+    uint8_t *end = b->data + b->l_data;
+    if (s >= end) { errno = ENOENT; return NULL; }
+    return s+2;
+}
+
+uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
+{
+    uint8_t *end = b->data + b->l_data;
+    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
+    if (next == NULL) goto bad_aux;
+    if (next >= end) { errno = ENOENT; return NULL; }
+    return next+2;
+
+ bad_aux:
+    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
+    errno = EINVAL;
+    return NULL;
+}
+
 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
 {
-    uint8_t *s, *end, *t = (uint8_t *) tag;
-    uint16_t y = (uint16_t) t[0]<<8 | t[1];
-    s = bam_get_aux(b);
-    end = b->data + b->l_data;
-    while (s != NULL && end - s >= 3) {
-        uint16_t x = (uint16_t) s[0]<<8 | s[1];
-        s += 2;
-        if (x == y) {
+    uint8_t *s;
+    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
+        if (s[-2] == tag[0] && s[-1] == tag[1]) {
             // Check the tag value is valid and complete
-            uint8_t *e = skip_aux(s, end);
-            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
-                goto bad_aux;  // Unterminated string
-            }
-            if (e != NULL) {
-                return s;
-            } else {
-                goto bad_aux;
-            }
+            uint8_t *e = skip_aux(s, b->data + b->l_data);
+            if (e == NULL) goto bad_aux;
+            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
+
+            return s;
         }
-        s = skip_aux(s, end);
-    }
-    if (s == NULL) goto bad_aux;
-    errno = ENOENT;
+
+    // errno now as set by bam_aux_first()/bam_aux_next()
     return NULL;
 
  bad_aux:
@@ -4647,23 +4668,28 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
     return NULL;
 }
 
-// s MUST BE returned by bam_aux_get()
 int bam_aux_del(bam1_t *b, uint8_t *s)
 {
-    uint8_t *p, *aux;
-    int l_aux = bam_get_l_aux(b);
-    aux = bam_get_aux(b);
-    p = s - 2;
-    s = skip_aux(s, aux + l_aux);
-    if (s == NULL) goto bad_aux;
-    memmove(p, s, l_aux - (s - aux));
-    b->l_data -= s - p;
-    return 0;
+    s = bam_aux_remove(b, s);
+    return (s || errno == ENOENT)? 0 : -1;
+}
+
+uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
+{
+    uint8_t *end = b->data + b->l_data;
+    uint8_t *next = skip_aux(s, end);
+    if (next == NULL) goto bad_aux;
+
+    b->l_data -= next - (s-2);
+    if (next >= end) { errno = ENOENT; return NULL; }
+
+    memmove(s-2, next, end - next);
+    return s;
 
  bad_aux:
     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
     errno = EINVAL;
-    return -1;
+    return NULL;
 }
 
 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
@@ -5238,9 +5264,24 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
         if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
             int op2 = _cop(cigar[s->k+1]);
             int l2 = _cln(cigar[s->k+1]);
-            if (op2 == BAM_CDEL) p->indel = -(int)l2;
-            else if (op2 == BAM_CINS) p->indel = l2;
-            else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
+            if (op2 == BAM_CDEL && op != BAM_CDEL) {
+                // At start of a new deletion, merge e.g. 1D2D to 3D.
+                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
+                // and rely on is_del=1 as we would for 3D.
+                p->indel = -(int)l2;
+                for (k = s->k+2; k < c->n_cigar; ++k) {
+                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
+                    if (op2 == BAM_CDEL) p->indel -= l2;
+                    else break;
+                }
+            } else if (op2 == BAM_CINS) {
+                p->indel = l2;
+                for (k = s->k+2; k < c->n_cigar; ++k) {
+                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
+                    if (op2 == BAM_CINS) p->indel += l2;
+                    else if (op2 != BAM_CPAD) break;
+                }
+            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
                 int l3 = 0;
                 for (k = s->k + 2; k < c->n_cigar; ++k) {
                     op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
@@ -5326,8 +5367,10 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p,
             break;
         case BAM_CINS:
             for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
-                c = seq_nt16_str[bam_seqi(bam_get_seq(p->b),
-                                          p->qpos + j - p->is_del)];
+                c = p->qpos + j - p->is_del < p->b->core.l_qseq
+                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
+                                            p->qpos + j - p->is_del)]
+                    : 'N';
                 ins->s[indel++] = c;
                 int nm;
                 hts_base_mod mod[256];
diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c
index c980e3b45..23e0ecaef 100644
--- a/synced_bcf_reader.c
+++ b/synced_bcf_reader.c
@@ -623,7 +623,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
         {
             if ( reader->file->format.format==vcf )
             {
-                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
+                ret = hts_getline(reader->file, KS_SEP_LINE, &files->tmps);
+                if ( ret < -1 ) files->errnum = bcf_read_error;
+                if ( ret < 0 ) break; // no more lines or an error
                 ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
                 if ( ret<0 ) { files->errnum = vcf_parse_error; break; }
             }
@@ -641,7 +643,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
         }
         else if ( reader->tbx_idx )
         {
-            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
+            ret = tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps);
+            if ( ret < -1 ) files->errnum = bcf_read_error;
+            if ( ret < 0 ) break; // no more lines or an error
             ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
             if ( ret<0 ) { files->errnum = vcf_parse_error; break; }
         }
diff --git a/tabix.1 b/tabix.1
index 15ac768e7..ce5a47a39 100644
--- a/tabix.1
+++ b/tabix.1
@@ -1,10 +1,10 @@
-.TH tabix 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools"
+.TH tabix 1 "21 February 2023" "htslib-1.17" "Bioinformatics tools"
 .SH NAME
 .PP
 tabix \- Generic indexer for TAB-delimited genome position files
 .\"
 .\" Copyright (C) 2009-2011 Broad Institute.
-.\" Copyright (C) 2014, 2016, 2018, 2020 Genome Research Ltd.
+.\" Copyright (C) 2014, 2016, 2018, 2020, 2022 Genome Research Ltd.
 .\"
 .\" Author: Heng Li <lh3@sanger.ac.uk>
 .\"
@@ -101,7 +101,7 @@ start column. [5]
 Force to overwrite the index file if it is present.
 .TP
 .BI "-m, --min-shift " INT
-set minimal interval size for CSI indices to 2^INT [14]
+Set minimal interval size for CSI indices to 2^INT [14]
 .TP
 .BI "-p, --preset " STR
 Input format for indexing. Valid values are: gff, bed, sam, vcf.
@@ -199,6 +199,5 @@ implemented by Bob Handsaker and modified by Heng Li for remote file
 access and in-memory caching.
 
 .SH SEE ALSO
-.PP
-.BR bgzip (1),
-.BR samtools (1)
+.IR bgzip (1),
+.IR samtools (1)
diff --git a/tabix.c b/tabix.c
index a79a7b968..0798b279f 100644
--- a/tabix.c
+++ b/tabix.c
@@ -581,7 +581,7 @@ int main(int argc, char *argv[])
             case 1:
                 printf(
 "tabix (htslib) %s\n"
-"Copyright (C) 2022 Genome Research Ltd.\n", hts_version());
+"Copyright (C) 2023 Genome Research Ltd.\n", hts_version());
                 return EXIT_SUCCESS;
             case 2:
                 return usage(stdout, EXIT_SUCCESS);
diff --git a/tbx.c b/tbx.c
index 3af2c09fb..d897a21f1 100644
--- a/tbx.c
+++ b/tbx.c
@@ -1,6 +1,6 @@
 /*  tbx.c -- tabix API functions.
 
-    Copyright (C) 2009, 2010, 2012-2015, 2017-2020 Genome Research Ltd.
+    Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022 Genome Research Ltd.
     Copyright (C) 2010-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -91,9 +91,10 @@ int tbx_name2id(tbx_t *tbx, const char *ss)
     return get_tid(tbx, ss, 0);
 }
 
-int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
+int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
 {
-    int i, b = 0, id = 1;
+    size_t i, b = 0;
+    int id = 1;
     char *s;
     intv->ss = intv->se = 0; intv->beg = intv->end = -1;
     for (i = 0; i <= len; ++i) {
@@ -321,8 +322,11 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
             continue;
         }
         if (first == 0) {
-            if (fmt == HTS_FMT_CSI)
+            if (fmt == HTS_FMT_CSI) {
+                if (!max_ref_len)
+                    max_ref_len = (int64_t)100*1024*1024*1024; // 100G default
                 n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len);
+            }
             tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
             if (!tbx->idx) goto fail;
             first = 1;
diff --git a/test/faidx/ce.1.expected.fa b/test/faidx/ce.1.expected.fa
new file mode 100644
index 000000000..d606105c4
--- /dev/null
+++ b/test/faidx/ce.1.expected.fa
@@ -0,0 +1,8 @@
+>CHROMOSOME_I:5001-5125 length: 125
+AACTGGTTCAAAAACAAAAATTTTTTAAACTGTACAAACTGTCCAAAAAT
+TCGTCGTAAATCGACACACCCTTCTCATTTTTTCAAAATTTTAATTGTTT
+TCGAATGTTTTTTTTGCAGAATAAT
+>CHROMOSOME_X:101-225 length: 125
+GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC
+CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCT
+AAGCCTAAGCCTAAGCCTAAGCCTA
diff --git a/test/faidx/faidx.1.expected.fa b/test/faidx/faidx.1.expected.fa
new file mode 100644
index 000000000..d14656e9f
--- /dev/null
+++ b/test/faidx/faidx.1.expected.fa
@@ -0,0 +1,6 @@
+>trailingblank2:28-33 length: 6
+GGGCCC
+>trailingblank3:4-5 length: 2
+TA
+>bar:4-5 length: 2
+TA
diff --git a/test/faidx.fa b/test/faidx/faidx.fa
similarity index 100%
rename from test/faidx.fa
rename to test/faidx/faidx.fa
diff --git a/test/faidx/faidx.fa.expected.fai b/test/faidx/faidx.fa.expected.fai
new file mode 100644
index 000000000..b4d1aff26
--- /dev/null
+++ b/test/faidx/faidx.fa.expected.fai
@@ -0,0 +1,6 @@
+	4	2	4	5
+trailingblank1	33	23	12	13
+trailingblank2	72	111	24	25
+trailingblank3	5	234	4	6
+foo	8	252	6	7
+bar	8	280	8	9
diff --git a/test/faidx/faidx.tst b/test/faidx/faidx.tst
new file mode 100644
index 000000000..b6bd7cac1
--- /dev/null
+++ b/test/faidx/faidx.tst
@@ -0,0 +1,74 @@
+#    Copyright (C) 2022 Genome Research Ltd.
+#
+#    Author: Robert Davies <rmd@sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# First field:
+#   INIT = initialisation, not counted in testing
+#   P = expected to pass (zero return; expected output matches, if present)
+#   N = expected to return non-zero
+#   F = expected to fail
+#
+# Second field (P/N/F only):
+#   Filename of expected output.  If '.', output is not checked
+#
+# Rest:
+#   Command to execute.  $bgzip and $test_faidx are replaced with the path to
+# bgzip and test_faidx.
+
+# Index fasta
+P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -e faidx.fa.expected.fai
+
+# Test various functions on the fasta index
+P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_line_length -e 24 trailingblank2
+P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai	-t faidx_has_seq -e 1 foo
+P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_has_seq -e 0 absent
+P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_iseq -e trailingblank3 3
+P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len -e 33 trailingblank1
+P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len64 -e 72 trailingblank2
+
+# Index fastq
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -e fastqs.fq.expected.fai
+
+# Test various functions on the fastq index
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 63 FAKE0005_3
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 144 SRR014849.203935_3
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 1 SRR014849.203935_3
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 0 absent
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_iseq -e FAKE0005_1 0
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len -e 453 FSRRS4401CM938_1
+P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len64 -e 309 FSRRS4401AOV6A_4
+
+# Fasta retrieval tests
+P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai trailingblank2:28-33 trailingblank3:4-5 bar:4-5
+P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_fetch trailingblank2:28-33 trailingblank3:4-5 bar:4-5
+P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_fetch_seq64 trailingblank2:28-33 trailingblank3:4-5 bar:4-5
+P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_adjust_region trailingblank2:28-33 trailingblank3:4-5 bar:4-5
+
+# Fastq retrieval tests
+P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90
+P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_fetch FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90
+P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t faidx_fetch_seq64 FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90
+P fastqs.2.expected.fa $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90
+
+# Indexing and retrieval on bgzip compressed fasta
+INIT $bgzip -c < ../ce.fa > ce.fa.tmp.gz
+P . $test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi -e ../ce.fa.fai
+P ce.1.expected.fa $test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi CHROMOSOME_I:5001-5125 CHROMOSOME_X:101-225
diff --git a/test/faidx/fastqs.1.expected.fq b/test/faidx/fastqs.1.expected.fq
new file mode 100644
index 000000000..729393837
--- /dev/null
+++ b/test/faidx/fastqs.1.expected.fq
@@ -0,0 +1,16 @@
+@FAKE0006_1:4-12 length: 9
+TGCATGCAT
++
+{zyxwvuts
+@FSRRS4401BE7HA_1:81-120 length: 40
+GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca
++
+c^^^YRPOSNVU\YTMMMSMRKKKRUUNNNNS[`aa```\
+@FAKE0010_2 length: 30
+gatcrywsmkhbvdnGATCRYWSMKHBVDN
++
+I?5+I?5+I?5+I?5+I?5+I?5+I?5+I?
+@SRR014849.50939_3:71-90 length: 20
+CAATAAATCAATACATAAAA
++
+\aZ\d`OY[aY[[\[[e`WP
diff --git a/test/faidx/fastqs.2.expected.fa b/test/faidx/fastqs.2.expected.fa
new file mode 100644
index 000000000..9b67d15e7
--- /dev/null
+++ b/test/faidx/fastqs.2.expected.fa
@@ -0,0 +1,8 @@
+>FAKE0006_1:4-12 length: 9
+TGCATGCAT
+>FSRRS4401BE7HA_1:81-120 length: 40
+GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca
+>FAKE0010_2 length: 30
+gatcrywsmkhbvdnGATCRYWSMKHBVDN
+>SRR014849.50939_3:71-90 length: 20
+CAATAAATCAATACATAAAA
diff --git a/test/fastqs.fq b/test/faidx/fastqs.fq
similarity index 100%
rename from test/fastqs.fq
rename to test/faidx/fastqs.fq
diff --git a/test/faidx/fastqs.fq.expected.fai b/test/faidx/fastqs.fq.expected.fai
new file mode 100644
index 000000000..77ba04a5d
--- /dev/null
+++ b/test/faidx/fastqs.fq.expected.fai
@@ -0,0 +1,105 @@
+FAKE0005_1	63	85	63	64	151
+FAKE0006_1	63	300	63	64	366
+FAKE0005_2	63	515	63	64	581
+FAKE0006_2	63	730	63	64	796
+FAKE0005_3	63	945	63	64	1011
+FAKE0006_3	63	1160	63	64	1226
+FAKE0005_4	63	1375	63	64	1441
+FAKE0006_4	63	1590	63	64	1656
+FSRRS4401BE7HA_1	395	1823	395	396	2221
+FSRRS4401BRRTC_1	145	2720	145	146	2868
+FSRRS4401B64ST_1	382	3118	382	383	3503
+FSRRS4401EJ0YH_1	381	3990	381	382	4374
+FSRRS4401BK0IB_1	507	4860	507	508	5370
+FSRRS4401ARCCB_1	258	5982	258	259	6243
+FSRRS4401CM938_1	453	6606	453	454	7062
+FSRRS4401EQLIK_1	411	7620	411	412	8034
+FSRRS4401AOV6A_1	309	8550	309	310	8862
+FSRRS4401EG0ZW_1	424	9276	424	425	9703
+FSRRS4401BE7HA_2	395	10231	395	396	10629
+FSRRS4401BRRTC_2	145	11128	145	146	11276
+FSRRS4401B64ST_2	382	11526	382	383	11911
+FSRRS4401EJ0YH_2	381	12398	381	382	12782
+FSRRS4401BK0IB_2	507	13268	507	508	13778
+FSRRS4401ARCCB_2	258	14390	258	259	14651
+FSRRS4401CM938_2	453	15014	453	454	15470
+FSRRS4401EQLIK_2	411	16028	411	412	16442
+FSRRS4401AOV6A_2	309	16958	309	310	17270
+FSRRS4401EG0ZW_2	424	17684	424	425	18111
+FSRRS4401BE7HA_3	395	18639	395	396	19037
+FSRRS4401BRRTC_3	145	19536	145	146	19684
+FSRRS4401B64ST_3	382	19934	382	383	20319
+FSRRS4401EJ0YH_3	381	20806	381	382	21190
+FSRRS4401BK0IB_3	507	21676	507	508	22186
+FSRRS4401ARCCB_3	258	22798	258	259	23059
+FSRRS4401CM938_3	453	23422	453	454	23878
+FSRRS4401EQLIK_3	411	24436	411	412	24850
+FSRRS4401AOV6A_3	309	25366	309	310	25678
+FSRRS4401EG0ZW_3	424	26092	424	425	26519
+FSRRS4401BE7HA_4	395	27047	80	81	27449
+FSRRS4401BRRTC_4	145	27952	80	81	28101
+FSRRS4401B64ST_4	382	28352	80	81	28741
+FSRRS4401EJ0YH_4	381	29232	80	81	29620
+FSRRS4401BK0IB_4	507	30110	80	81	30626
+FSRRS4401ARCCB_4	258	31244	80	81	31508
+FSRRS4401CM938_4	453	31874	80	81	32335
+FSRRS4401EQLIK_4	411	32898	80	81	33317
+FSRRS4401AOV6A_4	309	33838	80	81	34153
+FSRRS4401EG0ZW_4	424	34570	80	81	35002
+FAKE0007_1	41	35549	41	42	35593
+FAKE0008_1	41	35752	41	42	35796
+FAKE0009_1	41	35955	41	42	35999
+FAKE0010_1	30	36143	30	31	36176
+FAKE0007_2	41	36324	41	42	36368
+FAKE0008_2	41	36527	41	42	36571
+FAKE0009_2	41	36730	41	42	36774
+FAKE0010_2	30	36918	30	31	36951
+FAKE0007_3	41	37099	41	42	37143
+FAKE0008_3	41	37302	41	42	37346
+FAKE0009_3	41	37505	41	42	37549
+FAKE0010_3	30	37693	30	31	37726
+FAKE0007_4	41	37874	41	42	37918
+FAKE0008_4	41	38077	41	42	38121
+FAKE0009_4	41	38280	41	42	38324
+FAKE0010_4	30	38468	30	31	38501
+FAKE0011_1	41	38649	41	42	38693
+FAKE0012_1	41	38852	41	42	38896
+FAKE0013_1	41	39055	41	42	39099
+FAKE0014_1	30	39250	30	31	39283
+FAKE0011_2	41	39431	41	42	39475
+FAKE0012_2	41	39634	41	42	39678
+FAKE0013_2	41	39837	41	42	39881
+FAKE0014_2	30	40032	30	31	40065
+FAKE0011_3	41	40213	41	42	40257
+FAKE0012_3	41	40416	41	42	40460
+FAKE0013_3	41	40619	41	42	40663
+FAKE0014_3	30	40814	30	31	40847
+FAKE0011_4	41	40995	41	42	41039
+FAKE0012_4	41	41198	41	42	41242
+FAKE0013_4	41	41401	41	42	41445
+FAKE0014_4	30	41596	30	31	41629
+FAKE0001_1	94	41745	94	95	41842
+FAKE0002_1	94	42022	94	95	42119
+FAKE0001_2	94	42299	94	95	42396
+FAKE0002_2	94	42576	94	95	42673
+FAKE0001_3	94	42853	94	95	42950
+FAKE0002_3	94	43130	94	95	43227
+FAKE0001_4	94	43407	94	95	43504
+FAKE0002_4	94	43684	94	95	43781
+FAKE0003_1	68	43963	68	69	44034
+FAKE0004_1	68	44190	68	69	44261
+FAKE0003_2	68	44417	68	69	44488
+FAKE0004_2	68	44644	68	69	44715
+FAKE0003_3	68	44871	68	69	44942
+FAKE0004_3	68	45098	68	69	45169
+FAKE0003_4	68	45325	68	69	45396
+FAKE0004_4	68	45552	68	69	45623
+SRR014849.50939_1	135	45737	135	136	45875
+SRR014849.110027_1	131	46057	131	132	46191
+SRR014849.203935_1	144	46369	144	145	46516
+SRR014849.50939_2	135	46706	135	136	46844
+SRR014849.110027_2	131	47026	131	132	47160
+SRR014849.203935_2	144	47338	144	145	47485
+SRR014849.50939_3	135	47675	135	136	47813
+SRR014849.110027_3	131	47995	131	132	48129
+SRR014849.203935_3	144	48307	144	145	48454
diff --git a/test/faidx/test-faidx.sh b/test/faidx/test-faidx.sh
new file mode 100755
index 000000000..ae501e086
--- /dev/null
+++ b/test/faidx/test-faidx.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+#
+#    Copyright (C) 2022 Genome Research Ltd.
+#
+#    Author: Robert Davies <rmd@sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# Load in the test driver
+. ../simple_test_driver.sh
+
+echo "Testing faidx..."
+
+bgzip="../../bgzip"
+test_faidx="../test_faidx"
+
+test_driver $@
+
+exit $?
diff --git a/test/modhdr.expected.vcf b/test/modhdr.expected.vcf
new file mode 100644
index 000000000..bad663c7e
--- /dev/null
+++ b/test/modhdr.expected.vcf
@@ -0,0 +1,4 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=chr22,length=51304566>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
diff --git a/test/modhdr.vcf.gz b/test/modhdr.vcf.gz
new file mode 100644
index 000000000..f97e06ab3
Binary files /dev/null and b/test/modhdr.vcf.gz differ
diff --git a/test/modhdr.vcf.gz.csi b/test/modhdr.vcf.gz.csi
new file mode 100644
index 000000000..61b60e79a
Binary files /dev/null and b/test/modhdr.vcf.gz.csi differ
diff --git a/test/sam.c b/test/sam.c
index 036349f2b..28ca1bc5f 100644
--- a/test/sam.c
+++ b/test/sam.c
@@ -87,6 +87,15 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type)
     return NULL;
 }
 
+static void check_aux_count(const bam1_t *aln, int expected, const char *what)
+{
+    const uint8_t *itr;
+    int n = 0;
+    for (itr = bam_aux_first(aln); itr; itr = bam_aux_next(aln, itr)) n++;
+    if (n != expected)
+        fail("%s has %d aux fields, expected %d", what, n, expected);
+}
+
 static void check_int_B_array(bam1_t *aln, char *tag,
                              uint32_t nvals, int64_t *vals) {
     uint8_t *p;
@@ -285,10 +294,30 @@ static int aux_fields1(void)
         if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
             fail("XA field is '%c', expected 'k'", bam_aux2A(p));
 
+        check_aux_count(aln, 24, "Original record");
+
         bam_aux_del(aln,p);
         if (bam_aux_get(aln,"XA"))
             fail("XA field was not deleted");
 
+        check_aux_count(aln, 23, "Record post-XA-deletion");
+
+        p = bam_aux_get(aln, "Y2");
+        if (p == NULL || strncmp(bam_aux_tag(p), "Y2", 2) != 0 || bam_aux_type(p) != 'i')
+            fail("bam_aux_get() missed Y2 field");
+
+        p = bam_aux_next(aln, p);
+        if (p == NULL || strncmp(bam_aux_tag(p), "Y3", 2) != 0 || bam_aux_type(p) != 'c')
+            fail("bam_aux_next() missed Y3 field");
+
+        p = bam_aux_get(aln, "Y8");
+        if (p == NULL || strncmp(bam_aux_tag(p), "Y8", 2) != 0 || bam_aux_type(p) != 'I')
+            fail("bam_aux_get() missed Y8 field");
+
+        p = bam_aux_next(aln, p);
+        if (p != NULL || errno != ENOENT)
+            fail("bam_aux_next missed the end of fields");
+
         if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
             fail("Xi field is %"PRId64", expected 37", bam_aux2i(p));
 
@@ -492,6 +521,16 @@ static int aux_fields1(void)
 
         if (strcmp(ks.s, r1) != 0)
             fail("record formatted incorrectly: \"%s\"", ks.s);
+
+        // Test field removal APIs -- after the strcmp(..., r1) check so that
+        // can also check the formatting of the to-be-removed fields.
+
+        p = bam_aux_remove(aln, check_bam_aux_get(aln, "XH", 'H'));
+        if (bam_aux_get(aln, "XH"))
+            fail("XH field was not removed");
+        check_aux_count(aln, 31, "Record post-XH-removal");
+        if (strncmp(bam_aux_tag(p), "XB", 2) != 0 || bam_aux_type(p) != 'B')
+            fail("bam_aux_remove() missed XB field");
     }
     else fail("can't read record");
 
@@ -2224,7 +2263,7 @@ int main(int argc, char **argv)
     test_text_file("test/emptyfile", 0);
     test_text_file("test/xx#pair.sam", 7);
     test_text_file("test/xx.fa", 7);
-    test_text_file("test/fastqs.fq", 500);
+    test_text_file("test/faidx/fastqs.fq", 500);
     check_enum1();
     check_cigar_tab();
     check_big_ref(0);
diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst
index 129516b24..13d2c340e 100644
--- a/test/sam_filter/filter.tst
+++ b/test/sam_filter/filter.tst
@@ -1,4 +1,4 @@
-#    Copyright (C) 2020 Genome Research Ltd.
+#    Copyright (C) 2020, 2022 Genome Research Ltd.
 #
 #    Author: James Bonfield <jkb@sanger.ac.uk>
 #
@@ -42,17 +42,17 @@ P string6.out $tv -i 'filter=library=="x"' ../xx#rg.sam
 P string7.out $tv -i 'filter=library!="x"' ../xx#rg.sam
 
 # Integer ops
-P int1.out    $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@'
-P int2.out    $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@'
+P int1.out    $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam | grep -E -cv '^@'
+P int2.out    $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam | grep -E -cv '^@'
 
 # Aux tags
-P int3.out    $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@'
+P int3.out    $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam | grep -E -cv '^@'
 
 # Functions.
-P func1.out   $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@'
-P func2.out   $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@'
-P func3.out   $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@'
-P func4.out   $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@'
-P func5.out   $tv -i 'filter=sclen>=20' ../realn02.sam | egrep -v '^@'
-P func6.out   $tv -i 'filter=rlen<50'   ../realn02.sam | egrep -v '^@'
-P func7.out   $tv -i 'filter=qlen>100'   ../realn02.sam | egrep -v '^@'
+P func1.out   $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | grep -E -cv '^@'
+P func2.out   $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | grep -E -cv '^@'
+P func3.out   $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | grep -E -cv '^@'
+P func4.out   $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | grep -E -cv '^@'
+P func5.out   $tv -i 'filter=sclen>=20' ../realn02.sam | grep -E -v '^@'
+P func6.out   $tv -i 'filter=rlen<50'   ../realn02.sam | grep -E -v '^@'
+P func7.out   $tv -i 'filter=qlen>100'   ../realn02.sam | grep -E -v '^@'
diff --git a/test/test.pl b/test/test.pl
index d6c01786a..1595557a2 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -398,6 +398,30 @@ sub test_bgzip {
     }
     passed($opts,$test);
 
+    # Round-trip test of text in binary mode
+    $test = sprintf('%s %2s threads', 'bgzip text mode round-trip',
+                       $threads ? $threads : 'no');
+    print "$test: ";
+    $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'";
+    ($ret, $out) = _cmd($c);
+    if ($ret) {
+        failed($opts, $test, "non-zero exit from $c");
+        return;
+    }
+    $c = "$$opts{bin}/bgzip $at -d < '$compressed' > '$uncompressed'";
+    ($ret, $out) = _cmd($c);
+    if ($ret) {
+        failed($opts, $test, "non-zero exit from $c");
+        return;
+    }
+    $c = "cmp '$data' '$uncompressed'";
+    ($ret, $out) = _cmd($c);
+    if ($ret) {
+        failed($opts, $test, $out ? $out : "'$data' '$uncompressed' differ");
+        return;
+    }
+    passed($opts,$test);
+
     # Extract from an offset
     $test = sprintf('%s %2s threads', 'bgzip -b',
                     $threads ? $threads : 'no');
@@ -637,14 +661,6 @@ sub test_view
             testv $opts, "./compare_sam.pl -Baux $md $sam $jsam";
         }
 
-        # embed_ref=2 mode
-        my $ersam = "ce#1000.sam";
-        my $ercram = "ce#1000_er.tmp.cram";
-        my $ersam2 = "${ercram}.sam";
-        testv $opts, "./test_view $tv_args -C -p $ercram $ersam";
-        testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
-        testv $opts, "./compare_sam.pl $ersam $ersam2";
-
         if ($test_view_failures == 0)
         {
             passed($opts, "$sam conversions");
@@ -655,6 +671,21 @@ sub test_view
         }
     }
 
+    # embed_ref=2 mode
+    print "test_view testing embed_ref=2:\n";
+    $test_view_failures = 0;
+    my $ersam = "ce#1000.sam";
+    my $ercram = "ce#1000_er.tmp.cram";
+    my $ersam2 = "${ercram}.sam";
+    testv $opts, "./test_view $tv_args -C -p $ercram $ersam";
+    testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
+    testv $opts, "./compare_sam.pl $ersam $ersam2";
+    if ($test_view_failures == 0) {
+        passed($opts, "embed_ref=2 tests");
+    } else {
+        failed($opts, "embed_ref=2 tests", "$test_view_failures subtests failed");
+    }
+
     # BAM and CRAM range queries on prebuilt BAM and CRAM
     # The cram file has @SQ UR: set to point to an invalid location to
     # force the reference to be reloaded from the one given on the
@@ -916,6 +947,11 @@ sub test_vcf_various
         cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf");
     test_cmd($opts, %args, out => "vcf_meta_meta.vcf",
         cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf_meta_meta.vcf");
+
+    # VCF file with contig IDX=1, simulating an edited BCF file
+    # See htslib issue 1534
+    test_cmd($opts, %args, out => "modhdr.expected.vcf",
+        cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2");
 }
 
 sub write_multiblock_bgzf {
diff --git a/test/test_bgzf.c b/test/test_bgzf.c
index 90ec167ac..a5084e6c6 100644
--- a/test/test_bgzf.c
+++ b/test/test_bgzf.c
@@ -37,6 +37,7 @@ DEALINGS IN THE SOFTWARE.
 
 #include "../htslib/bgzf.h"
 #include "../htslib/hfile.h"
+#include "../htslib/hts_log.h"
 #include "../hfile_internal.h"
 
 const char *bgzf_suffix = ".gz";
@@ -159,13 +160,19 @@ static BGZF * try_bgzf_hopen(const char *name, const char *mode,
     return bgz;
 }
 
-static int try_bgzf_close(BGZF **bgz, const char *name, const char *func) {
+static int try_bgzf_close(BGZF **bgz, const char *name, const char *func, int expected_fail) {
     BGZF *to_close = *bgz;
     *bgz = NULL;
     if (bgzf_close(to_close) != 0) {
-        fprintf(stderr, "%s : bgzf_close failed on %s : %s\n",
-                func, name, strerror(errno));
+        if (!expected_fail)
+            fprintf(stderr, "%s : bgzf_close failed on %s%s%s\n",
+                    func, name,
+                    errno ? " : " : "",
+                    errno ? strerror(errno) : "");
         return -1;
+    } else if (expected_fail) {
+        fprintf(stderr, "%s : bgzf_close worked on %s, but expected failure\n",
+                func, name);
     }
     return 0;
 }
@@ -398,6 +405,7 @@ static int test_read(Files *f) {
     ssize_t bg_got, f_got;
     unsigned char bg_buf[BUFSZ], f_buf[BUFSZ];
 
+    errno = 0;
     bgz = try_bgzf_open(f->src_bgzf, "r", __func__);
     if (!bgz) return -1;
 
@@ -414,7 +422,7 @@ static int test_read(Files *f) {
         }
     } while (bg_got > 0 && f_got > 0);
 
-    if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) return -1;
+    if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) return -1;
     if (try_fseek_start(f->f_plain, f->src_plain, __func__) != 0) return -1;
 
     return 0;
@@ -449,7 +457,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method,
     bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__);
     if (bg_put < 0) goto fail;
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     switch (method) {
     case USE_BGZF_DOPEN:
@@ -491,7 +499,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method,
         goto fail;
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     return 0;
 
@@ -521,7 +529,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) {
     bg_put = try_bgzf_write(bgz, f->text, half, f->tmp_bgzf, __func__);
     if (bg_put < 0) goto fail;
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
 
     // Write second half.  Append mode, so an EOF block should be in the
@@ -535,7 +543,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) {
                             __func__);
     if (bg_put < 0) goto fail;
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     // Try reading
     pos = 0;
@@ -564,7 +572,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) {
         goto fail;
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     return 0;
 
@@ -601,7 +609,7 @@ static int test_index_load_dump(Files *f) {
     } while (got_src > 0 && got_dest > 0);
     if (try_fclose(&fdest, f->tmp_idx, __func__) != 0) goto fail;
 
-    if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) goto fail;
 
     return 0;
 
@@ -624,7 +632,7 @@ static int test_check_EOF(char *name, int expected) {
         return -1;
     }
 
-    return try_bgzf_close(&bgz, name, __func__);
+    return try_bgzf_close(&bgz, name, __func__, 0);
 }
 
 static int test_index_useek_getc(Files *f, const char *mode,
@@ -651,7 +659,7 @@ static int test_index_useek_getc(Files *f, const char *mode,
         }
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__);
     if (!bgz) goto fail;
@@ -710,7 +718,7 @@ static int test_index_useek_getc(Files *f, const char *mode,
         }
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     return 0;
 
@@ -741,7 +749,7 @@ static int test_tell_seek_getc(Files *f, const char *mode,
         if (bg_put < 0) goto fail;
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__);
     if (!bgz) goto fail;
@@ -811,7 +819,7 @@ static int test_tell_seek_getc(Files *f, const char *mode,
         }
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     return 0;
 
@@ -841,7 +849,7 @@ static int test_tell_read(Files *f, const char *mode) {
         if (bg_put < 0) goto fail;
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__);
     if (!bgz) goto fail;
@@ -859,7 +867,7 @@ static int test_tell_read(Files *f, const char *mode) {
         }
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
     free(bg_buf);
     return 0;
 
@@ -885,11 +893,13 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) {
     bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__);
     if (bg_put < 0) goto fail;
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
 
     bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__);
     if (!bgz) goto fail;
 
+    if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail;
+
     for (pos = 0; pos < f->ltext; ) {
         const char *end = strchr(text + pos, '\n');
         size_t l = end ? end - (text + pos) : f->ltext - pos;
@@ -909,12 +919,13 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) {
                     "Got      : %.*s\n",
                     __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos,
                     (int) str.l, str.s);
+            goto fail;
         }
 
         pos += l + 1;
     }
 
-    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
     free(ks_release(&str));
     return 0;
 
@@ -924,6 +935,98 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) {
     return -1;
 }
 
+static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int nthreads) {
+    BGZF* bgz = NULL;
+    ssize_t bg_put;
+    size_t pos;
+    kstring_t str = { 0, 0, NULL };
+    const char *text = (const char *) f->text;
+
+    // Turn off bgzf errors as they're expected.
+    enum htsLogLevel lvl = hts_get_log_level();
+    hts_set_log_level(HTS_LOG_OFF);
+
+    bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__);
+    if (!bgz) goto fail;
+
+    if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail;
+
+    const char *text_line2 = strchr(text, '\n') + 1;
+    bg_put = try_bgzf_write(bgz, text, text_line2 - text, f->tmp_bgzf, __func__);
+    if (bg_put < 0) goto fail;
+    if (bgzf_flush(bgz) < 0) goto fail;
+    int64_t block2_start = bgz->block_address;
+
+    const char *text_line3 = strchr(text_line2, '\n') + 1;
+    bg_put = try_bgzf_write(bgz, text_line2, text_line3 - text_line2, f->tmp_bgzf, __func__);
+    if (bg_put < 0) goto fail;
+    if (bgzf_flush(bgz) < 0) goto fail;
+    int64_t block3_start = bgz->block_address;
+
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
+
+    int64_t newsize;
+    for(newsize = block3_start - 1; newsize > block2_start; newsize--) {
+        //fprintf(stderr, "test_bgzf_getline_on_truncated_file : size truncated to %" PRId64 " with threads %d\n", newsize, nthreads);
+
+        if (truncate(f->tmp_bgzf, newsize) != 0) goto fail;
+
+        bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__);
+        if (!bgz) goto fail;
+
+        if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail;
+
+        for (pos = 0; pos < f->ltext; ) {
+            const char *end = strchr(text + pos, '\n');
+            size_t l = end ? end - (text + pos) : f->ltext - pos;
+
+            int res = bgzf_getline(bgz, '\n', &str);
+            if (res < -1) {
+                // ok, we expect error from truncated file
+                break;
+            } else if (res == -1) {
+                // truncated file should never return EOF since we do not truncate at block boundary
+                fprintf(stderr, "%s : %s from bgzf_getline on %s\n",
+                        __func__, "Unexpected EOF",
+                        f->tmp_bgzf);
+                goto fail;
+            }
+
+            if (str.l != l || memcmp(text + pos, str.s, l) != 0) {
+                fprintf(stderr,
+                        "%s : Unexpected data from bgzf_getline on %s\n"
+                        "Expected : %.*s\n"
+                        "Got      : %.*s\n",
+                        __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos,
+                        (int) str.l, str.s);
+                goto fail;
+            }
+            pos += l + 1;
+        }
+
+        // verify error is persistent
+        int k;
+        for(k = 0; k < 3; k++) {
+            int res = bgzf_getline(bgz, '\n', &str);
+            if (res > -2) {
+                fprintf(stderr, "%s : unexpected bgzf_getline result %d\n", __func__, res);
+                goto fail;
+            }
+        }
+        // closing a stream with error returns error
+        if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 1) == 0) goto fail;
+    }
+    free(ks_release(&str));
+    hts_set_log_level(lvl);
+    return 0;
+
+ fail:
+    hts_set_log_level(lvl);
+    if (bgz) bgzf_close(bgz);
+    free(ks_release(&str));
+    return -1;
+}
+
 int main(int argc, char **argv) {
     Files f = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0 };
     int retval = EXIT_FAILURE;
@@ -1000,6 +1103,10 @@ int main(int argc, char **argv) {
     if (test_bgzf_getline(&f, "w", 1) != 0) goto out;
     if (test_bgzf_getline(&f, "w", 2) != 0) goto out;
 
+    if (test_bgzf_getline_on_truncated_file(&f, "w", 0) != 0) goto out;
+    if (test_bgzf_getline_on_truncated_file(&f, "w", 1) != 0) goto out;
+    if (test_bgzf_getline_on_truncated_file(&f, "w", 2) != 0) goto out;
+
     retval = EXIT_SUCCESS;
 
  out:
diff --git a/test/test_faidx.c b/test/test_faidx.c
new file mode 100644
index 000000000..566149071
--- /dev/null
+++ b/test/test_faidx.c
@@ -0,0 +1,516 @@
+/* test/test_fadix.c -- Test faidx interfaces
+
+    Copyright (C) 2022 Genome Research Ltd.
+
+    Author: Rob Davies <rmd@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include "../htslib/faidx.h"
+
+int file_compare(const char *file1, const char *file2) {
+    FILE *f1 = NULL;
+    FILE *f2 = NULL;
+    unsigned int lno = 1;
+    size_t got1, got2, i;
+    char buf1[1024], buf2[1024];
+    int ret = -1;
+
+    f1 = fopen(file1, "rb");
+    if (!f1) {
+        perror(file1);
+        goto out;
+    }
+    f2 = fopen(file2, "rb");
+    if (!f2) {
+        perror(file2);
+        goto out;
+    }
+
+    do {
+        got1 = fread(buf1, 1, sizeof(buf1), f1);
+        got2 = fread(buf2, 1, sizeof(buf2), f2);
+
+        for (i = 0; i < got1 && i < got2 && buf1[i] == buf2[i]; i++)
+            lno += (buf1[i] == '\n');
+        if (i < got1 || i < got2) {
+            fprintf(stderr, "%s and %s differ at line %u\n",
+                    file1, file2, lno);
+            goto out;
+        }
+    } while (got1 > 0 && got2 > 0);
+
+    if (ferror(f1)) {
+        perror(file1);
+        goto out;
+    }
+    if (ferror(f2)) {
+        perror(file2);
+        goto out;
+    }
+
+    if (got1 > 0 || got2 > 0) {
+        fprintf(stderr, "EOF on %s at line %u\n",
+                got1 ? file2 : file1, lno);
+        goto out;
+    }
+
+    ret = 0;
+ out:
+    if (f1) fclose(f1);
+    if (f2) fclose(f2);
+    return ret;
+}
+
+faidx_t * load_index(const char *fn, const char *fnfai, const char *fngzi,
+                     int flags, enum fai_format_options format) {
+    faidx_t *fai = fai_load3_format(fn, fnfai, fngzi, flags, format);
+    if (!fai) {
+        fprintf(stderr, "Failed: fai_load3(%s, %s, %s, %d, %d)\n",
+                fn, fnfai ? fnfai : "NULL", fngzi ? fngzi : "NULL", flags,
+                (int) format);
+        return NULL;
+    }
+    return fai;
+}
+
+int do_retrieval(const char *fn, const char *fnfai, const char *fngzi,
+                 int flags, enum fai_format_options format, const char *fnout,
+                 const char *interface, int nreg, char **regions) {
+    int i, use_64bit = 1, use_parse_reg = 0, use_adjust_reg = 0;
+    faidx_t *fai = NULL;
+    FILE *out = stdout;
+
+    if (interface) {
+        if (strcmp(interface, "fai_fetch") == 0) {
+            use_64bit = 0;
+        } else if (strcmp(interface, "faidx_fetch_seq") == 0) {
+            use_64bit = 0;
+            use_parse_reg = 1;
+        } else if (strcmp(interface, "faidx_fetch_seq64") == 0
+                   || strcmp(interface, "fai_parse_region") == 0) {
+            use_parse_reg = 1;
+        } else if (strcmp(interface, "fai_adjust_region") == 0) {
+            use_parse_reg = 1;
+            use_adjust_reg = 1;
+        }
+    }
+
+    if (fnout) {
+        out = fopen(fnout, "wb");
+        if (!out) {
+            perror(fnout);
+            return -1;
+        }
+    }
+
+    fai = load_index(fn, fnfai, fngzi, flags, format);
+    if (!fai)
+        goto fail;
+
+    for (i = 0; i < nreg; i++) {
+        hts_pos_t len = 0, pos, beg = 0, end = 0;
+        int tid = 0;
+        char *seq = NULL;
+        size_t l;
+
+        if (use_parse_reg) {
+            const char *e = fai_parse_region(fai, regions[i],
+                                             &tid, &beg, &end, 0);
+            if (e == NULL) {
+                fprintf(stderr, "Failed: "
+                        "fai_parse_region(fai, %s, &tid, &beg, &end, 0)\n",
+                        regions[i]);
+                goto fail;
+            }
+            if (use_adjust_reg) {
+                hts_pos_t orig_beg = beg, orig_end = end;
+                int r = fai_adjust_region(fai, tid, &beg, &end);
+                if (r < 0
+                    || (((r & 1) != 0) ^ (beg != orig_beg))
+                    || (((r & 2) != 0) ^ (end != orig_end))) {
+                    fprintf(stderr, "Failed: fai_adjust_region(fai, %d, "
+                            "%"PRIhts_pos", %"PRIhts_pos") returned %d\n"
+                            "After: beg = %"PRIhts_pos" end = %"PRIhts_pos"\n",
+                            tid, orig_beg, orig_end, r, beg, end);
+                    goto fail;
+                }
+            }
+            if (use_64bit) {
+                seq = faidx_fetch_seq64(fai, faidx_iseq(fai, tid),
+                                        beg, end - 1, &len);
+            } else {
+                int ilen = 0;
+                seq = faidx_fetch_seq(fai, faidx_iseq(fai, tid),
+                                      beg, end - 1, &ilen);
+                len = ilen;
+            }
+            if (!seq) {
+                fprintf(stderr, "Failed: faidx_fetch_seq%s(fai, %s, "
+                        "%"PRIhts_pos", %"PRIhts_pos", &len)\n",
+                        use_64bit ? "64" : "", faidx_iseq(fai, tid), beg, end);
+                goto fail;
+            }
+        } else {
+            if (use_64bit) {
+                seq = fai_fetch64(fai, regions[i], &len);
+            } else {
+                int ilen = 0;
+                seq = fai_fetch(fai, regions[i], &ilen);
+                len = ilen;
+            }
+            if (!seq) {
+                fprintf(stderr, "Failed: fai_fetch%s(fai, %s, &len)\n",
+                        use_64bit ? "64" : "", regions[i]);
+                goto fail;
+            }
+        }
+
+        l = strlen(seq);
+        fprintf(out, "%c%s length: %"PRIhts_pos"\n",
+                format == FAI_FASTQ ? '@' : '>', regions[i], len);
+        for (pos = 0; pos < l; pos += 50) {
+            fprintf(out, "%.*s\n", 50, seq + pos);
+        }
+        free(seq);
+        if (format == FAI_FASTQ) {
+            hts_pos_t qual_len = 0;
+            char *qual;
+            if (use_parse_reg) {
+                if (use_64bit) {
+                    qual = faidx_fetch_qual64(fai, faidx_iseq(fai, tid),
+                                              beg, end - 1, &qual_len);
+                } else {
+                    int ilen = 0;
+                    qual = faidx_fetch_qual(fai, faidx_iseq(fai, tid),
+                                            beg, end - 1, &ilen);
+                    qual_len = ilen;
+                }
+            } else {
+                if (use_64bit) {
+                    qual = fai_fetchqual64(fai, regions[i], &qual_len);
+                } else {
+                    int ilen = 0;
+                    qual = fai_fetchqual(fai, regions[i], &ilen);
+                    qual_len = ilen;
+                }
+                if (!qual) {
+                    fprintf(stderr, "Failed: fai_fetchqual64(fai, %s, &len)\n",
+                            regions[i]);
+                    goto fail;
+                }
+            }
+            if (qual_len != len) {
+                fprintf(stderr,
+                        "Sequence and quality lengths differ for %s %s\n",
+                        fn, regions[i]);
+                free(qual);
+                goto fail;
+            }
+            fprintf(out, "+\n");
+            l = strlen(qual);
+            for (pos = 0; pos < l; pos+=50) {
+                fprintf(out, "%.*s\n", 50, qual + pos);
+            }
+            free(qual);
+        }
+    }
+
+    fai_destroy(fai);
+
+    if (fnout) {
+        if (fclose(out) != 0) {
+            perror(fnout);
+            return -1;
+        }
+    }
+    return 0;
+
+ fail:
+    if (fai)
+        fai_destroy(fai);
+    if (fnout)
+        fclose(out);
+
+    return -1;
+}
+
+int test_fai_line_length(const char *fn, const char *fnfai, const char *fngzi,
+                         enum fai_format_options format, const char *expected,
+                         const char *reg) {
+    hts_pos_t found_len;
+    faidx_t *fai = NULL;
+
+    fai = load_index(fn, fnfai, fngzi, 0, format);
+    if (!fai)
+        return -1;
+
+    found_len = fai_line_length(fai, reg);
+    fai_destroy(fai);
+    if (expected) {
+        long long exp_len = strtoll(expected, NULL, 10);
+        if (found_len != exp_len) {
+            fprintf(stderr, "Unexpected result %"PRIhts_pos" from "
+                    "fai_line_length, expected %s\n", found_len, expected);
+            return -1;
+        }
+    } else {
+        printf("%"PRIhts_pos"\n", found_len);
+    }
+    return 0;
+}
+
+int test_faidx_has_seq(const char *fn, const char *fnfai, const char *fngzi,
+                       enum fai_format_options format, const char *expected,
+                       const char *seq) {
+    int res;
+    faidx_t *fai = NULL;
+
+    fai = load_index(fn, fnfai, fngzi, 0, format);
+    if (!fai)
+        return -1;
+
+    res = faidx_has_seq(fai, seq);
+    fai_destroy(fai);
+    if (expected) {
+        long exp_res = strtol(expected, NULL, 10);
+        if (res != exp_res) {
+            fprintf(stderr, "Unexpected result %d from faidx_has_seq(%s) "
+                    "expected %s\n", res, seq, expected);
+            return -1;
+        }
+    } else {
+        printf("%d\n", res);
+    }
+    return 0;
+}
+
+int test_faidx_iseq(const char *fn, const char *fnfai, const char *fngzi,
+                    enum fai_format_options format, const char *expected,
+                    const char *index) {
+    const char *found_name = NULL;
+    int idx = atoi(index);
+    faidx_t *fai = NULL;
+
+    fai = load_index(fn, fnfai, fngzi, 0, format);
+    if (!fai)
+        return -1;
+
+    found_name = faidx_iseq(fai, idx);
+
+    if (expected) {
+        if (!found_name || strcmp(found_name, expected) != 0) {
+            fprintf(stderr, "Unexpected result %s from faidx_iseq(fai, %d), "
+                    "expected %s\n", found_name ? found_name : "(null)",
+                    idx, expected);
+            fai_destroy(fai);
+            return -1;
+        }
+    } else {
+        printf("%s\n", found_name ? found_name : "(null)");
+    }
+
+    fai_destroy(fai);
+    return 0;
+}
+
+int test_faidx_seq_len(const char *fn, const char *fnfai, const char *fngzi,
+                       enum fai_format_options format, const char *expected,
+                       const char *seq) {
+    int found_len;
+    faidx_t *fai = NULL;
+
+    fai = load_index(fn, fnfai, fngzi, 0, format);
+    if (!fai)
+        return -1;
+
+    found_len = faidx_seq_len(fai, seq);
+    fai_destroy(fai);
+
+    if (expected) {
+        int exp_len = atoi(expected);
+        if (found_len != exp_len) {
+            fprintf(stderr, "Unexpected result %d from faidx_seq_len(fai, %s) "
+                    "expected %s\n", found_len, seq, expected);
+            return -1;
+        }
+    } else {
+        printf("%d\n", found_len);
+    }
+
+    return 0;
+}
+
+int test_faidx_seq_len64(const char *fn, const char *fnfai, const char *fngzi,
+                         enum fai_format_options format, const char *expected,
+                         const char *seq) {
+    hts_pos_t found_len;
+    faidx_t *fai = NULL;
+
+    fai = load_index(fn, fnfai, fngzi, 0, format);
+    if (!fai)
+        return -1;
+
+    found_len = faidx_seq_len(fai, seq);
+    fai_destroy(fai);
+
+    if (expected) {
+        long long exp_len = strtoll(expected, NULL, 10);
+        if (found_len != exp_len) {
+            fprintf(stderr, "Unexpected result %"PRIhts_pos
+                    " from fai_seq_len64(fai, %s) expected %s\n",
+                    found_len, seq, expected);
+            return -1;
+        }
+    } else {
+        printf("%"PRIhts_pos"\n", found_len);
+    }
+
+    return 0;
+}
+
+void usage(FILE *out, const char *arg0) {
+    fprintf(out,
+            "Usage: %s [-c] -i fasta/q [-f fai_file] [-g gzi_file] [-e expected_fai]\n"
+            "       %s [-cQ] -i fasta/q [-f fai_file] [-g gzi_file] [region]\n"
+            "       %s -t FUNC -i fasta/q [-f fai_file] [-g gzi_file] [-e expected] <PARAM>\n"
+            "       %s -h\n",
+            arg0, arg0, arg0, arg0);
+}
+
+void help(FILE *out, const char *arg0) {
+    usage(out, arg0);
+    fprintf(out,
+            "Options:\n"
+            "  -i FILE      Input file\n"
+            "  -f FILE      Fasta/q index file name\n"
+            "  -g FILE      Bgzip index file name\n"
+            "  -o FILE      Output file name\n"
+            "  -e FILE|STR  Expected output\n"
+            "  -c           Set FAI_CREATE flag\n"
+            "  -Q           Output fastq format\n"
+            "  -t FUNC      Test function\n"
+            "  -h           Print this help\n"
+            "\n"
+            "Expected output is compared to the FAI file in indexing mode;"
+            " the output file\n"
+            "in retrieval mode; "
+            "expected output for various -t function tests.\n"
+            "\n"
+            "Unit tests (-t option):\n"
+            "   fai_line_length, faidx_has_seq, faidx_iseq, faidx_seq_len, faidx_seq_len64\n"
+            "In retrieval mode, -t can change the functions used to fetch data:\n"
+            "   fai_fetch, fai_fetch64, faidx_fetch_seq, faidx_fetch_seq64,\n"
+            "   fai_parse_region, fai_adjust_region\n"
+            "\n");
+}
+
+int main(int argc, char **argv) {
+    int opt;
+    const char *fn = NULL;
+    const char *fnout = NULL;
+    const char *fnfai = NULL;
+    const char *fngzi = NULL;
+    const char *expected = NULL;
+    const char *func = "";
+    int flags = 0;
+    enum fai_format_options format = FAI_FASTA;
+    int res;
+
+    while ((opt = getopt(argc, argv, "i:f:g:o:e:t:cQh")) > 0) {
+        switch (opt) {
+        case 'i':
+            fn = optarg;
+            break;
+        case 'f':
+            fnfai = optarg;
+            break;
+        case 'g':
+            fngzi = optarg;
+            break;
+        case 'o':
+            fnout = optarg;
+            break;
+        case 'e':
+            expected = optarg;
+            break;
+        case 'c':
+            flags |= FAI_CREATE;
+            break;
+        case 'Q':
+            format = FAI_FASTQ;
+            break;
+        case 't':
+            func = optarg;
+            break;
+        case 'h':
+            help(stdout, argv[0]);
+            return EXIT_SUCCESS;
+        default:
+            usage(stderr, argv[0]);
+            return EXIT_FAILURE;
+        }
+    }
+
+    if (!fn) {
+        usage(stderr, argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    if (optind == argc) {
+        // Index building mode
+        res = fai_build3(fn, fnfai, fngzi);
+        if (res) {
+            fprintf(stderr, "Failed: fai_build3(%s, %s, %s)\n",
+                    fn, fnfai ? fnfai : "NULL", fngzi ? fngzi : "NULL");
+        } else if (expected) {
+            res = file_compare(fnfai, expected);
+        }
+    } else {
+        if (strcmp(func, "fai_line_length") == 0) {
+            res = test_fai_line_length(fn, fnfai, fngzi, format, expected,
+                                       argv[optind]);
+        } else if (strcmp(func, "faidx_has_seq") == 0) {
+            res = test_faidx_has_seq(fn, fnfai, fngzi, format, expected,
+                                     argv[optind]);
+        } else if (strcmp(func, "faidx_iseq") == 0) {
+            res = test_faidx_iseq(fn, fnfai, fngzi, format, expected,
+                                  argv[optind]);
+        } else if (strcmp(func, "faidx_seq_len") == 0) {
+            res = test_faidx_seq_len(fn, fnfai, fngzi, format, expected,
+                                     argv[optind]);
+        } else if (strcmp(func, "faidx_seq_len64") == 0) {
+            res = test_faidx_seq_len64(fn, fnfai, fngzi, format, expected,
+                                       argv[optind]);
+        } else {
+            res =  do_retrieval(fn, fnfai, fngzi, flags, format, fnout,
+                                func, argc - optind, &argv[optind]);
+            if (res == 0 && fnout && expected) {
+                res = file_compare(fnout, expected);
+            }
+        }
+    }
+    return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/test_view.c b/test/test_view.c
index 416ffe98d..f33c1cdf0 100644
--- a/test/test_view.c
+++ b/test/test_view.c
@@ -224,7 +224,8 @@ int vcf_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in,
             hts_itr_t *iter;
             if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) {
                 fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
-                continue;
+                exit_code = 1;
+                break;
             }
             while ((r = bcf_itr_next(in, iter, b)) >= 0) {
                 if (!opts->benchmark && bcf_write1(out, h, b) < 0) {
diff --git a/vcf.c b/vcf.c
index 56af63054..59d433c19 100644
--- a/vcf.c
+++ b/vcf.c
@@ -1,7 +1,7 @@
 /*  vcf.c -- VCF/BCF API functions.
 
     Copyright (C) 2012, 2013 Broad Institute.
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2023 Genome Research Ltd.
     Portions copyright (C) 2014 Intel Corporation.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -51,6 +51,10 @@ DEALINGS IN THE SOFTWARE.  */
 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
 typedef khash_t(vdict) vdict_t;
 
+KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
+typedef khash_t(hdict) hdict_t;
+
+
 #include "htslib/kseq.h"
 HTSLIB_EXPORT
 uint32_t bcf_float_missing    = 0x7F800001;
@@ -79,6 +83,22 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N
 #define BCF_IS_64BIT (1<<30)
 
 
+// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
+// Note that this preserving API and ABI requires that the first element is vdict_t struct
+// rather than a pointer, as user programs may (and in some cases do) access the dictionary
+// directly as (vdict_t*)hdr->dict.
+typedef struct
+{
+    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
+    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
+}
+bcf_hdr_aux_t;
+
+static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
+{
+    return (bcf_hdr_aux_t *)hdr->dict[0];
+}
+
 static char *find_chrom_header_line(char *s)
 {
     char *nl;
@@ -93,9 +113,6 @@ static char *find_chrom_header_line(char *s)
 
 static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
 {
-    if ( !s ) return 0;
-    if (len == 0) len = strlen(s);
-
     const char *ss = s;
     while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
     if ( !*ss || ss - s == len)
@@ -140,7 +157,12 @@ static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
 
 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
 {
-    return bcf_hdr_add_sample_len(h, s, 0);
+    if (!s) {
+        // Allowed for backwards-compatibility, calling with s == NULL
+        // used to trigger bcf_hdr_sync(h);
+        return 0;
+    }
+    return bcf_hdr_add_sample_len(h, s, strlen(s));
 }
 
 int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
@@ -864,8 +886,45 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
     return 1;
 }
 
+int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
+{
+    // currently only for bcf_hdr_set_version
+    assert( hrec->type==BCF_HL_GEN );
+    int ret;
+    khint_t k;
+    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
+    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
+    {
+        if ( !kh_exist(aux->gen,k) ) continue;
+        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
+        break;
+    }
+    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
+    free((char*)kh_key(aux->gen,k));
+    kh_del(hdict,aux->gen,k);
+    kstring_t str = {0,0,0};
+    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
+    {
+        free(str.s);
+        return -1;
+    }
+    k = kh_put(hdict, aux->gen, str.s, &ret);
+    if ( ret<0 )
+    {
+        free(str.s);
+        return -1;
+    }
+    free(hrec->value);
+    hrec->value = strdup(tmp->value);
+    if ( !hrec->value ) return -1;
+    return 0;
+}
+
 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
 {
+    kstring_t str = {0,0,0};
+    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
+
     int res;
     if ( !hrec ) return 0;
 
@@ -883,19 +942,49 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
         }
 
         // Is one of the generic fields and already present?
-        int i;
-        for (i=0; i<hdr->nhrec; i++)
+        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
+        {
+            free(str.s);
+            return -1;
+        }
+        khint_t k = kh_get(hdict, aux->gen, str.s);
+        if ( k != kh_end(aux->gen) )
+        {
+            // duplicate record
+            bcf_hrec_destroy(hrec);
+            free(str.s);
+            return 0;
+        }
+    }
+
+    int i;
+    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
+    {
+        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
         {
-            if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
-            if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
-            if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
+            free(str.s);
+            return -1;
         }
-        if ( i<hdr->nhrec )
+        khint_t k = kh_get(hdict, aux->gen, str.s);
+        if ( k != kh_end(aux->gen) )
         {
+            // duplicate record
             bcf_hrec_destroy(hrec);
+            free(str.s);
             return 0;
         }
     }
+    if ( str.s )
+    {
+        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
+        if ( res<0 )
+        {
+            bcf_hrec_destroy(hrec);
+            free(str.s);
+            return -1;
+        }
+        kh_val(aux->gen,k) = hrec;
+    }
 
     // New record, needs to be added
     int n = hdr->nhrec + 1;
@@ -909,29 +998,47 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
     return hrec->type==BCF_HL_GEN ? 0 : 1;
 }
 
-/*
- *  Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
- *  the STR,GEN lines are searched for linearly in a linked list of all header lines.
- *  This may become a problem for VCFs with huge headers, we might need to build a
- *  dictionary for these lines as well.
- */
 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
 {
     int i;
     if ( type==BCF_HL_GEN )
     {
+        // e.g. ##fileformat=VCFv4.2
+        //      ##source=GenomicsDBImport
+        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
+        if ( value )
+        {
+            kstring_t str = {0,0,0};
+            ksprintf(&str, "##%s=%s", key,value);
+            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
+            khint_t k = kh_get(hdict, aux->gen, str.s);
+            free(str.s);
+            if ( k == kh_end(aux->gen) ) return NULL;
+            return kh_val(aux->gen, k);
+        }
         for (i=0; i<hdr->nhrec; i++)
         {
             if ( hdr->hrec[i]->type!=type ) continue;
             if ( strcmp(hdr->hrec[i]->key,key) ) continue;
-            if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i];
+            return hdr->hrec[i];
         }
         return NULL;
     }
     else if ( type==BCF_HL_STR )
     {
-        if (!str_class)
-            return NULL;
+        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
+        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
+        if (!str_class) return NULL;
+        if ( !strcmp("ID",key) )
+        {
+            kstring_t str = {0,0,0};
+            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
+            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
+            khint_t k = kh_get(hdict, aux->gen, str.s);
+            free(str.s);
+            if ( k == kh_end(aux->gen) ) return NULL;
+            return kh_val(aux->gen, k);
+        }
         for (i=0; i<hdr->nhrec; i++)
         {
             if ( hdr->hrec[i]->type!=type ) continue;
@@ -1068,6 +1175,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
     bcf_hrec_t *hrec;
     if ( !key )
     {
+        // no key, remove all entries of this type
         while ( i<hdr->nhrec )
         {
             if ( hdr->hrec[i]->type!=type ) { i++; continue; }
@@ -1183,14 +1291,19 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
     {
         int len;
         kstring_t str = {0,0,0};
-        ksprintf(&str,"##fileformat=%s", version);
+        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
         hrec = bcf_hdr_parse_line(hdr, str.s, &len);
         free(str.s);
     }
     else
     {
-        free(hrec->value);
-        hrec->value = strdup(version);
+        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
+        if ( !tmp ) return -1;
+        free(tmp->value);
+        tmp->value = strdup(version);
+        if ( !tmp->value ) return -1;
+        bcf_hdr_update_hrec(hdr, hrec, tmp);
+        bcf_hrec_destroy(tmp);
     }
     hdr->dirty = 1;
     return 0; // FIXME: check for errs in this function (return < 0 if so)
@@ -1204,6 +1317,14 @@ bcf_hdr_t *bcf_hdr_init(const char *mode)
     if (!h) return NULL;
     for (i = 0; i < 3; ++i)
         if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
+
+    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
+    if ( !aux ) goto fail;
+    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
+    aux->dict = *((vdict_t*)h->dict[0]);
+    free(h->dict[0]);
+    h->dict[0] = aux;
+
     if ( strchr(mode,'w') )
     {
         bcf_hdr_append(h, "##fileformat=VCFv4.2");
@@ -1229,6 +1350,13 @@ void bcf_hdr_destroy(bcf_hdr_t *h)
         if (d == 0) continue;
         for (k = kh_begin(d); k != kh_end(d); ++k)
             if (kh_exist(d, k)) free((char*)kh_key(d, k));
+        if ( i==0 )
+        {
+            bcf_hdr_aux_t *aux = get_hdr_aux(h);
+            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
+                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
+            kh_destroy(hdict, aux->gen);
+        }
         kh_destroy(vdict, d);
         free(h->id[i]);
     }
@@ -1974,7 +2102,8 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
         // header.  At this point, the header must have been printed,
         // proceeding would lead to a broken BCF file. Errors must be checked
         // and cleared by the caller before we can proceed.
-        hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos, v->errcode, bcf_seqname_safe(h,v), v->pos+1);
+        char errdescription[1024] = "";
+        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
         return -1;
     }
     bcf1_sync(v);   // check if the BCF record was modified
@@ -2211,20 +2340,44 @@ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
 {
     vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
-    int tid, m = kh_size(d);
+    int i, tid, m = kh_size(d);
     const char **names = (const char**) calloc(m,sizeof(const char*));
+    if ( !names )
+    {
+        hts_log_error("Failed to allocate memory");
+        *n = 0;
+        return NULL;
+    }
     khint_t k;
     for (k=kh_begin(d); k<kh_end(d); k++)
     {
         if ( !kh_exist(d,k) ) continue;
+        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
         tid = kh_val(d,k).id;
-        assert( tid<m );
+        if ( tid >= m )
+        {
+            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
+            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
+            {
+                hts_log_error("Failed to allocate memory");
+                *n = 0;
+                free(names);
+                return NULL;
+            }
+            m = tid + 1;
+        }
         names[tid] = kh_key(d,k);
     }
-    // sanity check: there should be no gaps
-    for (tid=0; tid<m; tid++)
-        assert(names[tid]);
-    *n = m;
+    // ensure there are no gaps
+    for (i=0,tid=0; tid<m; i++,tid++)
+    {
+        while ( tid<m && !names[tid] ) tid++;
+        if ( tid==m ) break;
+        if ( i==tid ) continue;
+        names[i] = names[tid];
+        names[tid] = 0;
+    }
+    *n = i;
     return names;
 }
 
@@ -2485,7 +2638,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
             if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
 
             k = kh_get(vdict, d, t);
-            v->errcode = BCF_ERR_TAG_UNDEF;
+            v->errcode |= BCF_ERR_TAG_UNDEF;
             if (res || k == kh_end(d)) {
                 hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
                 v->errcode |= BCF_ERR_TAG_INVALID;
@@ -2915,7 +3068,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
             if (res < 0) bcf_hrec_destroy(hrec);
             if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
             k = kh_get(vdict, d, key);
-            v->errcode = BCF_ERR_TAG_UNDEF;
+            v->errcode |= BCF_ERR_TAG_UNDEF;
             if (res || k == kh_end(d)) {
                 hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
                 v->errcode |= BCF_ERR_TAG_INVALID;
@@ -3410,7 +3563,7 @@ int vcf_write_line(htsFile *fp, kstring_t *line)
 
 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
 {
-    int ret;
+    ssize_t ret;
     fp->line.l = 0;
     if (vcf_format1(h, v, &fp->line) != 0)
         return -1;
@@ -3815,7 +3968,8 @@ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
     int i;
     if ( line->errcode )
     {
-        hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_seqname_safe(src_hdr,line), line->pos+1);
+        char errordescription[1024] = "";
+        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
         exit(1);
     }
     if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
@@ -5047,3 +5201,75 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v
     #undef BRANCH
     return nsmpl*fmt->n;
 }
+
+//error description structure definition
+typedef struct err_desc {
+    int  errorcode;
+    const char *description;
+}err_desc;
+
+// error descriptions
+static const err_desc errdesc_bcf[] = {
+    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
+    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
+    { BCF_ERR_NCOLS, "Incorrect number of columns" },
+    { BCF_ERR_LIMITS, "Limits reached" },
+    { BCF_ERR_CHAR, "Invalid character" },
+    { BCF_ERR_CTG_INVALID, "Invalid contig" },
+    { BCF_ERR_TAG_INVALID, "Invalid tag" },
+};
+
+/// append given description to buffer based on available size and add ... when not enough space
+    /** @param buffer       buffer to which description to be appended
+        @param offset       offset at which to be appended
+        @param maxbuffer    maximum size of the buffer
+        @param description  the description to be appended
+on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
+on success returns 0
+    */
+static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
+
+    if (!description || !buffer || !offset || (maxbuffer < 4))
+        return -1;
+
+    size_t rembuffer = maxbuffer - *offset;
+    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
+        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
+    } else {    //not enough space for description, put ...
+        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
+        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
+        return -1;
+    }
+    return 0;
+}
+
+//get description for given error code. return NULL on error
+const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
+    size_t usedup = 0;
+    int ret = 0;
+    int idx;
+
+    if (!buffer || maxbuffer < 4)
+        return NULL;           //invalid / insufficient buffer
+
+    if (!errorcode) {
+        buffer[0] = '\0';      //no error, set null
+        return buffer;
+    }
+
+    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
+        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
+            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
+            if (ret < 0)
+                break;         //not enough space, ... added, no need to continue
+
+            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
+        }
+    }
+
+    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
+        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
+    }
+    return buffer;
+}
+
diff --git a/version.sh b/version.sh
index 89d57fcf6..65d1ccae6 100755
--- a/version.sh
+++ b/version.sh
@@ -24,7 +24,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 # Master version, for use in tarballs or non-git source copies
-VERSION=1.16
+VERSION=1.17
 
 # If we have a git clone, then check against the current tag
 srcdir=${0%/version.sh}