diff --git a/.cirrus.yml b/.cirrus.yml index 3a7b910a5..dc93b071d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,8 @@ libdeflate_template: &LIBDEFLATE pushd "$HOME" git clone --depth 1 https://github.com/ebiggers/libdeflate.git pushd libdeflate - make -j 4 CFLAGS='-fPIC -O3' libdeflate.a + cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS='-g -O3 -fPIC' + cmake --build build --verbose popd popd fi @@ -27,7 +28,7 @@ compile_template: &COMPILE compile_script: | git submodule update --init --recursive if test "x$USE_LIBDEFLATE" = "xyes"; then - CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' + CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate/build" --with-libdeflate' else CONFIG_OPTS='--without-libdeflate' fi @@ -75,6 +76,13 @@ gcc_task: CFLAGS: -std=c99 -pedantic -Wformat=2 USE_LIBDEFLATE: yes + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + cmake + << : *LIBDEFLATE << : *COMPILE << : *TEST @@ -129,6 +137,7 @@ rocky_task: LC_ALL: C CIRRUS_CLONE_DEPTH: 1 USE_CONFIG: yes + CFLAGS: -std=gnu90 # NB: we could consider building a docker image with these # preinstalled and specifying that instead, to speed up testing. @@ -172,8 +181,8 @@ arm_ubuntu_task: macosx_task: name: macosx + clang - osx_instance: - image: monterey-base + macos_instance: + image: ghcr.io/cirruslabs/macos-ventura-base:latest environment: CC: clang @@ -187,8 +196,9 @@ macosx_task: USE_CONFIG: yes USE_LIBDEFLATE: yes - package_install_script: - - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git + package_install_script: | + HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git \ + cmake << : *LIBDEFLATE << : *COMPILE diff --git a/.gitattributes b/.gitattributes index a14bb82b1..5d9850bc7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -20,3 +20,7 @@ README.md export-ignore # Remove the text attribute from index_dos.sam, so that the line separators # for the test file don't get converted into Unix format. test/index_dos.sam -text + +# Remove the text attribute from various faidx test files +test/faidx/faidx*.fa* -text +test/faidx/fastqs*.fq* -text diff --git a/.gitignore b/.gitignore index 6b58e8439..1dafc3615 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,8 @@ shlib-exports-*.txt /bgzip /htsfile /tabix +/test/faidx/*.tmp* +/test/faidx/FAIL* /test/fieldarith /test/hfile /test/hts_endian @@ -59,6 +61,7 @@ shlib-exports-*.txt /test/test-bcf_set_variant_type /test/test_bgzf /test/test_expr +/test/test_faidx /test/test_index /test/test_introspection /test/test_kfunc diff --git a/INSTALL b/INSTALL index 2f6deacb2..dd2c3ec90 100644 --- a/INSTALL +++ b/INSTALL @@ -110,6 +110,7 @@ The 'make install' command installs the libraries, library header files, utilities, several manual pages, and a pkgconfig file to /usr/local. The installation location can be changed by configuring with --prefix=DIR or via 'make prefix=DIR install' (see Installation Locations below). +Shared library permissions can be set via e.g. 'make install LIB_PERM=755'. Configuration @@ -291,3 +292,13 @@ mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools mingw-w64-x86_64-tools-git (The last is only needed for building libraries compatible with MSVC.) + +HP-UX +----- + +HP-UX requires that shared libraries have execute permission. The +default for HTSlib is to install with permission 644 (read-write for +owner and read-only for group / other). This can be overridden by +setting the LIB_PERM variable at install time with: + + make install LIB_PERM=755 diff --git a/LICENSE b/LICENSE index cf2e9f82a..925d47b40 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2022 Genome Research Ltd. +Copyright (C) 2012-2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.] The Modified-BSD License -Copyright (C) 2012-2022 Genome Research Ltd. +Copyright (C) 2012-2023 Genome Research Ltd. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/Makefile b/Makefile index 374141898..3e95a0bef 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2022 Genome Research Ltd. +# Copyright (C) 2013-2023 Genome Research Ltd. # # Author: John Marshall # @@ -39,6 +39,7 @@ CFLAGS = -g -Wall -O2 -fvisibility=hidden EXTRA_CFLAGS_PIC = -fpic TARGET_CFLAGS = LDFLAGS = -fvisibility=hidden +VERSION_SCRIPT_LDFLAGS = -Wl,-version-script,$(srcprefix)htslib.map LIBS = $(htslib_default_libs) prefix = /usr/local @@ -58,7 +59,8 @@ MKDIR_P = mkdir -p INSTALL = install -p INSTALL_DATA = $(INSTALL) -m 644 INSTALL_DIR = $(MKDIR_P) -m 755 -INSTALL_LIB = $(INSTALL_DATA) +LIB_PERM = 644 +INSTALL_LIB = $(INSTALL) -m $(LIB_PERM) INSTALL_MAN = $(INSTALL_DATA) INSTALL_PROGRAM = $(INSTALL) @@ -80,6 +82,7 @@ BUILT_TEST_PROGRAMS = \ test/sam \ test/test_bgzf \ test/test_expr \ + test/test_faidx \ test/test_kfunc \ test/test_kstring \ test/test_mod \ @@ -140,8 +143,8 @@ LIBHTS_SOVERSION = 3 # is not strictly necessary and should be removed the next time # LIBHTS_SOVERSION is bumped (see #1144 and # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) -MACH_O_COMPATIBILITY_VERSION = 3.1.16 -MACH_O_CURRENT_VERSION = 3.1.16 +MACH_O_COMPATIBILITY_VERSION = 3.1.17 +MACH_O_CURRENT_VERSION = 3.1.17 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. @@ -160,12 +163,20 @@ show-version: @echo PACKAGE_VERSION = $(PACKAGE_VERSION) @echo NUMERIC_VERSION = $(NUMERIC_VERSION) +config_vars.h: override escape=$(subst ',\x27,$(subst ",\",$(subst \,\\,$(1)))) +config_vars.h: override hts_cc_escaped=$(call escape,$(CC)) +config_vars.h: override hts_cppflags_escaped=$(call escape,$(CPPFLAGS)) +config_vars.h: override hts_cflags_escaped=$(call escape,$(CFLAGS)) +config_vars.h: override hts_ldflags_escaped=$(call escape,$(LDFLAGS)) +config_vars.h: override hts_libs_escaped=$(call escape,$(LIBS)) + config_vars.h: - echo '#define HTS_CC "$(CC)"' > $@ - echo '#define HTS_CPPFLAGS "$(CPPFLAGS)"' >> $@ - echo '#define HTS_CFLAGS "$(CFLAGS)"' >> $@ - echo '#define HTS_LDFLAGS "$(LDFLAGS)"' >> $@ - echo '#define HTS_LIBS "$(LIBS)"' >> $@ + printf '#define HTS_CC "%s"\n#define HTS_CPPFLAGS "%s"\n#define HTS_CFLAGS "%s"\n#define HTS_LDFLAGS "%s"\n#define HTS_LIBS "%s"\n' \ + '$(hts_cc_escaped)' \ + '$(hts_cppflags_escaped)' \ + '$(hts_cflags_escaped)' \ + '$(hts_ldflags_escaped)' \ + '$(hts_libs_escaped)' > $@ .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so @@ -344,7 +355,7 @@ print-config: # file used at runtime (when $LD_LIBRARY_PATH includes the build directory). libhts.so: $(LIBHTS_OBJS:.o=.pico) - $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread + $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(VERSION_SCRIPT_LDFLAGS) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread ln -sf $@ libhts.so.$(LIBHTS_SOVERSION) # Similarly this also creates libhts.NN.dylib as a byproduct, so that programs @@ -494,7 +505,7 @@ htsfile: htsfile.o libhts.a tabix: tabix.o libhts.a $(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread -bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) +bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) @@ -583,12 +594,13 @@ check test: all $(HTSCODECS_TEST_TARGETS) fi test/test_bgzf test/bgziptest.txt test/test-parse-reg -t test/colons.bam + cd test/faidx && ./test-faidx.sh faidx.tst cd test/sam_filter && ./filter.sh filter.tst cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst cd test/fastq && ./test-fastq.sh cd test/base_mods && ./base-mods.sh base-mods.tst - REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq + REF_PATH=: test/sam test/ce.fa test/faidx/faidx.fa test/faidx/fastqs.fq test/test-regidx cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} @@ -622,6 +634,9 @@ test/test_bgzf: test/test_bgzf.o libhts.a test/test_expr: test/test_expr.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread +test/test_faidx: test/test_faidx.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread + test/test_kfunc: test/test_kfunc.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread @@ -739,6 +754,7 @@ test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_reg test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) +test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) @@ -789,7 +805,7 @@ header-exports.txt: test/header_syms.pl htslib/*.h test/header_syms.pl htslib/*.h | sort -u -o $@ shlib-exports-so.txt: libhts.so - nm -D -g libhts.so | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ + nm -D -g libhts.so | awk '$$2 == "T" { sub("@.*", "", $$3); print $$3 }' | sort -u -o $@ shlib-exports-dylib.txt: libhts.dylib nm -Ug libhts.dylib | awk '$$2 == "T" { sub("^_", "", $$3); print $$3 }' | sort -u -o $@ @@ -797,6 +813,31 @@ shlib-exports-dylib.txt: libhts.dylib shlib-exports-dll.txt: hts.dll.a nm -g hts.dll.a | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ +$(srcprefix)htslib.map: libhts.so + LC_ALL=C ; export LC_ALL; \ + curr_vers=`expr 'X$(PACKAGE_VERSION)' : 'X\([0-9]*\.[0-9.]*\)'` ; \ + last_vers=`awk '/^HTSLIB_[0-9](\.[0-9]+)+/ { lv = $$1 } END { print lv }' htslib.map` ; \ + if test "x$$curr_vers" = 'x' || test "x$$last_vers" = 'x' ; then \ + echo "Version check failed : $$curr_vers / $$las_vers" 1>&2 ; \ + exit 1 ; \ + fi && \ + if test "HTSLIB_$$curr_vers" = "$$last_vers" ; then \ + echo "Refusing to update $@ - HTSlib version not changed" 1>&2 ; \ + exit 1 ; \ + fi && \ + nm --with-symbol-versions -D -g libhts.so | awk '$$2 ~ /^[DGRT]$$/ && $$3 ~ /@@Base$$/ && $$3 !~ /^(_init|_fini|_edata)@@/ { sub(/@@Base$$/, ";", $$3); print " " $$3 }' > $@.tmp && \ + if [ -s $@.tmp ] ; then \ + cat $@ > $@.new.tmp && \ + printf '\n%s {\n' "HTSLIB_$$curr_vers" >> $@.new.tmp && \ + cat $@.tmp >> $@.new.tmp && \ + printf '} %s;\n' "$$last_vers" >> $@.new.tmp && \ + rm -f $@.tmp && \ + mv $@.new.tmp $@ ; \ + fi ; \ + else \ + rm -f $@.tmp ; \ + fi + install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi @@ -845,7 +886,9 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: - -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt + -rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \ + test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \ + header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt -rm -rf htscodecs/tests/test.out # Only remove this in git checkouts diff --git a/NEWS b/NEWS index 7ae851a47..f6c3de965 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,128 @@ +Noteworthy changes in release 1.17 (21st February 2023) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* A new API for iterating through a BAM record's aux field. + (PR#1354, addresses #1319. Thanks to John Marshall) + +* Text mode for bgzip. Allows bgzip to compress lines of text with block breaks + at newlines. + (PR#1493, thanks to Mike Lin for the initial version PR#1369) + +* Make tabix support CSI indices with large positions. Unlike SAM and VCF + files, BED files do not set a maximum reference length which hindered CSI + support. This change sets an arbitrary large size of 100G to enable it to + work. + (PR#1506) + +* Add a fai_line_length function. Exposes the internal line-wrap length. + (PR#1516) + +* Check for invalid barcode tags in fastq output. + (PR#1518, fixes samtools#1728. Reported by Poshi) + +* Warn if reference found in a CRAM file is not contained in the specified + reference file. + (PR#1517 and PR#1521, adds diagnostics for #1515. Reported by Wei WeiDeng) + +* Add a faidx_seq_len64 function that can return sequence lengths longer than + INT_MAX. At the same time limit faidx_seq_len to INT_MAX output. Also add a + fai_adjust_region to ensure given ranges do not go beyond the end of the + requested sequence. + (PR#1519) + +* Add a bcf_strerror function to give text descriptions of BCF errors. + (PR#1510) + +* Add CRAM SQ/M5 header checking when specifying a fasta file. This is to + prevent creating a CRAM that cannot be decoded again. + (PR#1522. In response to samtools#1748 though not a direct fix) + +* Improve support for very long input lines (> 2Gbyte). This is mostly useful + for tabix which does not do much interpretation of its input. + (PR#1542, a partial fix for #1539) + +* Speed up load_ref_portion. This function has been sped up by about 7x, which + speeds up low-depth CRAM decoding by about 10%. + (PR#1551) + +* Expand CRAM API to cope with new samtools cram_size command. + (PR#1546) + +* Merges neighbouring I and D ops into one op within pileup. This means + 4M1D1D1D3M is reported as 4M3D3M. Fixing this in sam.c means not only is + samtools mpileup now looking better, but any tool using the mpileup API will + be getting consistent results. + (PR#1552, fixes the last remaining part of samtools#139) + +* Update the API documentation for bgzf_mt as it refered to a previous + iteration. + (PR#1556, fixes #1553. Reported by Raghavendra Padmanabhan) + + +Build changes +------------- + +* Use POSIX grep in testing as egrep and fgrep are considered obsolete. + (PR#1509, thanks to David Seifert) + +* Switch to building libdefalte with cmake for Cirris CI. + (PR#1511) + +* Ensure strings in config_vars.h are escaped correctly. + (PR#1530, fixes #1527. Reported by Lucas Czech) + +* Easier modification of shared library permissions during install. + (PR#1532, fixes #1525. Reported by StephDC) + +* Fix build on ancient compilers. Added -std=gnu90 to build tests so older + C compilers will still be happy. + (PR#1524, fixes #1523. Reported by Martin Jakt) + +* Switch MacOS CI tests to an ARM-based image. + (PR#1536) + +* Cut down the number of embed_ref=2 tests that get run. + (PR#1537) + +* Add symbol versions to libhts.so. This is to aid package developers. + (PR#1560 addresses #1505, thanks to John Marshall. Reported by Stefan Bruens) + +* htscodecs now updated to v1.4.0. + (PR#1563) + +* Cleaned up misleading system error reports in test_bgzf. + (PR#1565) + +Bug fixes +--------- + +* VCF. Fix n-squared complexity in sample line with many adjacent tabs [fuzz]. + (PR#1503) + +* Improved bcftools detection and reporting of bgzf decode errors. + (PR#1504, thanks to Lilian Janin. PR#1529 thanks to Bergur Ragnarsson, fixes + #1528. PR#1554) + +* Prevent crash when the only FASTA entry has no sequence [fuzz]. + (PR#1507) + +* Fixed typo in sam.h documentation. + (PR#1512, thanks to kojix2) + +* Fix buffer read-overrun in bam_plp_insertion_mod. + (PR#1520) + +* Fix hash keys being left behind by bcf_hdr_remove. + (PR#1535, fixes #1533. Reported by Giulio Genovese in #842) + +* Make bcf_hdr_idinfo_exists more robust by checking id value exists. + (PR#1544, fixes #1538. Reported by Giulio Genovese) + +* CRAM improvements. Fixed crash with multi-threaded CRAM. Fixed a bug in the + codec parameter learning for CRAM 3.1 name tokeniser. Fixed Cram compression + container substitution matrix generation, + (PR#1558, PR#1559 and PR#1562) + Noteworthy changes in release 1.16 (18th August 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bgzf.c b/bgzf.c index a969b1567..468289106 100644 --- a/bgzf.c +++ b/bgzf.c @@ -989,6 +989,8 @@ int bgzf_read_block(BGZF *fp) { hts_tpool_result *r; + if (fp->errcode) return -1; + if (fp->mt) { again: if (fp->mt->hit_eof) { @@ -1005,11 +1007,14 @@ int bgzf_read_block(BGZF *fp) if (fp->uncompressed_block == NULL) return -1; fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE; } // else it's already allocated with malloc, maybe even in-use. - if (mt_destroy(fp->mt) < 0) + if (mt_destroy(fp->mt) < 0) { fp->errcode = BGZF_ERR_IO; + } fp->mt = NULL; hts_tpool_delete_result(r, 0); - + if (fp->errcode) { + return -1; + } goto single_threaded; } @@ -1018,6 +1023,7 @@ int bgzf_read_block(BGZF *fp) hts_log_error("BGZF decode jobs returned error %d " "for block offset %"PRId64, j->errcode, j->block_address); + hts_tpool_delete_result(r, 0); return -1; } @@ -1479,6 +1485,8 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) int64_t block_address; block_address = htell(fp->fp); + j->block_address = block_address; // in case we exit with j->errcode + if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; count = hpeek(fp->fp, header, sizeof(header)); if (count == 0) // no data read @@ -2285,11 +2293,12 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str) fp->block_length = 0; } } while (state == 0); + if (state < -1) return state; if (str->l == 0 && state < 0) return state; fp->uncompressed_address += str->l + 1; if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--; str->s[str->l] = 0; - return str->l; + return str->l <= INT_MAX ? (int) str->l : INT_MAX; } void bgzf_index_destroy(BGZF *fp) diff --git a/bgzip.1 b/bgzip.1 index 1ada36630..abf9fcedb 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -1,10 +1,10 @@ -.TH bgzip 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools" +.TH bgzip 1 "21 February 2023" "htslib-1.17" "Bioinformatics tools" .SH NAME .PP bgzip \- Block compression/decompression utility .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2018, 2021 Genome Research Limited. +.\" Copyright (C) 2018, 2021-2022 Genome Research Limited. .\" .\" Author: Heng Li .\" @@ -74,6 +74,17 @@ after decompression completes the input file will be removed. .SH OPTIONS .TP 10 +.B "--binary" +Bgzip will attempt to ensure BGZF blocks end on a newline when the +input is a text file. The exception to this is where a single line is +larger than a BGZF block (64Kb). This can aid tools that use the +index to perform random access on the compressed stream, as the start +of a block is likely to also be the start of a text record. + +This option processes text files as if they were binary content, +ignoring the location of newlines. This also restores the behaviour +for text files to bgzip version 1.15 and earlier. +.TP .BI "-b, --offset " INT Decompress to standard output from virtual file position (0-based uncompressed offset). @@ -92,7 +103,8 @@ Use \fB--force\fR twice to do both without asking. .TP .B "-g, --rebgzip" Try to use an existing index to create a compressed file with matching -block offsets. +block offsets. The index must be specified using the \fB-I +\fIfile.gzi\fR option. Note that this assumes that the same compression library and level are in use as when making the original file. Don't use it unless you know what you're doing. @@ -181,6 +193,5 @@ The BGZF library was originally implemented by Bob Handsaker and modified by Heng Li for remote file access and in-memory caching. .SH SEE ALSO -.PP -.BR gzip (1), -.BR tabix (1) +.IR gzip (1), +.IR tabix (1) diff --git a/bgzip.c b/bgzip.c index bd0374811..589f79f66 100644 --- a/bgzip.c +++ b/bgzip.c @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2019, 2021 Genome Research Ltd. + Copyright (C) 2010, 2013-2019, 2021-2022 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,13 +36,14 @@ #include #include "htslib/bgzf.h" #include "htslib/hts.h" +#include "htslib/hfile.h" #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN # include #endif -static const int WINDOW_SIZE = 64 * 1024; +static const int WINDOW_SIZE = BGZF_BLOCK_SIZE; static void error(const char *format, ...) { @@ -121,15 +122,16 @@ static int bgzip_main_usage(FILE *fp, int status) fprintf(fp, " -r, --reindex (re)index compressed file\n"); fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n"); fprintf(fp, " -t, --test test integrity of compressed file\n"); + fprintf(fp, " --binary Don't align blocks with text lines\n"); fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); return status; } int main(int argc, char **argv) { - int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep; + int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep, binary; BGZF *fp; - void *buffer; + char *buffer; long start, end, size; char *index_fname = NULL; int threads = 1; @@ -151,10 +153,11 @@ int main(int argc, char **argv) {"test", no_argument, NULL, 't'}, {"version", no_argument, NULL, 1}, {"keep", no_argument, NULL, 'k'}, + {"binary", no_argument, NULL, 2}, {NULL, 0, NULL, 0} }; - compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; binary = 0; while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; @@ -173,8 +176,9 @@ int main(int argc, char **argv) case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; + case 2: binary = 1; break; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); } @@ -185,7 +189,7 @@ int main(int argc, char **argv) return 1; } if (compress == 1) { - int f_src = fileno(stdin); + hFILE* f_src = NULL; char out_mode[3] = "w\0"; char out_mode_exclusive[4] = "wx\0"; @@ -198,13 +202,13 @@ int main(int argc, char **argv) out_mode_exclusive[2] = compress_level + '0'; } + if (!(f_src = hopen(argc > optind ? argv[optind] : "-", "r"))) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + if ( argc>optind ) { - if ((f_src = open(argv[optind], O_RDONLY)) < 0) { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - if (pstdout) fp = bgzf_open("-", out_mode); else @@ -241,7 +245,7 @@ int main(int argc, char **argv) if ( rebgzip && !index_fname ) { - fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); + fprintf(stderr, "[bgzip] Index file name expected when writing to stdout. See -I option.\n"); return 1; } @@ -250,18 +254,103 @@ int main(int argc, char **argv) bgzf_mt(fp, threads, 256); buffer = malloc(WINDOW_SIZE); -#ifdef _WIN32 - _setmode(f_src, O_BINARY); -#endif + if (!buffer) + return 1; if (rebgzip){ if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } else { - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); + htsFormat fmt; + int textual = 0; + if (!binary + && hts_detect_format(f_src, &fmt) == 0 + && fmt.compression == no_compression) { + switch(fmt.format) { + case text_format: + case sam: + case vcf: + case bed: + case fasta_format: + case fastq_format: + case fai_format: + case fqi_format: + textual = 1; + break; + default: break; // silence clang warnings + } + } + + if (binary || !textual) { + // Binary data, either detected or explicit + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) + error("Could not write %d bytes: Error %d\n", + c, fp->errcode); + } else { + /* Text mode, try a flush after a newline */ + int in_header = 1, n = 0, long_line = 0; + while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) { + int c2 = c+n; + int flush = 0; + if (in_header && + (long_line || buffer[0] == '@' || buffer[0] == '#')) { + // Scan forward to find the last header line. + int last_start = 0; + n = 0; + while (n < c2) { + if (buffer[n++] != '\n') + continue; + + last_start = n; + if (n < c2 && + !(buffer[n] == '@' || buffer[n] == '#')) { + in_header = 0; + break; + } + } + if (!last_start) { + n = c2; + long_line = 1; + } else { + n = last_start; + flush = 1; + long_line = 0; + } + } else { + // Scan backwards to find the last newline. + n += c; // c read plus previous n overflow + while (--n >= 0 && ((char *)buffer)[n] != '\n') + ; + + if (n >= 0) { + flush = 1; + n++; + } else { + n = c2; + } + } + + // Pos n is either at the end of the buffer with flush==0, + // or the first byte after a newline and a flush point. + if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + if (flush) + if (bgzf_flush_try(fp, 65536) < 0) // force + return -1; + + memmove(buffer, buffer+n, c2-n); + n = c2-n; + } + + // Trailing data. + if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + } } if ( index ) { @@ -270,13 +359,16 @@ int main(int argc, char **argv) error("Could not write index to '%s'\n", index_fname); } else { if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) - error("Could not write index to '%s.gz.gzi'", argv[optind]); + error("Could not write index to '%s.gz.gzi'\n", + argv[optind]); } } - if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); + if (bgzf_close(fp) < 0) + error("Output close failed: Error %d\n", fp->errcode); + if (hclose(f_src) < 0) + error("Input close failed\n"); if (argc > optind && !pstdout && !keep) unlink(argv[optind]); free(buffer); - close(f_src); return 0; } else if ( reindex ) diff --git a/config.mk.in b/config.mk.in index 82af49850..7341a170d 100644 --- a/config.mk.in +++ b/config.mk.in @@ -1,6 +1,6 @@ # Optional configure Makefile overrides for htslib. # -# Copyright (C) 2015-2017, 2019 Genome Research Ltd. +# Copyright (C) 2015-2017, 2019, 2023 Genome Research Ltd. # # Author: John Marshall # @@ -43,6 +43,7 @@ RANLIB = @RANLIB@ CPPFLAGS = @CPPFLAGS@ CFLAGS = @CFLAGS@ LDFLAGS = @LDFLAGS@ +VERSION_SCRIPT_LDFLAGS = @VERSION_SCRIPT_LDFLAGS@ LIBS = @LIBS@ PLATFORM = @PLATFORM@ diff --git a/configure.ac b/configure.ac index 7d40948a6..98b0a44c7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2022 Genome Research Ltd. +# Copyright (C) 2015-2023 Genome Research Ltd. # # Author: John Marshall # @@ -155,6 +155,11 @@ static_LDFLAGS=$LDFLAGS static_LIBS='-lpthread -lz -lm' private_LIBS=$LDFLAGS +AC_ARG_ENABLE([versioned-symbols], + [AS_HELP_STRING([--disable-versioned-symbols], + [disable versioned symbols in shared library])], + [], [enable_versioned_symbols=yes]) + AC_ARG_ENABLE([bz2], [AS_HELP_STRING([--disable-bz2], [omit support for BZ2-compressed CRAM files])], @@ -249,6 +254,24 @@ esac AC_MSG_RESULT([$host_result]) AC_SUBST([PLATFORM]) +dnl Check for versioned symbol support +dnl Only try for .so shared libraries as other types won't work +AS_IF([test x"$PLATFORM" = xdefault && test x"$enable_versioned_symbols" = xyes], + [AC_CACHE_CHECK([whether the linker supports versioned symbols], + [hts_cv_have_versioned_symbols], [ + save_LDFLAGS=$LDFLAGS + LDFLAGS="-Wl,-version-script,$srcdir/htslib.map $LDFLAGS" + AC_LINK_IFELSE([AC_LANG_PROGRAM()], + [hts_cv_have_versioned_symbols=yes], + [hts_cv_have_versioned_symbols=no]) + LDFLAGS=$save_LDFLAGS + ]) + AS_IF([test "x$hts_cv_have_versioned_symbols" = xyes],[ + VERSION_SCRIPT_LDFLAGS='-Wl,-version-script,$(srcprefix)htslib.map' + AC_SUBST([VERSION_SCRIPT_LDFLAGS]) + ]) +]) + dnl Try to get more control over which symbols are exported in the shared dnl library. HTS_HIDE_DYNAMIC_SYMBOLS diff --git a/cram/README b/cram/README new file mode 100644 index 000000000..135438227 --- /dev/null +++ b/cram/README @@ -0,0 +1,214 @@ +CRAM encoding internals +======================= + +A quick summary of functions involved. + +The encoder works by accumulating a bunch of BAM records (via the +cram_put_bam_seq function), and at a certain point (eg counter of +records, or switching reference) the array of BAM records it turned +into a container, which in turn creates slices, holding CRAM +data-series in blocks. The function that turns an array of BAM +objects into the container is below. + +cram_encode_container func: + Validate references MD5 against header, unless no_ref mode + If embed_ref <= 1, fetch ref + Switch to embed_ref=2 if failed + + Foreach slice: + If embed_ref == 2 + call cram_generate_reference + if failed switch to no_ref mode + Foreach sequence + call process_one_read to append BAM onto each data series (DS) + call cram_stats_add for each DS to gather metrics + call cram_encode_aux + + # We now have cram DS, per slice + call cram_encoder_init, per DS (based on cram_stats_add data) + + Foreach slice: + call cram_encode_slice to turn DS to blocks + call cram_compess_slice + + call cram_encode_compression_header + +Threading +--------- + +CRAM can be multi-threaded, but this brings complications. + +The above function is the main CPU user, so it is this bit which can +be executed in parallel from multiple threads. To understand this we +need to now look at how the primary loop works when writing a CRAM: + +Encoding main thread: + repeatedly calls cram_put_bam_seq + calls cram_new_container on first time through to initialise + calls cram_next_container when current is full or we need to flush + calls cram_flush_container_mt to flush last container + pushes BAM object onto current container + +If non-threaded, cram_flush_container_mt does: + call cram_flush_container + call cram_encode_container to go from BAM to CRAM data-series + call cram_flush_container2 (writes it out) + +If threaded, cram_flush_container_mt does: + Main: Dispatch cram_flush_thread job + Thread: call cram_encode_container to go from BAM to CRAM data-series + Main: Call cram_flush_result to drain queue of encoded containers + Main: Call cram_flush_container2 (writes it out); + + + +Decisions on when to create new containers, detection of sorted vs unsorted, +switching to multi-seq mode, etc occur at the main thread in +cram_put_bam_seq. + +We can change our mind on container parameters at any point up until +the cram_encode_container call. At that point these parameters get +baked into a container compression header and all data-series +generated need to be in sync with the parameters. + +It is possible that some parameter changes can get detected while +encoding the container, as it is there where we fetch references. Eg +the need to enable embedded reference or switch to non-ref mode. + +While encoding a container, we can change the parameters for *this* +container, and we can also set the default parameter for subsequent +new parameters via the global cram fd to avoid spamming attempts to +load a reference which doesn't exist, but we cannot change other +containers that are being processed in parallel. They'll fend for +themselves. + +References +---------- + +To avoid spamming the reference servers, there is a shared cache of +references being currently used by all the worker threads (leading to +confusing terminology of reference-counting of references). So each +container fetches its section of reference, but the memory for that is +handled via its own layer. + +The shared references and ref meta-data is held in cram_fd -> refs (a +refs_t pointer): + + // References structure. + struct refs_t { + string_alloc_t *pool; // String pool for holding filenames and SN vals + + khash_t(refs) *h_meta; // ref_entry*, index by name + ref_entry **ref_id; // ref_entry*, index by ID + int nref; // number of ref_entry + + char *fn; // current file opened + BGZF *fp; // and the hFILE* to go with it. + + int count; // how many cram_fd sharing this refs struct + + pthread_mutex_t lock; // Mutex for multi-threaded updating + ref_entry *last; // Last queried sequence + int last_id; // Used in cram_ref_decr_locked to delay free + }; + +Within this, ref_entry is the per-reference information: + + typedef struct ref_entry { + char *name; + char *fn; + int64_t length; + int64_t offset; + int bases_per_line; + int line_length; + int64_t count; // for shared references so we know to dealloc seq + char *seq; + mFILE *mf; + int is_md5; // Reference comes from a raw seq found by MD5 + int validated_md5; + } ref_entry; + +Sharing of references to track use between threads is via +cram_ref_incr* and cram_ref_decr* (which locked and unlocked +variants). We free a reference when the usage count hits zero. To +avoid spamming discard and reload in single-thread creation of a +pos-sorted CRAM, we keep track of the last reference in cram_fd and +delay discard by one loop iteration. + +There are complexities here around whether the references come from a +single ref.fa file, are from a local MD5sum cache with one file per +reference (mmapped), or whether they're fetched from some remote +REF_PATH query such as the EBI. (This later case typically downloads +to a local md5 based ref-cache first and mmaps from there.) + +The refs struct start off by being populated from the SAM header. We +have M5 tag and name known, maybe a filename, but length is 0 and seq +is NULL. This is done by cram_load_reference: + +cram_load_reference (cram_fd, filename): + if filename non-NULL + call refs_load_fai + Populates ref_entry with filename, name, length, line-len, etc + sanitise_SQ_lines + If no refs loaded + call refs_from_header + populates ref_entry with name. + Sets length=0 as marker for not-yet-loaded + +The main interface used from the code is cram_get_ref(). It takes a +reference ID, start and end coordinate and returns a pointer to the +relevant sub-sequence. + +cram_get_ref: + r = fd->refs->ref_id[id]; // current ref + call cram_populate_ref if stored length is 0 (ie ref.fa set) + search REF_PATH / REF_CACHE + call bgzf_open if local_path + call open_path_mfile otherwise + copy to local REF_CACHE if required (eg remote fetch) + + If start = 1 and end = ref-length + If ref seq unknown + call cram_ref_load to load entire ref and use that + + If ref seq now known, return it + + // Otherwise known via .fai or we've errored by now. + call load_ref_portion to return a sub-seq from index fasta + +The encoder asks for the entire reference rather than a small portion +of it as we're usually encoding a large amount. The decoder may be +dealing with small range queries, so it only asks for the relevant +sub-section of reference as specified in the cram slice headers. + + +TODO +==== + +- Multi-ref mode is enabled when we have too many small containers in + a row. + + Instead of firing off new containers when we switch reference, we + could always make a new container after N records, separating off + M <= N to make the container such that all M are the same reference, + and shuffling any remaining N-M down as the start of the next. + + This means we can detect how many new containers we would create, + and enable multi-ref mode straight away rather than keeping a recent + history of how many small containers we've emitted. + +- The cache of references currently being used is a better place to + track the global embed-ref and non-ref logic. Better than cram_fd. + Cram_fd is a one-way change, as once we enable non-ref we'll stick + with it. + + However if it was per-ref in the ref-cache then we'd probe and try + each reference once, and then all new containers for that ref would + honour the per-ref parameters. So a single missing reference in the + middle of a large file wouldn't change behaviour for all subsequence + references. + + Optionally we could still do meta-analysis on how many references + are failing, and switch the global cram_fd params to avoid repeated + testing of reference availability if it's becoming obvious that none + of them are known. diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 33e1b5bf8..21240c141 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2021 Genome Research Ltd. +Copyright (c) 2012-2021,2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -447,6 +447,11 @@ cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) { return cram_get_block_by_id(slice, c->u.external.content_id); } +int cram_external_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "EXTERNAL(id=%d)", + c->u.external.content_id) < 0 ? -1 : 0; +} + cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -494,6 +499,7 @@ cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, c->free = cram_external_decode_free; c->size = cram_external_decode_size; c->get_block = cram_external_get_block; + c->describe = cram_external_describe; c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL); @@ -739,6 +745,14 @@ cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) { return cram_get_block_by_id(slice, c->u.varint.content_id); } +int cram_varint_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)", + c->u.varint.content_id, + c->u.varint.offset, + c->u.varint.type) + < 0 ? -1 : 0; +} + cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -774,6 +788,7 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, c->free = cram_varint_decode_free; c->size = cram_varint_decode_size; c->get_block = cram_varint_get_block; + c->describe = cram_varint_describe; c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL); c->u.varint.offset = vv->varint_get64s(&cp, cp_end, NULL); @@ -942,6 +957,11 @@ int cram_const_decode_size(cram_slice *slice, cram_codec *c) { return 0; } +int cram_const_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "CONST(val=%"PRId64")", + c->u.xconst.val) < 0 ? -1 : 0; +} + cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -963,6 +983,7 @@ cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, c->free = cram_const_decode_free; c->size = cram_const_decode_size; c->get_block = NULL; + c->describe = cram_const_describe; c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL); @@ -1091,6 +1112,12 @@ void cram_beta_decode_free(cram_codec *c) { free(c); } +int cram_beta_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "BETA(offset=%d, nbits=%d)", + c->u.beta.offset, c->u.beta.nbits) + < 0 ? -1 : 0; +} + cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -1115,6 +1142,7 @@ cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, return NULL; } c->free = cram_beta_decode_free; + c->describe = cram_beta_describe; c->u.beta.nbits = -1; c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL); @@ -1405,6 +1433,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, c->free = cram_xpack_decode_free; c->size = cram_xpack_decode_size; c->get_block = cram_xpack_get_block; + c->describe = NULL; c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL); c->u.xpack.nval = vv->varint_get32(&cp, endp, NULL); @@ -1735,6 +1764,7 @@ cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, c->free = cram_xdelta_decode_free; c->size = cram_xdelta_decode_size; c->get_block = cram_xdelta_get_block; + c->describe = NULL; c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL); c->u.xdelta.last = 0; @@ -2135,6 +2165,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, c->free = cram_xrle_decode_free; c->size = cram_xrle_decode_size; c->get_block = cram_xrle_get_block; + c->describe = NULL; c->u.xrle.cur_len = 0; c->u.xrle.cur_lit = -1; @@ -2423,6 +2454,13 @@ void cram_subexp_decode_free(cram_codec *c) { free(c); } +int cram_subexp_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "SUBEXP(offset=%d,k=%d)", + c->u.subexp.offset, + c->u.subexp.k) + < 0 ? -1 : 0; +} + cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -2442,6 +2480,7 @@ cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, c->codec = E_SUBEXP; c->decode = cram_subexp_decode; c->free = cram_subexp_decode_free; + c->describe = cram_subexp_describe; c->u.subexp.k = -1; c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL); @@ -2489,6 +2528,11 @@ void cram_gamma_decode_free(cram_codec *c) { free(c); } +int cram_gamma_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset) + < 0 ? -1 : 0; +} + cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -2511,6 +2555,7 @@ cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, c->codec = E_GAMMA; c->decode = cram_gamma_decode; c->free = cram_gamma_decode_free; + c->describe = cram_gamma_describe; c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL); @@ -2703,6 +2748,22 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, return 0; } +int cram_huffman_describe(cram_codec *c, kstring_t *ks) { + int r = 0, n; + r |= ksprintf(ks, "HUFFMAN(codes={") < 0; + for (n = 0; n < c->u.huffman.ncodes; n++) { + r |= ksprintf(ks, "%s%"PRId64, n?",":"", + c->u.huffman.codes[n].symbol); + } + r |= ksprintf(ks, "},lengths={") < 0; + for (n = 0; n < c->u.huffman.ncodes; n++) { + r |= ksprintf(ks, "%s%d", n?",":"", + c->u.huffman.codes[n].len); + } + r |= ksprintf(ks, "})") < 0; + return r; +} + /* * Initialises a huffman decoder from an encoding data stream. */ @@ -2865,6 +2926,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, } else { return NULL; } + h->describe = cram_huffman_describe; return (cram_codec *)h; @@ -3293,6 +3355,22 @@ void cram_byte_array_len_decode_free(cram_codec *c) { free(c); } +int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) { + int r = 0; + r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0; + cram_byte_array_len_decoder *l = &c->u.byte_array_len; + r |= l->len_codec->describe + ? l->len_codec->describe(l->len_codec, ks) + : (ksprintf(ks, "?")<0); + r |= ksprintf(ks, "},val_codec={") < 0; + r |= l->val_codec->describe + ? l->val_codec->describe(l->val_codec, ks) + : (ksprintf(ks, "?")<0); + r |= ksprintf(ks, "}") < 0; + + return r; +} + cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -3308,6 +3386,7 @@ cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, c->codec = E_BYTE_ARRAY_LEN; c->decode = cram_byte_array_len_decode; c->free = cram_byte_array_len_decode_free; + c->describe = cram_byte_array_len_describe; c->u.byte_array_len.len_codec = NULL; c->u.byte_array_len.val_codec = NULL; @@ -3532,6 +3611,13 @@ void cram_byte_array_stop_decode_free(cram_codec *c) { free(c); } +int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)", + c->u.byte_array_stop.stop, + c->u.byte_array_stop.content_id) + < 0 ? -1 : 0; +} + cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -3561,6 +3647,7 @@ cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, return NULL; } c->free = cram_byte_array_stop_decode_free; + c->describe = cram_byte_array_stop_describe; c->u.byte_array_stop.stop = *cp++; if (CRAM_MAJOR_VERS(version) == 1) { @@ -4032,3 +4119,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return 0; } + +int cram_codec_describe(cram_codec *c, kstring_t *ks) { + if (c && c->describe) + return c->describe(c, ks); + else + return ksprintf(ks, "?"); +} diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 56b065255..d93d9955c 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2015, 2018, 2020 Genome Research Ltd. +Copyright (c) 2012-2015, 2018, 2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -160,7 +160,7 @@ typedef struct { /* * A generic codec structure. */ -typedef struct cram_codec { +struct cram_codec { enum cram_encoding codec; cram_block *out; varint_vec *vv; @@ -175,6 +175,7 @@ typedef struct cram_codec { int (*size)(cram_slice *slice, struct cram_codec *codec); int (*flush)(struct cram_codec *codec); cram_block *(*get_block)(cram_slice *slice, struct cram_codec *codec); + int (*describe)(struct cram_codec *codec, kstring_t *ks); union { cram_huffman_decoder huffman; @@ -201,7 +202,7 @@ typedef struct cram_codec { cram_const_codec e_xconst; cram_varint_decoder e_varint; } u; -} cram_codec; +}; const char *cram_encoding2str(enum cram_encoding t); diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 1f8d60f12..73f567106 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2438,8 +2438,9 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, } } - if ((!s->ref && s->hdr->ref_base_id < 0) - || memcmp(digest, s->hdr->md5, 16) != 0) { + if (!c->comp_hdr->no_ref && + ((!s->ref && s->hdr->ref_base_id < 0) + || memcmp(digest, s->hdr->md5, 16) != 0)) { char M[33]; const char *rname = sam_hdr_tid2name(sh, ref_id); if (!rname) rname="?"; // cannot happen normally diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 1ba1988f4..5b56aedd5 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -61,7 +61,7 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t) static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, bam_seq_t *b, int rnum, kstring_t *MD, - int embed_ref); + int embed_ref, int no_ref); /* * Returns index of val into key. @@ -70,7 +70,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, static int sub_idx(char *key, char val) { int i; - for (i = 0; *key && *key++ != val; i++); + for (i = 0; i < 4 && *key++ != val; i++); return i; } @@ -87,6 +87,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); int i, mc, r = 0; + int no_ref = c->no_ref; + if (!cb || !map) return NULL; @@ -162,7 +164,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, kh_val(h->preservation_map, k).i = h->qs_seq_orient; } - if (fd->no_ref || embed_ref>0) { + if (no_ref || embed_ref>0) { // Reference Required == No k = kh_put(map, h->preservation_map, "RR", &r); if (-1 == r) return NULL; @@ -203,31 +205,38 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, case CRAM_KEY('S','M'): { char smat[5], *mp = smat; + // Output format is for order ACGTN (minus ref base) + // to store the code value 0-3 for each symbol. + // + // Note this is different to storing the symbols in order + // that the codes occur from 0-3, which is what we used to + // do. (It didn't matter as we always had a fixed table in + // the order.) *mp++ = - (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) | - (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) | - (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) | - (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0); + (sub_idx(h->substitution_matrix[0], 'C') << 6) | + (sub_idx(h->substitution_matrix[0], 'G') << 4) | + (sub_idx(h->substitution_matrix[0], 'T') << 2) | + (sub_idx(h->substitution_matrix[0], 'N') << 0); *mp++ = - (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) | - (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) | - (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) | - (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0); + (sub_idx(h->substitution_matrix[1], 'A') << 6) | + (sub_idx(h->substitution_matrix[1], 'G') << 4) | + (sub_idx(h->substitution_matrix[1], 'T') << 2) | + (sub_idx(h->substitution_matrix[1], 'N') << 0); *mp++ = - (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) | - (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) | - (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) | - (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0); + (sub_idx(h->substitution_matrix[2], 'A') << 6) | + (sub_idx(h->substitution_matrix[2], 'C') << 4) | + (sub_idx(h->substitution_matrix[2], 'T') << 2) | + (sub_idx(h->substitution_matrix[2], 'N') << 0); *mp++ = - (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) | - (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) | - (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) | - (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0); + (sub_idx(h->substitution_matrix[3], 'A') << 6) | + (sub_idx(h->substitution_matrix[3], 'C') << 4) | + (sub_idx(h->substitution_matrix[3], 'G') << 2) | + (sub_idx(h->substitution_matrix[3], 'N') << 0); *mp++ = - (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) | - (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) | - (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) | - (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0); + (sub_idx(h->substitution_matrix[4], 'A') << 6) | + (sub_idx(h->substitution_matrix[4], 'C') << 4) | + (sub_idx(h->substitution_matrix[4], 'G') << 2) | + (sub_idx(h->substitution_matrix[4], 'T') << 0); BLOCK_APPEND(map, smat, 5); break; } @@ -1688,6 +1697,54 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { return -1; } +// Check if the SQ M5 tag matches the reference we've loaded. +static int validate_md5(cram_fd *fd, int ref_id) { + if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref) + return 0; + + // Have we already checked this ref? + if (fd->refs->ref_id[ref_id]->validated_md5) + return 0; + + // Check if we have the MD5 known. + // We should, but maybe we're using embedded references? + sam_hrecs_t *hrecs = fd->header->hrecs; + sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN", + hrecs->ref[ref_id].name); + if (!ty) + return 0; + + sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL); + if (!m5tag) + return 0; + + // It's known, so compute md5 on the loaded reference sequence. + char *ref = fd->refs->ref_id[ref_id]->seq; + int64_t len = fd->refs->ref_id[ref_id]->length; + hts_md5_context *md5; + char unsigned buf[16]; + char buf2[33]; + + if (!(md5 = hts_md5_init())) + return -1; + hts_md5_update(md5, ref, len); + hts_md5_final(buf, md5); + hts_md5_destroy(md5); + hts_md5_hex(buf2, buf); + + // Compare it to header @SQ M5 tag + if (strcmp(m5tag->str+3, buf2)) { + hts_log_error("SQ header M5 tag discrepancy for reference '%s'", + hrecs->ref[ref_id].name); + hts_log_error("Please use the correct reference, or " + "consider using embed_ref=2"); + return -1; + } + fd->refs->ref_id[ref_id]->validated_md5 = 1; + + return 0; +} + /* * Encodes all slices in a container into blocks. * Returns 0 on success @@ -1698,7 +1755,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { cram_block_compression_hdr *h = c->comp_hdr; cram_block *c_hdr; int multi_ref = 0; - int r1, r2, sn, nref, embed_ref; + int r1, r2, sn, nref, embed_ref, no_ref; spare_bams *spares; if (CRAM_MAJOR_VERS(fd->version) == 1) @@ -1707,22 +1764,17 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;} #define goto_err goto err + restart: /* Cache references up-front if we have unsorted access patterns */ pthread_mutex_lock(&fd->ref_lock); nref = fd->refs->nref; - embed_ref = fd->embed_ref; pthread_mutex_unlock(&fd->ref_lock); - - if (!fd->no_ref && c->refs_used) { - for (i = 0; i < nref; i++) { - if (c->refs_used[i]) - cram_get_ref(fd, i, 1, 0); - } - } + embed_ref = c->embed_ref; + no_ref = c->no_ref; /* To create M5 strings */ /* Fetch reference sequence */ - if (!fd->no_ref) { + if (!no_ref) { if (!c->bams || !c->bams[0]) goto_err; bam_seq_t *b = c->bams[0]; @@ -1730,7 +1782,20 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (embed_ref <= 1) { char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); if (!ref && bam_ref(b) >= 0) { - if (c->multi_seq || embed_ref == 0 || !c->pos_sorted) { + if (!c->pos_sorted) { + // TODO: maybe also check fd->no_ref? + hts_log_warning("Failed to load reference #%d", + bam_ref(b)); + hts_log_warning("Switching to non-ref mode"); + + pthread_mutex_lock(&fd->ref_lock); + c->embed_ref = fd->embed_ref = 0; + c->no_ref = fd->no_ref = 1; + pthread_mutex_unlock(&fd->ref_lock); + goto restart; + } + + if (c->multi_seq || embed_ref == 0) { hts_log_error("Failed to load reference #%d", bam_ref(b)); return -1; } @@ -1741,9 +1806,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { hts_log_warning("NOTE: the CRAM file will be bigger than" " using an external reference"); pthread_mutex_lock(&fd->ref_lock); - embed_ref = fd->embed_ref = 2; + embed_ref = c->embed_ref = fd->embed_ref = 2; pthread_mutex_unlock(&fd->ref_lock); goto auto_ref; + } else if (ref) { + if (validate_md5(fd, c->ref_seq_id) < 0) + goto_err; } if ((c->ref_id = bam_ref(b)) >= 0) { c->ref_seq_id = c->ref_id; @@ -1768,6 +1836,21 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { c->ref_seq_id = c->ref_id; } + if (!no_ref && c->refs_used) { + for (i = 0; i < nref; i++) { + if (c->refs_used[i]) { + if (cram_get_ref(fd, i, 1, 0)) { + if (validate_md5(fd, i) < 0) + goto_err; + } else { + hts_log_warning("Failed to find reference, " + "switching to non-ref mode"); + no_ref = c->no_ref = 1; + } + } + } + } + /* Turn bams into cram_records and gather basic stats */ for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { cram_slice *s = c->slices[sn]; @@ -1791,8 +1874,27 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // Embed consensus / MD-generated ref if (embed_ref == 2) { if (cram_generate_reference(c, s, r1) < 0) { - hts_log_error("Failed to build reference"); - return -1; + // Should this be a permanent thing via fd->no_ref? + // Doing so means we cannot easily switch back again should + // things fix themselves later on. This is likely not a + // concern though as failure to generate a reference implies + // unsorted data which is rarely recovered from. + + // Only if sn == 0. We're hosed if we're on the 2nd slice and + // the first worked, as no-ref is a container global param. + if (sn > 0) { + hts_log_error("Failed to build reference, " + "switching to non-ref mode"); + return -1; + } else { + hts_log_warning("Failed to build reference, " + "switching to non-ref mode"); + } + pthread_mutex_lock(&fd->ref_lock); + c->embed_ref = fd->embed_ref = 0; + c->no_ref = fd->no_ref = 1; + pthread_mutex_unlock(&fd->ref_lock); + goto restart; } } @@ -1803,7 +1905,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { bam_seq_t *b = c->bams[r1]; /* If multi-ref we need to cope with changing reference per seq */ - if (c->multi_seq && !fd->no_ref) { + if (c->multi_seq && !no_ref) { if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) { if (c->ref_seq_id >= 0) cram_ref_decr(fd->refs, c->ref_seq_id); @@ -1813,6 +1915,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { free(MD.s); return -1; } + if (validate_md5(fd, bam_ref(b)) < 0) + return -1; c->ref_seq_id = bam_ref(b); // overwritten later by -2 if (!fd->refs->ref_id[c->ref_seq_id]->seq) @@ -1823,7 +1927,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref) != 0) { + if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref, + no_ref) != 0) { free(MD.s); return -1; } @@ -1890,7 +1995,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (c->multi_seq && !fd->no_ref) { + if (c->multi_seq && !no_ref) { if (c->ref_seq_id >= 0) cram_ref_decr(fd->refs, c->ref_seq_id); } @@ -1922,12 +2027,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Compute MD5s */ + no_ref = c->no_ref; int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0; + for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; if (CRAM_MAJOR_VERS(fd->version) != 1) { - if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { + if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) { hts_md5_context *md5 = hts_md5_init(); if (!md5) return -1; @@ -2304,7 +2411,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } /* Cache references up-front if we have unsorted access patterns */ - if (!fd->no_ref && c->refs_used) { + if (!no_ref && c->refs_used) { for (i = 0; i < fd->refs->nref; i++) { if (c->refs_used[i]) cram_ref_decr(fd->refs, i); @@ -2542,7 +2649,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_slice *s, cram_record *cr, int verbatim_NM, int verbatim_MD, int NM, kstring_t *MD, int cf_tag, - int *err) { + int no_ref, int *err) { char *aux, *orig; sam_hrec_rg_t *brg = NULL; int aux_size = bam_get_l_aux(b); @@ -2594,7 +2701,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, // MD:Z if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { + if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) { while (*aux++); if (CRAM_MAJOR_VERS(fd->version) >= 4) @@ -2606,7 +2713,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, // NM:i if (aux[0] == 'N' && aux[1] == 'M') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) { + if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) { int NM_ = bam_aux2i((uint8_t *)aux+2); if (NM_ == NM) { switch(aux[2]) { @@ -3040,7 +3147,12 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { fd->slices_per_container); if (!c) return NULL; + + pthread_mutex_lock(&fd->ref_lock); + c->no_ref = fd->no_ref; + c->embed_ref = fd->embed_ref; c->record_counter = fd->record_counter; + pthread_mutex_unlock(&fd->ref_lock); c->curr_ref = bam_ref(b); } @@ -3087,7 +3199,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, bam_seq_t *b, int rnum, kstring_t *MD, - int embed_ref) { + int embed_ref, int no_ref) { int i, fake_qual = -1, NM = 0; char *cp; char *ref, *seq, *qual; @@ -3128,7 +3240,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // Non reference based encoding means storing the bases verbatim as features, which in // turn means every base also has a quality already stored. - if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) + if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3) @@ -3206,7 +3318,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n", // cig_len, &ref[apos], cig_len, &seq[spos]); l = 0; - if (!fd->no_ref && cr->len) { + if (!no_ref && cr->len) { int end = cig_len+apos < c->ref_end ? cig_len : c->ref_end - apos; char *sp = &seq[spos]; @@ -3301,7 +3413,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } if (l < cig_len && cr->len) { - if (fd->no_ref) { + if (no_ref) { if (CRAM_MAJOR_VERS(fd->version) == 3) { if (cram_add_bases(fd, c, s, cr, spos, cig_len-l, &seq[spos])) @@ -3360,7 +3472,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (cram_add_insertion(c, s, cr, spos, cig_len, cr->len ? &seq[spos] : NULL)) return -1; - if (fd->no_ref && cr->len) { + if (no_ref && cr->len) { for (l = 0; l < cig_len; l++, spos++) { cram_add_quality(fd, c, s, cr, spos, qual[spos]); } @@ -3376,7 +3488,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, fd->version)) return -1; - if (fd->no_ref && + if (no_ref && !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { if (cr->len) { for (l = 0; l < cig_len; l++, spos++) { @@ -3412,7 +3524,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, return -1; } fake_qual = spos; - cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end); + cr->aend = no_ref ? apos : MIN(apos, c->ref_end); if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0) goto block_err; @@ -3435,7 +3547,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, int err = 0; sam_hrec_rg_t *brg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, - cf_tag, &err); + cf_tag, no_ref, &err); if (err) goto block_err; @@ -3728,9 +3840,16 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (!fd->ctr) return -1; fd->ctr->record_counter = fd->record_counter; + + pthread_mutex_lock(&fd->ref_lock); + fd->ctr->no_ref = fd->no_ref; + fd->ctr->embed_ref = fd->embed_ref; + pthread_mutex_unlock(&fd->ref_lock); } c = fd->ctr; + int embed_ref = c->embed_ref; + if (!c->slice || c->curr_rec == c->max_rec || (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) || (c->s_num_bases >= fd->bases_per_slice)) { @@ -3746,9 +3865,6 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { * The multi_seq var here refers to our intention for the next slice. * This slice has already been encoded so we output as-is. */ - pthread_mutex_lock(&fd->ref_lock); - int embed_ref = fd->embed_ref; - pthread_mutex_unlock(&fd->ref_lock); if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && fd->last_slice && fd->last_slice < c->max_rec/4+10 && embed_ref<=0) { @@ -3773,7 +3889,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (NULL == (c = cram_next_container(fd, b))) { if (fd->ctr) { // prevent cram_close attempting to flush - cram_free_container(fd->ctr); + fd->ctr_mt = fd->ctr; // delay free when threading fd->ctr = NULL; } return -1; @@ -3796,6 +3912,27 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->multi_seq = 1; c->pos_sorted = 0; + // Cram_next_container may end up flushing an existing one and + // triggering fd->embed_ref=2 if no reference is found. + // Embedded refs are incompatible with multi-seq, so we bail + // out and switch to no_ref in this scenario. We do this + // within the container only, as multi_seq may be temporary + // and we switch back away from it again. + pthread_mutex_lock(&fd->ref_lock); + if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) { + hts_log_warning("Changing from embed_ref to no_ref mode"); + // Should we update fd->embed_ref and no_ref here too? + // Doing so means if we go into multi-seq and back out + // again, eg due a cluster of tiny refs in the middle of + // much larger ones, then we bake in no-ref mode. + // + // However for unsorted data we're realistically not + // going to switch back. + c->embed_ref = fd->embed_ref = 0; // or -1 for auto? + c->no_ref = fd->no_ref = 1; + } + pthread_mutex_unlock(&fd->ref_lock); + if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); c->refs_used = calloc(fd->refs->nref, sizeof(int)); @@ -3821,8 +3958,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { } else if (c->refs_used && c->refs_used[bam_ref(b)]) { pthread_mutex_lock(&fd->ref_lock); fd->unsorted = 1; - pthread_mutex_unlock(&fd->ref_lock); fd->multi_seq = 1; + pthread_mutex_unlock(&fd->ref_lock); } } diff --git a/cram/cram_external.c b/cram/cram_external.c index e88ff838b..26ef3d7d3 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2020, 2022 Genome Research Ltd. +Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -40,6 +40,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include +#include + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#else +#include "../htscodecs/htscodecs/rANS_static4x16.h" +#endif #include "../htslib/hfile.h" #include "cram.h" @@ -82,6 +89,14 @@ void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) { c->num_blocks = num_blocks; } +int32_t cram_container_get_num_records(cram_container *c) { + return c->num_records; +} + +int64_t cram_container_get_num_bases(cram_container *c) { + return c->num_bases; +} + /* Returns the landmarks[] array and the number of elements * in num_landmarks. @@ -180,6 +195,294 @@ int cram_block_compression_hdr_decoder2encoder(cram_fd *fd, return 0; } +typedef struct { + cram_block_compression_hdr *hdr; + cram_map *curr_map; + int idx; + int is_tag; // phase 2 using tag_encoding_map +} cram_codec_iter; + +static void cram_codec_iter_init(cram_block_compression_hdr *hdr, + cram_codec_iter *iter) { + iter->hdr = hdr; + iter->curr_map = NULL; + iter->idx = 0; + iter->is_tag = 0; +} + +// See enum cram_DS_ID in cram/cram_structs +static int cram_ds_to_key(enum cram_DS_ID ds) { + switch(ds) { + case DS_RN: return 256*'R'+'N'; + case DS_QS: return 256*'Q'+'S'; + case DS_IN: return 256*'I'+'N'; + case DS_SC: return 256*'S'+'C'; + case DS_BF: return 256*'B'+'F'; + case DS_CF: return 256*'C'+'F'; + case DS_AP: return 256*'A'+'P'; + case DS_RG: return 256*'R'+'G'; + case DS_MQ: return 256*'M'+'Q'; + case DS_NS: return 256*'N'+'S'; + case DS_MF: return 256*'M'+'F'; + case DS_TS: return 256*'T'+'S'; + case DS_NP: return 256*'N'+'P'; + case DS_NF: return 256*'N'+'F'; + case DS_RL: return 256*'R'+'L'; + case DS_FN: return 256*'F'+'N'; + case DS_FC: return 256*'F'+'C'; + case DS_FP: return 256*'F'+'P'; + case DS_DL: return 256*'D'+'L'; + case DS_BA: return 256*'B'+'A'; + case DS_BS: return 256*'B'+'S'; + case DS_TL: return 256*'T'+'L'; + case DS_RI: return 256*'R'+'I'; + case DS_RS: return 256*'R'+'S'; + case DS_PD: return 256*'P'+'D'; + case DS_HC: return 256*'H'+'C'; + case DS_BB: return 256*'B'+'B'; + case DS_QQ: return 256*'Q'+'Q'; + case DS_TN: return 256*'T'+'N'; + case DS_TC: return 256*'T'+'C'; + case DS_TM: return 256*'T'+'M'; + case DS_TV: return 256*'T'+'V'; + default: break; + } + + return -1; // unknown +} + +static cram_codec *cram_codec_iter_next(cram_codec_iter *iter, + int *key) { + cram_codec *cc = NULL; + cram_block_compression_hdr *hdr = iter->hdr; + + if (!iter->is_tag) { + // 1: Iterating through main data-series + do { + cc = hdr->codecs[iter->idx++]; + } while(!cc && iter->idx < DS_END); + if (cc) { + *key = cram_ds_to_key(iter->idx-1); + return cc; + } + + // Reset index for phase 2 + iter->idx = 0; + iter->is_tag = 1; + } + + do { + if (!iter->curr_map) + iter->curr_map = hdr->tag_encoding_map[iter->idx++]; + + cc = iter->curr_map ? iter->curr_map->codec : NULL; + if (cc) { + *key = iter->curr_map->key; + iter->curr_map = iter->curr_map->next; + return cc; + } + } while (iter->idx <= CRAM_MAP_HASH); + + // End of codecs + return NULL; +} + +/* + * A list of data-series, used to create a linked list threaded through + * a single array. + */ +typedef struct ds_list { + int data_series; + int next; +} ds_list; + +KHASH_MAP_INIT_INT(cid, int64_t) + +// Opaque struct for the CRAM block content-id -> data-series map. +struct cram_cid2ds_t { + ds_list *ds; // array of data-series with linked lists threading through it + int ds_size; + int ds_idx; + khash_t(cid) *hash; // key=content_id, value=index to ds array + int *ds_a; // serialised array of data-series returned by queries. +}; + +void cram_cid2ds_free(cram_cid2ds_t *cid2ds) { + if (cid2ds) { + if (cid2ds->hash) + kh_destroy(cid, cid2ds->hash); + free(cid2ds->ds); + free(cid2ds->ds_a); + free(cid2ds); + } +} + +/* + * Map cram block numbers to data-series. It's normally a 1:1 mapping, + * but in rare cases it can be 1:many (or even many:many). + * The key is the block number and the value is an index into the data-series + * array, which we iterate over until reaching a negative value. + * + * Provide cid2ds as NULL to allocate a new map or pass in an existing one + * to append to this map. The new (or existing) map is returned. + * + * Returns the cid2ds (newly allocated or as provided) on success, + * NULL on failure. + */ +cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr, + cram_cid2ds_t *cid2ds) { + cram_cid2ds_t *c2d = cid2ds; + if (!c2d) { + c2d = calloc(1, sizeof(*c2d)); + if (!c2d) + return NULL; + + c2d->hash = kh_init(cid); + if (!c2d->hash) + goto err; + } + + // Iterate through codecs. Initially primary two-left ones in + // rec_encoding_map, and then the three letter in tag_encoding_map. + cram_codec_iter citer; + cram_codec_iter_init(hdr, &citer); + cram_codec *codec; + int key; + + while ((codec = cram_codec_iter_next(&citer, &key))) { + // Having got a codec, we can then use cram_codec_to_id to get + // the block IDs utilised by that codec. This is then our + // map for allocating data blocks to data series, but for shared + // blocks we can't separate out how much is used by each DS. + int bnum[2]; + cram_codec_get_content_ids(codec, bnum); + + khiter_t k; + int ret, i; + for (i = 0; i < 2; i++) { + if (bnum[i] > -2) { + k = kh_put(cid, c2d->hash, bnum[i], &ret); + if (ret < 0) + goto err; + + if (c2d->ds_idx >= c2d->ds_size) { + c2d->ds_size += 100; + c2d->ds_size *= 2; + ds_list *ds_new = realloc(c2d->ds, + c2d->ds_size * sizeof(*ds_new)); + if (!ds_new) + goto err; + c2d->ds = ds_new; + } + + if (ret == 0) { + // Shared content_id, so add to list of DS + + // Maybe data-series should be part of the hash key? + // + // So top-32 bit is content-id, bot-32 bit is key. + // Sort hash by key and then can group all the data-series + // known together. ?? + // + // Brute force for now, scan to see if recorded. + // Typically this is minimal effort as we almost always + // have 1 data-series per block content-id, so the list to + // search is of size 1. + int dsi = kh_value(c2d->hash, k); + while (dsi >= 0) { + if (c2d->ds[dsi].data_series == key) + break; + dsi = c2d->ds[dsi].next; + } + + if (dsi == -1) { + // Block content_id seen before, but not with this DS + c2d->ds[c2d->ds_idx].data_series = key; + c2d->ds[c2d->ds_idx].next = kh_value(c2d->hash, k); + kh_value(c2d->hash, k) = c2d->ds_idx; + c2d->ds_idx++; + } + } else { + // First time this content id has been used + c2d->ds[c2d->ds_idx].data_series = key; + c2d->ds[c2d->ds_idx].next = -1; + kh_value(c2d->hash, k) = c2d->ds_idx; + c2d->ds_idx++; + } + } + } + } + + return c2d; + + err: + if (c2d != cid2ds) + cram_cid2ds_free(c2d); + return NULL; +} + +/* + * Return a list of data series observed as belonging to a block with + * the specified content_id. *n is the number of data series + * returned, or 0 if block is unused. + * Block content_id of -1 is used to indicate the CORE block. + * + * The pointer returned is owned by the cram_cid2ds state and should + * not be freed by the caller. + */ +int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n) { + *n = 0; + if (!c2d || !c2d->hash) + return NULL; + + khiter_t k = kh_get(cid, c2d->hash, content_id); + if (k == kh_end(c2d->hash)) + return NULL; + + if (!c2d->ds_a) { + c2d->ds_a = malloc(c2d->ds_idx * sizeof(int)); + if (!c2d->ds_a) + return NULL; + } + + int dsi = kh_value(c2d->hash, k); // initial ds array index from hash + int idx = 0; + while (dsi >= 0) { + c2d->ds_a[idx++] = c2d->ds[dsi].data_series; + dsi = c2d->ds[dsi].next; // iterate over list within ds array + } + + *n = idx; + return c2d->ds_a; +} + +/* + * Produces a description of the record and tag encodings held within + * a compression header and appends to 'ks'. + * + * Returns 0 on success, + * <0 on failure. + */ +int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks) { + cram_codec_iter citer; + cram_codec_iter_init(hdr, &citer); + cram_codec *codec; + int key, r = 0; + + while ((codec = cram_codec_iter_next(&citer, &key))) { + char key_s[4] = {0}; + int key_i = 0; + if (key>>16) key_s[key_i++] = key>>16; + key_s[key_i++] = (key>>8)&0xff; + key_s[key_i++] = key&0xff; + r |= ksprintf(ks, "\t%s\t", key_s) < 0; + r |= cram_codec_describe(codec, ks) < 0; + r |= kputc('\n', ks) < 0; + } + + return r ? -1 : 0; +} + /* *----------------------------------------------------------------------------- * cram_slice @@ -206,12 +509,17 @@ void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, *----------------------------------------------------------------------------- * cram_block */ -int32_t cram_block_get_content_id(cram_block *b) { return b->content_id; } +int32_t cram_block_get_content_id(cram_block *b) { + return b->content_type == CORE ? -1 : b->content_id; +} int32_t cram_block_get_comp_size(cram_block *b) { return b->comp_size; } int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; } int32_t cram_block_get_crc32(cram_block *b) { return b->crc32; } void * cram_block_get_data(cram_block *b) { return BLOCK_DATA(b); } int32_t cram_block_get_size(cram_block *b) { return BLOCK_SIZE(b); } +enum cram_block_method cram_block_get_method(cram_block *b) { + return (enum cram_block_method)b->orig_method; +} enum cram_content_type cram_block_get_content_type(cram_block *b) { return b->content_type; } @@ -236,6 +544,122 @@ void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); } size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); } void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; } +/* + * Given a compressed block of data in a specified compression method, + * fill out the 'cm' field with meta-data gleaned from the compressed + * block. + * + * If comp is CRAM_COMP_UNKNOWN, we attempt to auto-detect the compression + * format, but this doesn't work for all methods. + * + * Retuns the detected or specified comp method, and fills out *cm + * if non-NULL. + */ +cram_method_details *cram_expand_method(uint8_t *data, int32_t size, + enum cram_block_method comp) { + cram_method_details *cm = calloc(1, sizeof(*cm)); + if (!cm) + return NULL; + + const char *xz_header = "\xFD""7zXZ"; // including nul + + if (comp == CRAM_COMP_UNKNOWN) { + // Auto-detect + if (size > 1 && data[0] == 0x1f && data[1] == 0x8b) + comp = CRAM_COMP_GZIP; + else if (size > 3 && data[1] == 'B' && data[2] == 'Z' + && data[3] == 'h') + comp = CRAM_COMP_BZIP2; + else if (size > 6 && memcmp(xz_header, data, 6) == 0) + comp = CRAM_COMP_LZMA; + else + comp = CRAM_COMP_UNKNOWN; + } + cm->method = comp; + + // Interrogate the compressed data stream to fill out additional fields. + switch (comp) { + case CRAM_COMP_GZIP: + if (size > 8) { + if (data[8] == 4) + cm->level = 1; + else if (data[8] == 2) + cm->level = 9; + else + cm->level = 5; + } + break; + + case CRAM_COMP_BZIP2: + if (size > 3 && data[3] >= '1' && data[3] <= '9') + cm->level = data[3]-'0'; + break; + + case CRAM_COMP_RANS4x8: + cm->Nway = 4; + if (size > 0 && data[0] == 1) + cm->order = 1; + else + cm->order = 0; + break; + + case CRAM_COMP_RANSNx16: + if (size > 0) { + cm->order = data[0] & 1; + cm->Nway = data[0] & RANS_ORDER_X32 ? 32 : 4; + cm->rle = data[0] & RANS_ORDER_RLE ? 1 : 0; + cm->pack = data[0] & RANS_ORDER_PACK ? 1 : 0; + cm->cat = data[0] & RANS_ORDER_CAT ? 1 : 0; + cm->stripe = data[0] & RANS_ORDER_STRIPE ? 1 : 0; + cm->nosz = data[0] & RANS_ORDER_NOSZ ? 1 : 0; + } + break; + + case CRAM_COMP_ARITH: + if (size > 0) { + // Not in a public header, but the same transforms as rANSNx16 + cm->order = data[0] & 3; + cm->rle = data[0] & RANS_ORDER_RLE ? 1 : 0; + cm->pack = data[0] & RANS_ORDER_PACK ? 1 : 0; + cm->cat = data[0] & RANS_ORDER_CAT ? 1 : 0; + cm->stripe = data[0] & RANS_ORDER_STRIPE ? 1 : 0; + cm->nosz = data[0] & RANS_ORDER_NOSZ ? 1 : 0; + cm->ext = data[0] & 4 /*external*/ ? 1 : 0; + } + break; + + case CRAM_COMP_TOK3: + if (size > 8) { + if (data[8] == 1) + cm->level = 11; + else if (data[8] == 0) + cm->level = 1; + } + break; + + default: + break; + } + + return cm; +} + +/* + *----------------------------------------------------------------------------- + * cram_codecs + */ + +// -2 is unused. +// -1 is CORE +// >= 0 is the block with that Content ID +void cram_codec_get_content_ids(cram_codec *c, int ids[2]) { + ids[0] = cram_codec_to_id(c, &ids[1]); +} + +/* + *----------------------------------------------------------------------------- + * Utility functions + */ /* * Copies the blocks representing the next num_slice slices from a diff --git a/cram/cram_io.c b/cram/cram_io.c index 5d01e1318..d3c39e47a 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2022 Genome Research Ltd. +Copyright (c) 2012-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -2183,6 +2183,8 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + case TOK3: strat = 0; break; + case TOKA: strat = 1; break; default: strat = 0; } metrics->strat = strat; @@ -2586,6 +2588,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { e->seq = NULL; e->mf = NULL; e->is_md5 = 0; + e->validated_md5 = 0; k = kh_put(refs, r->h_meta, e->name, &n); if (-1 == n) { @@ -3022,6 +3025,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { fd->refs->fp = fp; fd->refs->fn = r->fn; r->is_md5 = 1; + r->validated_md5 = 1; // Fall back to cram_get_ref() where it'll do the actual // reading of the file. @@ -3043,6 +3047,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } r->length = sz; r->is_md5 = 1; + r->validated_md5 = 1; } else { refs_t *refs; const char *fn; @@ -3198,7 +3203,7 @@ void cram_ref_decr(refs_t *r, int id) { } /* - * Used by cram_ref_load and cram_ref_get. The file handle will have + * Used by cram_ref_load and cram_get_ref. The file handle will have * already been opened, so we can catch it. The ref_entry *e informs us * of whether this is a multi-line fasta file or a raw MD5 style file. * Either way we create a single contiguous sequence. @@ -3216,6 +3221,10 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { /* * Compute locations in file. This is trivial for the MD5 files, but * is still necessary for the fasta variants. + * + * Note the offset here, as with faidx, has the assumption that white- + * space (the diff between line_length and bases_per_line) only occurs + * at the end of a line of text. */ offset = e->line_length ? e->offset + (start-1)/e->bases_per_line * e->line_length + @@ -3244,14 +3253,34 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { /* Strip white-space if required. */ if (len != end-start+1) { - int i, j; + hts_pos_t i, j; char *cp = seq; char *cp_to; + // Copy up to the first white-space, and then repeatedly just copy + // bases_per_line verbatim, and use the slow method to end again. + // + // This may seem excessive, but this code can be a significant + // portion of total CRAM decode CPU time for shallow data sets. for (i = j = 0; i < len; i++) { - if (cp[i] >= '!' && cp[i] <= '~') - cp[j++] = toupper_c(cp[i]); + if (!isspace_c(cp[i])) + cp[j++] = cp[i] & ~0x20; + else + break; + } + while (i < len && isspace_c(cp[i])) + i++; + while (i < len - e->line_length) { + hts_pos_t j_end = j + e->bases_per_line; + while (j < j_end) + cp[j++] = cp[i++] & ~0x20; // toupper equiv + i += e->line_length - e->bases_per_line; } + for (; i < len; i++) { + if (!isspace_c(cp[i])) + cp[j++] = cp[i] & ~0x20; + } + cp_to = cp+j; if (cp_to - seq != end-start+1) { @@ -3422,8 +3451,11 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { */ pthread_mutex_lock(&fd->refs->lock); if (r->length == 0) { + if (fd->ref_fn) + hts_log_warning("Reference file given, but ref '%s' not present", + r->name); if (cram_populate_ref(fd, id, r) == -1) { - hts_log_error("Failed to populate reference for id %d", id); + hts_log_warning("Failed to populate reference for id %d", id); pthread_mutex_unlock(&fd->refs->lock); pthread_mutex_unlock(&fd->ref_lock); return NULL; @@ -3620,6 +3652,8 @@ cram_container *cram_new_container(int nrec, int nslice) { c->max_apos = 0; c->multi_seq = 0; c->qs_seq_orient = 1; + c->no_ref = 0; + c->embed_ref = -1; // automatic selection c->bams = NULL; @@ -4823,6 +4857,11 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { return -1; } + if (-1 == refs_from_header(fd)) + return -1; + if (-1 == refs2id(fd->refs, fd->header)) + return -1; + /* Fix M5 strings */ if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { int i; @@ -4870,6 +4909,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { cram_ref_decr(fd->refs, i); hts_md5_hex(buf2, buf); + fd->refs->ref_id[i]->validated_md5 = 1; if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL)) return -1; } @@ -4997,11 +5037,6 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { cram_free_container(c); } - if (-1 == refs_from_header(fd)) - return -1; - if (-1 == refs2id(fd->refs, fd->header)) - return -1; - if (0 != hflush(fd->fp)) return -1; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 16739c2c6..1ee4b9e85 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -88,7 +88,7 @@ struct hFILE; #define BASES_PER_SLICE (SEQS_PER_SLICE*500) #define SLICE_PER_CNT 1 -#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" +#define CRAM_SUBST_MATRIX "CGTNGTANCATNGCANACGT" #define MAX_STAT_VAL 1024 //#define MAX_STAT_VAL 16 @@ -457,6 +457,8 @@ struct cram_container { /* Copied from fd before encoding, to allow multi-threading */ int ref_start, first_base, last_base, ref_id, ref_end; char *ref; + int embed_ref; // 1 if embedding ref, 2 if embedding cons + int no_ref; // true if referenceless //struct ref_entry *ref; /* For multi-threading */ @@ -675,6 +677,7 @@ typedef struct ref_entry { char *seq; mFILE *mf; int is_md5; // Reference comes from a raw seq found by MD5 + int validated_md5; } ref_entry; KHASH_MAP_INIT_STR(refs, ref_entry*) @@ -792,14 +795,14 @@ struct cram_fd { cram_container *ctr_mt; // positions for encoding or decoding - int first_base, last_base; + int first_base, last_base; // copied to container // cached reference portion refs_t *refs; // ref meta-data structure char *ref, *ref_free; // current portion held in memory - int ref_id; - int ref_start; - int ref_end; + int ref_id; // copied to container + int ref_start; // copied to container + int ref_end; // copied to container char *ref_fn; // reference fasta filename // compression level and metrics @@ -812,8 +815,8 @@ struct cram_fd { int seqs_per_slice; int bases_per_slice; int slices_per_container; - int embed_ref; - int no_ref; + int embed_ref; // copied to container + int no_ref; // copied to container int ignore_md5; int use_bz2; int use_rans; diff --git a/faidx.c b/faidx.c index f3be5e57c..2eb0f3edc 100644 --- a/faidx.c +++ b/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- FASTA and FASTQ random access. - Copyright (C) 2008, 2009, 2013-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -702,6 +702,12 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return NULL; } + if (val->line_blen <= 0) { + hts_log_error("Invalid line length in index: %d", val->line_blen); + *len = -1; + return NULL; + } + ret = bgzf_useek(fai->bgzf, offset + beg / val->line_blen * val->line_len @@ -766,6 +772,22 @@ static int fai_get_val(const faidx_t *fai, const char *str, return 0; } +/* + * The internal still has line_blen as uint32_t, but our references + * can be longer, so for future proofing we use hts_pos_t. We also needed + * a signed value so we can return negatives as an error. + */ +hts_pos_t fai_line_length(const faidx_t *fai, const char *str) +{ + faidx1_t val; + int64_t beg, end; + hts_pos_t len; + + if (fai_get_val(fai, str, &len, &val, &beg, &end)) + return -1; + else + return val.line_blen; +} char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len) { @@ -822,26 +844,40 @@ const char *faidx_iseq(const faidx_t *fai, int i) return fai->name[i]; } -int faidx_seq_len(const faidx_t *fai, const char *seq) +hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq) { khint_t k = kh_get(s, fai->hash, seq); if ( k == kh_end(fai->hash) ) return -1; return kh_val(fai->hash, k).len; } -static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, hts_pos_t *p_beg_i, hts_pos_t *p_end_i, hts_pos_t *len) { +int faidx_seq_len(const faidx_t *fai, const char *seq) +{ + hts_pos_t len = faidx_seq_len64(fai, seq); + return len < INT_MAX ? len : INT_MAX; +} + +static int faidx_adjust_position(const faidx_t *fai, int end_adjust, + faidx1_t *val_out, const char *c_name, + hts_pos_t *p_beg_i, hts_pos_t *p_end_i, + hts_pos_t *len) { khiter_t iter; + faidx1_t *val; // Adjust position iter = kh_get(s, fai->hash, c_name); if (iter == kh_end(fai->hash)) { - *len = -2; + if (len) + *len = -2; hts_log_error("The sequence \"%s\" was not found", c_name); return 1; } - *val = kh_value(fai->hash, iter); + val = &kh_value(fai->hash, iter); + + if (val_out) + *val_out = *val; if(*p_end_i < *p_beg_i) *p_beg_i = *p_end_i; @@ -849,22 +885,42 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * if(*p_beg_i < 0) *p_beg_i = 0; else if(val->len <= *p_beg_i) - *p_beg_i = val->len - 1; + *p_beg_i = val->len; if(*p_end_i < 0) *p_end_i = 0; else if(val->len <= *p_end_i) - *p_end_i = val->len - 1; + *p_end_i = val->len - end_adjust; return 0; } +int fai_adjust_region(const faidx_t *fai, int tid, + hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t orig_beg, orig_end; + + if (!fai || !beg || !end || tid < 0 || tid >= fai->n) + return -1; + + orig_beg = *beg; + orig_end = *end; + if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) { + hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"", + fai->name[tid]); + return -1; + } + + return ((orig_beg != *beg ? 1 : 0) | + (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0)); +} + char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; // Adjust position - if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) { + if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { return NULL; } @@ -885,7 +941,7 @@ char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg faidx1_t val; // Adjust position - if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) { + if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { return NULL; } diff --git a/hts.c b/hts.c index 8b437f2b9..cead9d537 100644 --- a/hts.c +++ b/hts.c @@ -1,6 +1,6 @@ /* hts.c -- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2022 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2023 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. Author: Heng Li @@ -1934,8 +1934,9 @@ char **hts_readlist(const char *string, int is_file, int *_n) if ( !fp ) return NULL; kstring_t str; + int ret; str.s = 0; str.l = str.m = 0; - while (bgzf_getline(fp, '\n', &str) >= 0) + while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; if (hts_resize(char*, n + 1, &m, &s, 0) < 0) @@ -1945,6 +1946,8 @@ char **hts_readlist(const char *string, int is_file, int *_n) goto err; n++; } + if (ret < -1) // Read error + goto err; bgzf_close(fp); free(str.s); } @@ -1991,8 +1994,9 @@ char **hts_readlines(const char *fn, int *_n) BGZF *fp = bgzf_open(fn, "r"); if ( fp ) { // read from file kstring_t str; + int ret; str.s = 0; str.l = str.m = 0; - while (bgzf_getline(fp, '\n', &str) >= 0) { + while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; if (hts_resize(char *, n + 1, &m, &s, 0) < 0) goto err; @@ -2001,6 +2005,8 @@ char **hts_readlines(const char *fn, int *_n) goto err; n++; } + if (ret < -1) // Read error + goto err; bgzf_close(fp); free(str.s); } else if (*fn == ':') { // read from string @@ -2354,9 +2360,9 @@ int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) return 0; if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos - " cannot be stored in a csi index. " - "Please check headers match the data", + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" " + "cannot be stored in a csi index with these parameters. " + "Please use a larger min_shift or depth", beg, end); } else { hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos @@ -3077,16 +3083,14 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t free(iter); iter = NULL; } + } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { + iter->finished = 1; } else { if (beg < 0) beg = 0; if (end < beg) { free(iter); return NULL; } - if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { - free(iter); - return NULL; - } k = kh_get(bin, bidx, META_BIN(idx)); if (k != kh_end(bidx)) diff --git a/htscodecs b/htscodecs index 3ef17f6fb..cd0737fff 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 3ef17f6fb5b8b6b0ad2d4c1c562165664f0703f8 +Subproject commit cd0737fff5893b0842b047da5aa3209e5f65442c diff --git a/htsfile.1 b/htsfile.1 index cf87bef3d..9e0b0b603 100644 --- a/htsfile.1 +++ b/htsfile.1 @@ -1,4 +1,4 @@ -.TH htsfile 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools" +.TH htsfile 1 "21 February 2023" "htslib-1.17" "Bioinformatics tools" .SH NAME htsfile \- identify high-throughput sequencing data files .\" diff --git a/htsfile.c b/htsfile.c index 0391a98fa..9f7bf4531 100644 --- a/htsfile.c +++ b/htsfile.c @@ -258,7 +258,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2022 Genome Research Ltd.\n", +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 9f77cb585..b37eacac3 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -1,4 +1,4 @@ -.TH htslib-s3-plugin 7 "18 August 2022" "htslib-1.16" "Bioinformatics tools" +.TH htslib-s3-plugin 7 "21 February 2023" "htslib-1.17" "Bioinformatics tools" .SH NAME s3 plugin \- htslib AWS S3 plugin .\" @@ -207,8 +207,8 @@ addressing_style or host_bucket. The first two can be set to \fBpath\fR while host_bucket must \fBnot\fR include the \fB%(bucket).s\fR string. .SH "SEE ALSO" -.BR htsfile (1) -.BR samtools (1) +.IR htsfile (1) +.IR samtools (1) .PP RFC 3339: .PP diff --git a/htslib.map b/htslib.map new file mode 100644 index 000000000..11411a6e3 --- /dev/null +++ b/htslib.map @@ -0,0 +1,632 @@ +HTSLIB_1.0 { + bam_aux2A; + bam_aux2Z; + bam_aux2f; + bam_aux2i; + bam_aux_append; + bam_aux_del; + bam_aux_get; + bam_cigar2qlen; + bam_cigar2rlen; + bam_copy1; + bam_destroy1; + bam_dup1; + bam_endpos; + bam_flag2str; + bam_hdr_read; + bam_hdr_write; + bam_init1; + bam_mplp_auto; + bam_mplp_destroy; + bam_mplp_init; + bam_mplp_init_overlaps; + bam_mplp_set_maxcnt; + bam_plp_auto; + bam_plp_destroy; + bam_plp_init; + bam_plp_next; + bam_plp_push; + bam_plp_reset; + bam_plp_set_maxcnt; + bam_read1; + bam_str2flag; + bam_write1; + bcf_add_filter; + bcf_calc_ac; + bcf_clear; + bcf_destroy; + bcf_dup; + bcf_enc_vchar; + bcf_enc_vfloat; + bcf_enc_vint; + bcf_float_missing; + bcf_float_vector_end; + bcf_fmt_array; + bcf_fmt_sized_array; + bcf_get_fmt; + bcf_get_format_string; + bcf_get_format_values; + bcf_get_info; + bcf_get_info_values; + bcf_get_variant_type; + bcf_get_variant_types; + bcf_gt_type; + bcf_has_filter; + bcf_hdr_add_hrec; + bcf_hdr_add_sample; + bcf_hdr_append; + bcf_hdr_combine; + bcf_hdr_destroy; + bcf_hdr_dup; + bcf_hdr_fmt_text; + bcf_hdr_get_hrec; + bcf_hdr_get_version; + bcf_hdr_id2int; + bcf_hdr_init; + bcf_hdr_parse; + bcf_hdr_parse_line; + bcf_hdr_printf; + bcf_hdr_read; + bcf_hdr_remove; + bcf_hdr_seqnames; + bcf_hdr_set; + bcf_hdr_set_samples; + bcf_hdr_set_version; + bcf_hdr_subset; + bcf_hdr_sync; + bcf_hdr_write; + bcf_hrec_add_key; + bcf_hrec_destroy; + bcf_hrec_dup; + bcf_hrec_find_key; + bcf_hrec_format; + bcf_hrec_set_val; + bcf_index_build; + bcf_init; + bcf_is_snp; + bcf_read; + bcf_readrec; + bcf_remove_alleles; + bcf_remove_filter; + bcf_sr_add_reader; + bcf_sr_destroy; + bcf_sr_init; + bcf_sr_next_line; + bcf_sr_regions_destroy; + bcf_sr_regions_flush; + bcf_sr_regions_init; + bcf_sr_regions_next; + bcf_sr_regions_overlap; + bcf_sr_regions_seek; + bcf_sr_remove_reader; + bcf_sr_seek; + bcf_sr_set_regions; + bcf_sr_set_samples; + bcf_sr_set_targets; + bcf_subset; + bcf_subset_format; + bcf_sweep_bwd; + bcf_sweep_destroy; + bcf_sweep_fwd; + bcf_sweep_hdr; + bcf_sweep_init; + bcf_translate; + bcf_trim_alleles; + bcf_type_shift; + bcf_unpack; + bcf_update_alleles; + bcf_update_alleles_str; + bcf_update_filter; + bcf_update_format; + bcf_update_format_string; + bcf_update_id; + bcf_update_info; + bcf_write; + bgzf_check_EOF; + bgzf_close; + bgzf_dopen; + bgzf_flush; + bgzf_flush_try; + bgzf_getc; + bgzf_getline; + bgzf_hopen; + bgzf_index_build_init; + bgzf_index_dump; + bgzf_index_load; + bgzf_is_bgzf; + bgzf_mt; + bgzf_open; + bgzf_raw_read; + bgzf_raw_write; + bgzf_read; + bgzf_read_block; + bgzf_seek; + bgzf_set_cache_size; + bgzf_useek; + bgzf_utell; + bgzf_write; + cram_close; + cram_compress_block; + cram_dopen; + cram_eof; + cram_flush; + cram_free_block; + cram_free_container; + cram_new_block; + cram_new_container; + cram_open; + cram_read_block; + cram_read_container; + cram_seek; + cram_set_header; + cram_set_option; + cram_set_voption; + cram_uncompress_block; + cram_write_block; + cram_write_container; + fai_build; + fai_destroy; + fai_fetch; + fai_load; + faidx_fetch_nseq; + faidx_fetch_seq; + faidx_has_seq; + hclose; + hclose_abruptly; + hdopen; + hfile_destroy; + hfile_init; + hfile_oflags; + hflush; + hgetc2; + hopen; + hpeek; + hputc2; + hputs2; + hread2; + hrec_add_idx; + hseek; + hts_close; + hts_file_type; + hts_get_bgzfp; + hts_getline; + hts_idx_destroy; + hts_idx_finish; + hts_idx_get_meta; + hts_idx_get_n_no_coor; + hts_idx_get_stat; + hts_idx_init; + hts_idx_load; + hts_idx_push; + hts_idx_save; + hts_idx_seqnames; + hts_idx_set_meta; + hts_itr_destroy; + hts_itr_next; + hts_itr_query; + hts_itr_querys; + hts_open; + hts_parse_reg; + hts_readlines; + hts_readlist; + hts_set_fai_filename; + hts_set_threads; + hts_verbose; + hts_version; + hwrite2; + kf_betai; + kf_erfc; + kf_gammap; + kf_gammaq; + kf_lgamma; + kmemmem; + knet_close; + knet_dopen; + knet_open; + knet_read; + knet_seek; + ksplit_core; + ksprintf; + kstrnstr; + kstrstr; + kstrtok; + kt_fisher_exact; + kvsprintf; + sam_format1; + sam_hdr_add_lines; + sam_hdr_dup; + sam_hdr_incr_ref; + sam_hdr_length; + sam_hdr_parse; + sam_hdr_read; + sam_hdr_str; + sam_hdr_write; + sam_index_load; + sam_itr_queryi; + sam_itr_querys; + sam_open_mode; + sam_parse1; + sam_read1; + sam_write1; + seq_nt16_str; + seq_nt16_table; + stringify_argv; + tbx_conf_bed; + tbx_conf_gff; + tbx_conf_psltbl; + tbx_conf_sam; + tbx_conf_vcf; + tbx_destroy; + tbx_index; + tbx_index_build; + tbx_index_load; + tbx_name2id; + tbx_readrec; + tbx_seqnames; + vcf_format; + vcf_hdr_read; + vcf_hdr_write; + vcf_parse; + vcf_read; + vcf_write; + vcf_write_line; +}; + +HTSLIB_1.1 { + bcf_get_fmt_id; + bcf_get_info_id; + faidx_iseq; + faidx_nseq; + faidx_seq_len; +} HTSLIB_1.0; + + +HTSLIB_1.2.1 { + bcf_copy; + bcf_sr_strerror; + hisremote; + hts_detect_format; + hts_format_description; + hts_get_format; + hts_hopen; + hts_set_opt; + regidx_destroy; + regidx_init; + regidx_insert; + regidx_nregs; + regidx_overlap; + regidx_parse_bed; + regidx_parse_tab; + regidx_seq_names; + regidx_seq_nregs; + seq_nt16_int; +} HTSLIB_1.1; + +HTSLIB_1.3 { + bcf_add_id; + bcf_empty; + bcf_hdr_merge; + bcf_index_build2; + bcf_index_load2; + bcf_remove_allele_set; + bgzf_compress; + cram_block_append; + cram_block_get_comp_size; + cram_block_get_content_id; + cram_block_get_content_type; + cram_block_get_crc32; + cram_block_get_data; + cram_block_get_offset; + cram_block_get_uncomp_size; + cram_block_set_comp_size; + cram_block_set_content_id; + cram_block_set_crc32; + cram_block_set_data; + cram_block_set_offset; + cram_block_set_uncomp_size; + cram_block_size; + cram_block_update_size; + cram_container_get_landmarks; + cram_container_get_length; + cram_container_get_num_blocks; + cram_container_is_empty; + cram_container_set_landmarks; + cram_container_set_length; + cram_container_set_num_blocks; + cram_container_size; + cram_copy_slice; + cram_fd_get_fp; + cram_fd_get_header; + cram_fd_get_version; + cram_fd_set_fp; + cram_fd_set_header; + cram_fd_set_version; + cram_major_vers; + cram_minor_vers; + cram_store_container; + cram_transcode_rg; + hfile_add_scheme_handler; + hfile_always_local; + hfile_always_remote; + hts_format_file_extension; + hts_idx_load2; + hts_idx_save_as; + hts_md5_destroy; + hts_md5_final; + hts_md5_hex; + hts_md5_init; + hts_md5_reset; + hts_md5_update; + hts_open_format; + hts_opt_add; + hts_opt_apply; + hts_opt_free; + hts_parse_decimal; + hts_parse_format; + hts_parse_opt_list; + int32_put_blk; + kgetline; + sam_index_build; + sam_index_build2; + sam_index_load2; + sam_open_mode_opts; + tbx_index_build2; + tbx_index_load2; +} HTSLIB_1.2.1; + +HTSLIB_1.4 { + bam_auxB2f; + bam_auxB2i; + bam_auxB_len; + bam_aux_update_str; + bam_mplp_constructor; + bam_mplp_destructor; + bam_mplp_reset; + bam_plp_constructor; + bam_plp_destructor; + bcf_hdr_format; + bcf_index_build3; + bcf_sr_destroy_threads; + bcf_sr_set_opt; + bcf_sr_set_threads; + bgzf_block_write; + bgzf_compression; + bgzf_index_dump_hfile; + bgzf_index_load_hfile; + bgzf_thread_pool; + cram_check_EOF; + cram_get_refs; + errmod_cal; + errmod_destroy; + errmod_init; + fai_build3; + fai_load3; + hgetdelim; + hgets; + hts_check_EOF; + hts_json_fnext; + hts_json_fskip_value; + hts_json_snext; + hts_json_sskip_value; + hts_realloc_or_die; + hts_set_cache_size; + hts_set_thread_pool; + hts_tpool_delete_result; + hts_tpool_destroy; + hts_tpool_dispatch; + hts_tpool_dispatch2; + hts_tpool_init; + hts_tpool_kill; + hts_tpool_next_result; + hts_tpool_next_result_wait; + hts_tpool_process_attach; + hts_tpool_process_destroy; + hts_tpool_process_detach; + hts_tpool_process_empty; + hts_tpool_process_flush; + hts_tpool_process_init; + hts_tpool_process_len; + hts_tpool_process_qsize; + hts_tpool_process_ref_decr; + hts_tpool_process_ref_incr; + hts_tpool_process_reset; + hts_tpool_process_shutdown; + hts_tpool_process_sz; + hts_tpool_result_data; + hts_tpool_size; + hts_tpool_wake_dispatch; + kputd; + probaln_glocal; + sam_cap_mapq; + sam_index_build3; + sam_prob_realn; + tbx_index_build3; +} HTSLIB_1.3; + +HTSLIB_1.5 { + hfile_set_blksize; + hts_get_log_level; + hts_log; + hts_set_log_level; +} HTSLIB_1.4; + +HTSLIB_1.6 { + hts_drand48; + hts_erand48; + hts_lrand48; + hts_srand48; +} HTSLIB_1.5; + +HTSLIB_1.7 { + hfile_mem_get_buffer; + hfile_mem_steal_buffer; + hts_itr_multi_bam; + hts_itr_multi_cram; + hts_itr_multi_next; + hts_itr_regions; + hts_json_alloc_token; + hts_json_free_token; + hts_json_token_str; + hts_json_token_type; + hts_reglist_free; + sam_hdr_change_HD; + sam_itr_regions; +} HTSLIB_1.6; + +HTSLIB_1.9 { + bam_aux_update_array; + bam_aux_update_float; + bam_aux_update_int; + fai_fetchqual; + fai_load3_format; + fai_load_format; + faidx_fetch_qual; +} HTSLIB_1.7; + +HTSLIB_1.10 { + bam_cigar_table; + bam_mplp64_auto; + bam_plp64_auto; + bam_plp64_next; + bam_plp_insertion; + bam_set_qname; + bcf_idx_init; + bcf_idx_save; + bcf_index_load3; + bgzf_peek; + fai_fetch64; + fai_fetchqual64; + fai_parse_region; + fai_set_cache_size; + faidx_fetch_qual64; + faidx_fetch_seq64; + haddextension; + hts_free; + hts_idx_fmt; + hts_idx_load3; + hts_idx_tbi_name; + hts_parse_reg64; + hts_parse_region; + hts_reglist_create; + hts_resize_array_; + hts_tpool_dispatch3; + kgetline2; + regidx_init_string; + regidx_insert_list; + regidx_parse_reg; + regidx_parse_vcf; + regidx_push; + regitr_copy; + regitr_destroy; + regitr_init; + regitr_loop; + regitr_overlap; + regitr_reset; + sam_hdr_add_line; + sam_hdr_add_pg; + sam_hdr_count_lines; + sam_hdr_destroy; + sam_hdr_find_line_id; + sam_hdr_find_line_pos; + sam_hdr_find_tag_id; + sam_hdr_find_tag_pos; + sam_hdr_init; + sam_hdr_line_index; + sam_hdr_line_name; + sam_hdr_name2tid; + sam_hdr_nref; + sam_hdr_pg_id; + sam_hdr_remove_except; + sam_hdr_remove_line_id; + sam_hdr_remove_line_pos; + sam_hdr_remove_lines; + sam_hdr_remove_tag_id; + sam_hdr_tid2len; + sam_hdr_tid2name; + sam_hdr_update_line; + sam_idx_init; + sam_idx_save; + sam_index_load3; + sam_itr_regarray; + sam_parse_region; + tbx_index_load3; +} HTSLIB_1.9; + +HTSLIB_1.11 { + fai_path; + hts_lib_shutdown; + hts_tpool_process_is_shutdown; + vcf_open_mode; +} HTSLIB_1.10; + +HTSLIB_1.12 { + bam_parse_cigar; + bam_set1; + hfile_has_plugin; + hfile_list_plugins; + hfile_list_schemes; + hts_feature_string; + hts_features; + hts_filter_eval; + hts_filter_free; + hts_filter_init; + hts_set_filter_expression; + hts_test_feature; + sam_parse_cigar; + sam_passes_filter; +} HTSLIB_1.11; + +HTSLIB_1.13 { + hts_idx_nseq; +} HTSLIB_1.12; + +HTSLIB_1.14 { + bam_mods_at_next_pos; + bam_mods_at_qpos; + bam_next_basemod; + bam_parse_basemod; + bam_plp_insertion_mod; + hts_base_mod_state_alloc; + hts_base_mod_state_free; + hts_flush; +} HTSLIB_1.13; + +HTSLIB_1.15 { + hts_detect_format2; +} HTSLIB_1.14; + +HTSLIB_1.16 { + bam_mods_query_type; + bam_mods_recorded; + bcf_has_variant_type; + bcf_has_variant_types; + bcf_variant_length; + cram_decode_slice_header; + cram_free_slice_header; + cram_slice_hdr_get_coords; + cram_slice_hdr_get_embed_ref_id; + cram_slice_hdr_get_num_blocks; + hts_filter_eval2; +} HTSLIB_1.15; + +HTSLIB_1.17 { + bam_aux_first; + bam_aux_next; + bam_aux_remove; + bcf_strerror; + cram_block_get_method; + cram_cid2ds_free; + cram_cid2ds_query; + cram_codec_describe; + cram_codec_get_content_ids; + cram_container_get_num_bases; + cram_container_get_num_records; + cram_decode_compression_header; + cram_describe_encodings; + cram_expand_method; + cram_free_compression_header; + cram_update_cid2ds_map; + fai_adjust_region; + fai_line_length; + faidx_seq_len64; +} HTSLIB_1.16; diff --git a/htslib/bgzf.h b/htslib/bgzf.h index c4ba85679..cb789ad53 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -303,7 +303,8 @@ typedef struct BGZF BGZF; * @param fp BGZF file handler * @param delim delimiter * @param str string to write to; must be initialized - * @return length of the string; -1 on end-of-file; <= -2 on error + * @return length of the string (capped at INT_MAX); + * -1 on end-of-file; <= -2 on error */ HTSLIB_EXPORT int bgzf_getline(BGZF *fp, int delim, struct kstring_t *str); @@ -315,23 +316,22 @@ typedef struct BGZF BGZF; int bgzf_read_block(BGZF *fp) HTS_RESULT_USED; /** - * Enable multi-threading (when compiled with -DBGZF_MT) via a shared - * thread pool. This means both encoder and decoder can balance - * usage across a single pool of worker jobs. + * Enable multi-threading via a shared thread pool. This means + * both encoder and decoder can balance usage across a single pool + * of worker jobs. * - * @param fp BGZF file handler; must be opened for writing + * @param fp BGZF file handler * @param pool The thread pool (see hts_create_threads) */ HTSLIB_EXPORT int bgzf_thread_pool(BGZF *fp, struct hts_tpool *pool, int qsize); /** - * Enable multi-threading (only effective when the library was compiled - * with -DBGZF_MT) + * Enable multi-threading * - * @param fp BGZF file handler; must be opened for writing - * @param n_threads #threads used for writing - * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + * @param fp BGZF file handler + * @param n_threads #threads used for reading / writing + * @param n_sub_blks Unused (was #blocks processed by each thread) */ HTSLIB_EXPORT int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); diff --git a/htslib/cram.h b/htslib/cram.h index 8dc6fe1b3..e0b51839c 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield @@ -76,8 +76,59 @@ enum cram_block_method { RANS1 = 10, GZIP_RLE = 11, }; +#else + +// Values as defined in the CRAM specifications. +// See cram/cram_structs.h cram_block_method_int for an expanded version of +// this with local specialisations assigned to codes. +enum cram_block_method { + CRAM_COMP_UNKNOWN = -1, + + // CRAM 2.x and 3.0 + CRAM_COMP_RAW = 0, + CRAM_COMP_GZIP = 1, + CRAM_COMP_BZIP2 = 2, + + // CRAM 3.0 + CRAM_COMP_LZMA = 3, + CRAM_COMP_RANS4x8 = 4, // 4-way interleaving, 8-bit renormalisation + + // CRAM 3.1 + CRAM_COMP_RANSNx16 = 5, // both 4x16 and 32x16 variants, plus transforms + CRAM_COMP_ARITH = 6, // aka Range coding + CRAM_COMP_FQZ = 7, // FQZComp + CRAM_COMP_TOK3 = 8, // Name tokeniser +}; #endif +/* NOTE this structure may be expanded in future releases by appending + * additional fields. + * + * Do not assume the size is fixed and avoid using arrays of this struct. + */ +typedef struct { + enum cram_block_method method; + + // Generic compression level if known (0 if not). + // 1 or 9 for gzip min/max flag (else 5). 1-9 for bzip2 + // 1 or 11 for for tok3 (rans/arith encoder). + int level; + + // For rans* and arith codecs + int order; + + // ransNx16/arith specific + int rle; + int pack; + int stripe; + int cat; + int nosz; + int Nway; + + // Arithmetic coder only + int ext; // external: use gz, xz or bzip2 +} cram_method_details; + enum cram_content_type { CT_ERROR = -1, FILE_HEADER = 0, @@ -97,6 +148,7 @@ typedef struct cram_slice cram_slice; typedef struct cram_metrics cram_metrics; typedef struct cram_block_slice_hdr cram_block_slice_hdr; typedef struct cram_block_compression_hdr cram_block_compression_hdr; +typedef struct cram_codec cram_codec; typedef struct refs_t refs_t; struct hFILE; @@ -147,6 +199,10 @@ int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) HTSLIB_EXPORT void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, int32_t *landmarks); +HTSLIB_EXPORT +int32_t cram_container_get_num_records(cram_container *c); +HTSLIB_EXPORT +int64_t cram_container_get_num_bases(cram_container *c); /* Returns true if the container is empty (EOF marker) */ HTSLIB_EXPORT @@ -167,9 +223,14 @@ HTSLIB_EXPORT int32_t cram_block_get_crc32(cram_block *b); HTSLIB_EXPORT void * cram_block_get_data(cram_block *b); - HTSLIB_EXPORT enum cram_content_type cram_block_get_content_type(cram_block *b); +HTSLIB_EXPORT +enum cram_block_method cram_block_get_method(cram_block *b); + +HTSLIB_EXPORT +cram_method_details *cram_expand_method(uint8_t *data, int32_t size, + enum cram_block_method comp); HTSLIB_EXPORT void cram_block_set_content_id(cram_block *b, int32_t id); @@ -200,6 +261,27 @@ void cram_block_set_offset(cram_block *b, size_t offset); HTSLIB_EXPORT uint32_t cram_block_size(cram_block *b); +/* + * Returns the Block Content ID values referred to by a cram_codec in + * ids[2]. + * + * -2 is unused. + * -1 is CORE + * >= 0 is the block with that Content ID + */ +HTSLIB_EXPORT +void cram_codec_get_content_ids(cram_codec *c, int ids[2]); + +/* + * Produces a human readable description of the codec parameters. + * This is appended to an existing kstring 'ks'. + * + * Returns 0 on succes, + * <0 on failure + */ +HTSLIB_EXPORT +int cram_codec_describe(cram_codec *c, kstring_t *ks); + /* * Renumbers RG numbers in a cram compression header. * @@ -247,6 +329,66 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); +/* + * Decodes a CRAM block compression header. + * Returns header ptr on success + * NULL on failure + */ +HTSLIB_EXPORT +cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, + cram_block *b); +/* + * Frees a cram_block_compression_hdr structure. + */ +HTSLIB_EXPORT +void cram_free_compression_header(cram_block_compression_hdr *hdr); + +typedef struct cram_cid2ds_t cram_cid2ds_t; + +/* + * Map cram block numbers to data-series. It's normally a 1:1 mapping, + * but in rare cases it can be 1:many (or even many:many). + * The key is the block number and the value is an index into the data-series + * array, which we iterate over until reaching a negative value. + * + * Provide cid2ds as NULL to allocate a new map or pass in an existing one + * to append to this map. The new (or existing) map is returned. + * + * Returns the cid2ds (newly allocated or as provided) on success, + * NULL on failure. + */ +HTSLIB_EXPORT +cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr, + cram_cid2ds_t *cid2ds); + +/* + * Return a list of data series observed as belonging to a block with + * the specified content_id. *n is the number of data series + * returned, or 0 if block is unused. + * Block content_id of -1 is used to indicate the CORE block. + * + * The pointer returned is owned by the cram_cid2ds state and should + * not be freed by the caller. + */ +HTSLIB_EXPORT +int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n); + +/* + * Frees a cram_cid2ds_t allocated by cram_update_cid2ds_map + */ +HTSLIB_EXPORT +void cram_cid2ds_free(cram_cid2ds_t *cid2ds); + +/* + * Produces a description of the record and tag encodings held within + * a compression header and appends to 'ks'. + * + * Returns 0 on success, + * <0 on failure. + */ +HTSLIB_EXPORT +int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks); + /* *----------------------------------------------------------------------------- * cram slice interrogation diff --git a/htslib/faidx.h b/htslib/faidx.h index 149cebd2e..c1b3090a5 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -1,7 +1,7 @@ /// @file htslib/faidx.h /// FASTA random access. /* - Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022 Genome Research Ltd. Author: Heng Li @@ -188,6 +188,15 @@ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); HTSLIB_EXPORT char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len); +/// Query the line-wrap length for a chromosome specified as part of a region +/** @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @return The line length (excluding newline), + negative on error. +*/ +HTSLIB_EXPORT +hts_pos_t fai_line_length(const faidx_t *fai, const char *reg); + /// Fetch the quality string for a region for FASTQ files /** @param fai Pointer to the faidx_t struct @param reg Region in the format "chr2:20,000-30,000" @@ -283,7 +292,22 @@ int faidx_nseq(const faidx_t *fai); HTSLIB_EXPORT const char *faidx_iseq(const faidx_t *fai, int i); -/// Return sequence length, -1 if not present +/// Return sequence length +/** @param fai Pointer to the faidx_t struct + @param seq Name of the sequence + @return Sequence length, or -1 if not present +*/ +HTSLIB_EXPORT +hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq); + +/// Return sequence length +/** @param fai Pointer to the faidx_t struct + @param seq Name of the sequence + @return Sequence length, or -1 if not present + + @deprecated This funtion cannot handle very long sequences. + Use faidx_seq_len64() instead. +*/ HTSLIB_EXPORT int faidx_seq_len(const faidx_t *fai, const char *seq); @@ -305,6 +329,27 @@ const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, hts_pos_t *beg, hts_pos_t *end, int flags); +/// Adjust region to the actual sequence length +/** @param fai Pointer to the faidx_t struct + @param tid Sequence index, as returned by fai_parse_region() + @param beg[in,out] The start of the region (0 based) + @param end[in,out] One past end of the region (0 based) + @return 1, 2, or 3 if @p beg, @p end, or both are adjusted, + 0 if @p beg and @p end are unchanged + -1 on error + + Looks up the length of @p tid, and then adjusts the values of @p beg + and @p end if they fall outside the boundaries of the sequence. + + If @p beg > @p end, it will be set to @p end. + + The return value indicates which, if any, of the inputs have been + adjusted. -1 will be returned if @p tid is not a valid sequence index. +*/ +HTSLIB_EXPORT +int fai_adjust_region(const faidx_t *fai, int tid, + hts_pos_t *beg, hts_pos_t *end); + /// Sets the cache size of the underlying BGZF compressed file /** @param fai Pointer to the faidx_t struct * @param cache_size Selected cache size in bytes diff --git a/htslib/hts.h b/htslib/hts.h index 38246cd9c..403152d0c 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -489,7 +489,7 @@ const char *hts_version(void); // Immediately after release, bump ZZ to 90 to distinguish in-development // Git repository builds from the release; you may wish to increment this // further when significant features are merged. -#define HTS_VERSION 101600 +#define HTS_VERSION 101700 /*! @abstract Introspection on the features enabled in htslib * diff --git a/htslib/sam.h b/htslib/sam.h index 5f8c0a554..514a6be04 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -189,7 +189,7 @@ extern const int8_t bam_cigar_table[256]; * Mate position and insert size also need to be 64-bit, but * we won't accept more than 32-bit for tid. * - * The bam_core_t structure is the *in memory* layout and not + * The bam1_core_t structure is the *in memory* layout and not * the same as the on-disk format. 64-bit changes here permit * SAM to work with very long chromosomes and permit BAM and CRAM * to seamlessly update in the future without further API/ABI @@ -1438,7 +1438,6 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, /// Converts a BAM aux tag to SAM format /* - * @param b Pointer to the bam record * @param key Two letter tag key * @param type Single letter type code: ACcSsIifHZB. * @param tag Tag data pointer, in BAM format @@ -1628,6 +1627,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, return NULL; } +/// Return a pointer to a BAM record's first aux field +/** @param b Pointer to the BAM record + @return Aux field pointer, or NULL if the record has none + +When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers" +point to the TYPE byte within the auxiliary data for that field; but in general +it is unnecessary for user code to be aware of this.) + */ +HTSLIB_EXPORT +uint8_t *bam_aux_first(const bam1_t *b); + +/// Return a pointer to a BAM record's next aux field +/** @param b Pointer to the BAM record + @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return Pointer to the next aux field, or NULL if no next field or error + +Whenever NULL is returned, errno will also be set: ENOENT if @p s was the +record's last aux field; otherwise EINVAL, indicating that the BAM record's +aux data is corrupt. + */ +HTSLIB_EXPORT +uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s); + /// Return a pointer to an aux record /** @param b Pointer to the bam record @param tag Desired aux tag @@ -1640,6 +1662,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, HTSLIB_EXPORT uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); +/// Return the aux field's 2-character tag +/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return Pointer to the tag characters, NOT NUL-terminated + */ +static inline +const char *bam_aux_tag(const uint8_t *s) { return (const char *) (s-2); } + +/// Return the aux field's type character +/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return The type character: one of cCsSiI/fd/A/Z/H/B + */ +static inline char bam_aux_type(const uint8_t *s) { return *s; } + /// Return a SAM formatting string containing a BAM tag /** @param b Pointer to the bam record @param tag Desired aux tag @@ -1742,15 +1777,33 @@ HTSLIB_EXPORT int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data); /// Delete tag data from a bam record -/* @param b The bam record to update - @param s Pointer to the tag to delete, as returned by bam_aux_get(). - @return 0 on success; -1 on failure - If the bam record's aux data is corrupt, errno is set to EINVAL and this - function returns -1; +/** @param b The BAM record to update + @param s Pointer to the aux field to delete, as returned by bam_aux_get() + Must not be NULL + @return 0 on success; -1 on failure + +If the BAM record's aux data is corrupt, errno is set to EINVAL and this +function returns -1. */ HTSLIB_EXPORT int bam_aux_del(bam1_t *b, uint8_t *s); +/// Delete an aux field from a BAM record +/** @param b The BAM record to update + @param s Pointer to the aux field to delete, as returned by + bam_aux_first()/_next()/_get(); must not be NULL + @return Pointer to the following aux field, or NULL if none or on error + +Identical to @c bam_aux_del() apart from the return value, which is an +aux iterator suitable for use with @c bam_aux_next()/etc. + +Whenever NULL is returned, errno will also be set: ENOENT if the aux field +deleted was the record's last one; otherwise EINVAL, indicating that the +BAM record's aux data is corrupt. + */ +HTSLIB_EXPORT +uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s); + /// Update or add a string-type tag /* @param b The bam record to update @param tag Tag identifier diff --git a/htslib/vcf.h b/htslib/vcf.h index c94bea589..0d9f812ce 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2020, 2022 Genome Research Ltd. Author: Heng Li @@ -205,6 +205,22 @@ typedef struct bcf_dec_t { #define BCF_ERR_CTG_INVALID 32 #define BCF_ERR_TAG_INVALID 64 +/// Get error description for bcf error code +/** @param errorcode The error code which is to be described + @param buffer The buffer in which description to be added + @param maxbuffer The size of buffer passed + @return NULL on invalid buffer; buffer on other cases + +The buffer will be an empty string when @p errorcode is 0. +Description of errors present in code will be appended to @p buffer with ',' separation. +The buffer has to be at least 4 characters long. NULL will be returned if it is smaller or when buffer is NULL. + +'...' will be appended if the description doesn't fit in the given buffer. + */ + +HTSLIB_EXPORT +const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer); + /* The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file is slower because the string is first to be parsed, packed into BCF line @@ -620,7 +636,11 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap); - /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */ + /** + * Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names). + * NB: sequence name indexes returned by bcf_hdr_seqnames() may not correspond to bcf1_t.rid, use + * bcf_hdr_id2name() or bcf_seqname() instead. + */ HTSLIB_EXPORT const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs); @@ -1210,7 +1230,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12) #define bcf_hdr_id2type(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) - #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) + #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && (int_id)<(hdr)->n[BCF_DT_ID] && (hdr)->id[BCF_DT_ID][int_id].val && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) /// Convert BCF FORMAT data to string form /** diff --git a/sam.c b/sam.c index c95d1c693..e2e539b2d 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2023 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -3853,7 +3853,6 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { } // Name - if (*x->name.s != x->nprefix) return -2; @@ -3893,8 +3892,8 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0) if (fp->format.format == fastq_format || ret < -1) return -2; - if (*fp->line.s == (fp->format.format == fastq_format ? '+' : '>') - || ret == -1) + if (ret == -1 || + *fp->line.s == (fp->format.format == fastq_format ? '+' : '>')) break; if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0) return -2; @@ -4286,13 +4285,24 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) bc ? (char *)bc+1 : "0") < 0) return -1; + if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { + hts_log_warning("BC tag starts with non-sequence base; using '0'"); + str->l -= strlen((char *)bc)-2; // limit to 1 char + str->s[str->l-1] = '0'; + str->s[str->l] = 0; + bc = NULL; + } + // Replace any non-alpha with '+'. Ie seq-seq to seq+seq if (bc) { int l = strlen((char *)bc+1); char *c = (char *)str->s + str->l - l; - for (i = 0; i < l; i++) + for (i = 0; i < l; i++) { if (!isalpha_c(c[i])) c[i] = '+'; + else if (islower_c(c[i])) + c[i] = toupper_c(c[i]); + } } } @@ -4614,31 +4624,42 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) } } +uint8_t *bam_aux_first(const bam1_t *b) +{ + uint8_t *s = bam_get_aux(b); + uint8_t *end = b->data + b->l_data; + if (s >= end) { errno = ENOENT; return NULL; } + return s+2; +} + +uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) +{ + uint8_t *end = b->data + b->l_data; + uint8_t *next = s? skip_aux((uint8_t *) s, end) : end; + if (next == NULL) goto bad_aux; + if (next >= end) { errno = ENOENT; return NULL; } + return next+2; + + bad_aux: + hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); + errno = EINVAL; + return NULL; +} + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { - uint8_t *s, *end, *t = (uint8_t *) tag; - uint16_t y = (uint16_t) t[0]<<8 | t[1]; - s = bam_get_aux(b); - end = b->data + b->l_data; - while (s != NULL && end - s >= 3) { - uint16_t x = (uint16_t) s[0]<<8 | s[1]; - s += 2; - if (x == y) { + uint8_t *s; + for (s = bam_aux_first(b); s; s = bam_aux_next(b, s)) + if (s[-2] == tag[0] && s[-1] == tag[1]) { // Check the tag value is valid and complete - uint8_t *e = skip_aux(s, end); - if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') { - goto bad_aux; // Unterminated string - } - if (e != NULL) { - return s; - } else { - goto bad_aux; - } + uint8_t *e = skip_aux(s, b->data + b->l_data); + if (e == NULL) goto bad_aux; + if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux; + + return s; } - s = skip_aux(s, end); - } - if (s == NULL) goto bad_aux; - errno = ENOENT; + + // errno now as set by bam_aux_first()/bam_aux_next() return NULL; bad_aux: @@ -4647,23 +4668,28 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) return NULL; } -// s MUST BE returned by bam_aux_get() int bam_aux_del(bam1_t *b, uint8_t *s) { - uint8_t *p, *aux; - int l_aux = bam_get_l_aux(b); - aux = bam_get_aux(b); - p = s - 2; - s = skip_aux(s, aux + l_aux); - if (s == NULL) goto bad_aux; - memmove(p, s, l_aux - (s - aux)); - b->l_data -= s - p; - return 0; + s = bam_aux_remove(b, s); + return (s || errno == ENOENT)? 0 : -1; +} + +uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) +{ + uint8_t *end = b->data + b->l_data; + uint8_t *next = skip_aux(s, end); + if (next == NULL) goto bad_aux; + + b->l_data -= next - (s-2); + if (next >= end) { errno = ENOENT; return NULL; } + + memmove(s-2, next, end - next); + return s; bad_aux: hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); errno = EINVAL; - return -1; + return NULL; } int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) @@ -5238,9 +5264,24 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation int op2 = _cop(cigar[s->k+1]); int l2 = _cln(cigar[s->k+1]); - if (op2 == BAM_CDEL) p->indel = -(int)l2; - else if (op2 == BAM_CINS) p->indel = l2; - else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding + if (op2 == BAM_CDEL && op != BAM_CDEL) { + // At start of a new deletion, merge e.g. 1D2D to 3D. + // Within a deletion (the 2D in 1D2D) we keep p->indel=0 + // and rely on is_del=1 as we would for 3D. + p->indel = -(int)l2; + for (k = s->k+2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CDEL) p->indel -= l2; + else break; + } + } else if (op2 == BAM_CINS) { + p->indel = l2; + for (k = s->k+2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CINS) p->indel += l2; + else if (op2 != BAM_CPAD) break; + } + } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { int l3 = 0; for (k = s->k + 2; k < c->n_cigar; ++k) { op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); @@ -5326,8 +5367,10 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, break; case BAM_CINS: for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) { - c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), - p->qpos + j - p->is_del)]; + c = p->qpos + j - p->is_del < p->b->core.l_qseq + ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), + p->qpos + j - p->is_del)] + : 'N'; ins->s[indel++] = c; int nm; hts_base_mod mod[256]; diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index c980e3b45..23e0ecaef 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -623,7 +623,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { if ( reader->file->format.format==vcf ) { - if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines + ret = hts_getline(reader->file, KS_SEP_LINE, &files->tmps); + if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) { files->errnum = vcf_parse_error; break; } } @@ -641,7 +643,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } else if ( reader->tbx_idx ) { - if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines + ret = tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps); + if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) { files->errnum = vcf_parse_error; break; } } diff --git a/tabix.1 b/tabix.1 index 15ac768e7..ce5a47a39 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,10 +1,10 @@ -.TH tabix 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools" +.TH tabix 1 "21 February 2023" "htslib-1.17" "Bioinformatics tools" .SH NAME .PP tabix \- Generic indexer for TAB-delimited genome position files .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2014, 2016, 2018, 2020 Genome Research Ltd. +.\" Copyright (C) 2014, 2016, 2018, 2020, 2022 Genome Research Ltd. .\" .\" Author: Heng Li .\" @@ -101,7 +101,7 @@ start column. [5] Force to overwrite the index file if it is present. .TP .BI "-m, --min-shift " INT -set minimal interval size for CSI indices to 2^INT [14] +Set minimal interval size for CSI indices to 2^INT [14] .TP .BI "-p, --preset " STR Input format for indexing. Valid values are: gff, bed, sam, vcf. @@ -199,6 +199,5 @@ implemented by Bob Handsaker and modified by Heng Li for remote file access and in-memory caching. .SH SEE ALSO -.PP -.BR bgzip (1), -.BR samtools (1) +.IR bgzip (1), +.IR samtools (1) diff --git a/tabix.c b/tabix.c index a79a7b968..0798b279f 100644 --- a/tabix.c +++ b/tabix.c @@ -581,7 +581,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: return usage(stdout, EXIT_SUCCESS); diff --git a/tbx.c b/tbx.c index 3af2c09fb..d897a21f1 100644 --- a/tbx.c +++ b/tbx.c @@ -1,6 +1,6 @@ /* tbx.c -- tabix API functions. - Copyright (C) 2009, 2010, 2012-2015, 2017-2020 Genome Research Ltd. + Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022 Genome Research Ltd. Copyright (C) 2010-2012 Broad Institute. Author: Heng Li @@ -91,9 +91,10 @@ int tbx_name2id(tbx_t *tbx, const char *ss) return get_tid(tbx, ss, 0); } -int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) +int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) { - int i, b = 0, id = 1; + size_t i, b = 0; + int id = 1; char *s; intv->ss = intv->se = 0; intv->beg = intv->end = -1; for (i = 0; i <= len; ++i) { @@ -321,8 +322,11 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) continue; } if (first == 0) { - if (fmt == HTS_FMT_CSI) + if (fmt == HTS_FMT_CSI) { + if (!max_ref_len) + max_ref_len = (int64_t)100*1024*1024*1024; // 100G default n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len); + } tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); if (!tbx->idx) goto fail; first = 1; diff --git a/test/faidx/ce.1.expected.fa b/test/faidx/ce.1.expected.fa new file mode 100644 index 000000000..d606105c4 --- /dev/null +++ b/test/faidx/ce.1.expected.fa @@ -0,0 +1,8 @@ +>CHROMOSOME_I:5001-5125 length: 125 +AACTGGTTCAAAAACAAAAATTTTTTAAACTGTACAAACTGTCCAAAAAT +TCGTCGTAAATCGACACACCCTTCTCATTTTTTCAAAATTTTAATTGTTT +TCGAATGTTTTTTTTGCAGAATAAT +>CHROMOSOME_X:101-225 length: 125 +GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC +CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCT +AAGCCTAAGCCTAAGCCTAAGCCTA diff --git a/test/faidx/faidx.1.expected.fa b/test/faidx/faidx.1.expected.fa new file mode 100644 index 000000000..d14656e9f --- /dev/null +++ b/test/faidx/faidx.1.expected.fa @@ -0,0 +1,6 @@ +>trailingblank2:28-33 length: 6 +GGGCCC +>trailingblank3:4-5 length: 2 +TA +>bar:4-5 length: 2 +TA diff --git a/test/faidx.fa b/test/faidx/faidx.fa similarity index 100% rename from test/faidx.fa rename to test/faidx/faidx.fa diff --git a/test/faidx/faidx.fa.expected.fai b/test/faidx/faidx.fa.expected.fai new file mode 100644 index 000000000..b4d1aff26 --- /dev/null +++ b/test/faidx/faidx.fa.expected.fai @@ -0,0 +1,6 @@ + 4 2 4 5 +trailingblank1 33 23 12 13 +trailingblank2 72 111 24 25 +trailingblank3 5 234 4 6 +foo 8 252 6 7 +bar 8 280 8 9 diff --git a/test/faidx/faidx.tst b/test/faidx/faidx.tst new file mode 100644 index 000000000..b6bd7cac1 --- /dev/null +++ b/test/faidx/faidx.tst @@ -0,0 +1,74 @@ +# Copyright (C) 2022 Genome Research Ltd. +# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. $bgzip and $test_faidx are replaced with the path to +# bgzip and test_faidx. + +# Index fasta +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -e faidx.fa.expected.fai + +# Test various functions on the fasta index +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_line_length -e 24 trailingblank2 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_has_seq -e 1 foo +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_has_seq -e 0 absent +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_iseq -e trailingblank3 3 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len -e 33 trailingblank1 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len64 -e 72 trailingblank2 + +# Index fastq +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -e fastqs.fq.expected.fai + +# Test various functions on the fastq index +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 63 FAKE0005_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 144 SRR014849.203935_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 1 SRR014849.203935_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 0 absent +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_iseq -e FAKE0005_1 0 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len -e 453 FSRRS4401CM938_1 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len64 -e 309 FSRRS4401AOV6A_4 + +# Fasta retrieval tests +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_fetch trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_fetch_seq64 trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_adjust_region trailingblank2:28-33 trailingblank3:4-5 bar:4-5 + +# Fastq retrieval tests +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_fetch FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t faidx_fetch_seq64 FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.2.expected.fa $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 + +# Indexing and retrieval on bgzip compressed fasta +INIT $bgzip -c < ../ce.fa > ce.fa.tmp.gz +P . $test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi -e ../ce.fa.fai +P ce.1.expected.fa $test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi CHROMOSOME_I:5001-5125 CHROMOSOME_X:101-225 diff --git a/test/faidx/fastqs.1.expected.fq b/test/faidx/fastqs.1.expected.fq new file mode 100644 index 000000000..729393837 --- /dev/null +++ b/test/faidx/fastqs.1.expected.fq @@ -0,0 +1,16 @@ +@FAKE0006_1:4-12 length: 9 +TGCATGCAT ++ +{zyxwvuts +@FSRRS4401BE7HA_1:81-120 length: 40 +GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca ++ +c^^^YRPOSNVU\YTMMMSMRKKKRUUNNNNS[`aa```\ +@FAKE0010_2 length: 30 +gatcrywsmkhbvdnGATCRYWSMKHBVDN ++ +I?5+I?5+I?5+I?5+I?5+I?5+I?5+I? +@SRR014849.50939_3:71-90 length: 20 +CAATAAATCAATACATAAAA ++ +\aZ\d`OY[aY[[\[[e`WP diff --git a/test/faidx/fastqs.2.expected.fa b/test/faidx/fastqs.2.expected.fa new file mode 100644 index 000000000..9b67d15e7 --- /dev/null +++ b/test/faidx/fastqs.2.expected.fa @@ -0,0 +1,8 @@ +>FAKE0006_1:4-12 length: 9 +TGCATGCAT +>FSRRS4401BE7HA_1:81-120 length: 40 +GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca +>FAKE0010_2 length: 30 +gatcrywsmkhbvdnGATCRYWSMKHBVDN +>SRR014849.50939_3:71-90 length: 20 +CAATAAATCAATACATAAAA diff --git a/test/fastqs.fq b/test/faidx/fastqs.fq similarity index 100% rename from test/fastqs.fq rename to test/faidx/fastqs.fq diff --git a/test/faidx/fastqs.fq.expected.fai b/test/faidx/fastqs.fq.expected.fai new file mode 100644 index 000000000..77ba04a5d --- /dev/null +++ b/test/faidx/fastqs.fq.expected.fai @@ -0,0 +1,105 @@ +FAKE0005_1 63 85 63 64 151 +FAKE0006_1 63 300 63 64 366 +FAKE0005_2 63 515 63 64 581 +FAKE0006_2 63 730 63 64 796 +FAKE0005_3 63 945 63 64 1011 +FAKE0006_3 63 1160 63 64 1226 +FAKE0005_4 63 1375 63 64 1441 +FAKE0006_4 63 1590 63 64 1656 +FSRRS4401BE7HA_1 395 1823 395 396 2221 +FSRRS4401BRRTC_1 145 2720 145 146 2868 +FSRRS4401B64ST_1 382 3118 382 383 3503 +FSRRS4401EJ0YH_1 381 3990 381 382 4374 +FSRRS4401BK0IB_1 507 4860 507 508 5370 +FSRRS4401ARCCB_1 258 5982 258 259 6243 +FSRRS4401CM938_1 453 6606 453 454 7062 +FSRRS4401EQLIK_1 411 7620 411 412 8034 +FSRRS4401AOV6A_1 309 8550 309 310 8862 +FSRRS4401EG0ZW_1 424 9276 424 425 9703 +FSRRS4401BE7HA_2 395 10231 395 396 10629 +FSRRS4401BRRTC_2 145 11128 145 146 11276 +FSRRS4401B64ST_2 382 11526 382 383 11911 +FSRRS4401EJ0YH_2 381 12398 381 382 12782 +FSRRS4401BK0IB_2 507 13268 507 508 13778 +FSRRS4401ARCCB_2 258 14390 258 259 14651 +FSRRS4401CM938_2 453 15014 453 454 15470 +FSRRS4401EQLIK_2 411 16028 411 412 16442 +FSRRS4401AOV6A_2 309 16958 309 310 17270 +FSRRS4401EG0ZW_2 424 17684 424 425 18111 +FSRRS4401BE7HA_3 395 18639 395 396 19037 +FSRRS4401BRRTC_3 145 19536 145 146 19684 +FSRRS4401B64ST_3 382 19934 382 383 20319 +FSRRS4401EJ0YH_3 381 20806 381 382 21190 +FSRRS4401BK0IB_3 507 21676 507 508 22186 +FSRRS4401ARCCB_3 258 22798 258 259 23059 +FSRRS4401CM938_3 453 23422 453 454 23878 +FSRRS4401EQLIK_3 411 24436 411 412 24850 +FSRRS4401AOV6A_3 309 25366 309 310 25678 +FSRRS4401EG0ZW_3 424 26092 424 425 26519 +FSRRS4401BE7HA_4 395 27047 80 81 27449 +FSRRS4401BRRTC_4 145 27952 80 81 28101 +FSRRS4401B64ST_4 382 28352 80 81 28741 +FSRRS4401EJ0YH_4 381 29232 80 81 29620 +FSRRS4401BK0IB_4 507 30110 80 81 30626 +FSRRS4401ARCCB_4 258 31244 80 81 31508 +FSRRS4401CM938_4 453 31874 80 81 32335 +FSRRS4401EQLIK_4 411 32898 80 81 33317 +FSRRS4401AOV6A_4 309 33838 80 81 34153 +FSRRS4401EG0ZW_4 424 34570 80 81 35002 +FAKE0007_1 41 35549 41 42 35593 +FAKE0008_1 41 35752 41 42 35796 +FAKE0009_1 41 35955 41 42 35999 +FAKE0010_1 30 36143 30 31 36176 +FAKE0007_2 41 36324 41 42 36368 +FAKE0008_2 41 36527 41 42 36571 +FAKE0009_2 41 36730 41 42 36774 +FAKE0010_2 30 36918 30 31 36951 +FAKE0007_3 41 37099 41 42 37143 +FAKE0008_3 41 37302 41 42 37346 +FAKE0009_3 41 37505 41 42 37549 +FAKE0010_3 30 37693 30 31 37726 +FAKE0007_4 41 37874 41 42 37918 +FAKE0008_4 41 38077 41 42 38121 +FAKE0009_4 41 38280 41 42 38324 +FAKE0010_4 30 38468 30 31 38501 +FAKE0011_1 41 38649 41 42 38693 +FAKE0012_1 41 38852 41 42 38896 +FAKE0013_1 41 39055 41 42 39099 +FAKE0014_1 30 39250 30 31 39283 +FAKE0011_2 41 39431 41 42 39475 +FAKE0012_2 41 39634 41 42 39678 +FAKE0013_2 41 39837 41 42 39881 +FAKE0014_2 30 40032 30 31 40065 +FAKE0011_3 41 40213 41 42 40257 +FAKE0012_3 41 40416 41 42 40460 +FAKE0013_3 41 40619 41 42 40663 +FAKE0014_3 30 40814 30 31 40847 +FAKE0011_4 41 40995 41 42 41039 +FAKE0012_4 41 41198 41 42 41242 +FAKE0013_4 41 41401 41 42 41445 +FAKE0014_4 30 41596 30 31 41629 +FAKE0001_1 94 41745 94 95 41842 +FAKE0002_1 94 42022 94 95 42119 +FAKE0001_2 94 42299 94 95 42396 +FAKE0002_2 94 42576 94 95 42673 +FAKE0001_3 94 42853 94 95 42950 +FAKE0002_3 94 43130 94 95 43227 +FAKE0001_4 94 43407 94 95 43504 +FAKE0002_4 94 43684 94 95 43781 +FAKE0003_1 68 43963 68 69 44034 +FAKE0004_1 68 44190 68 69 44261 +FAKE0003_2 68 44417 68 69 44488 +FAKE0004_2 68 44644 68 69 44715 +FAKE0003_3 68 44871 68 69 44942 +FAKE0004_3 68 45098 68 69 45169 +FAKE0003_4 68 45325 68 69 45396 +FAKE0004_4 68 45552 68 69 45623 +SRR014849.50939_1 135 45737 135 136 45875 +SRR014849.110027_1 131 46057 131 132 46191 +SRR014849.203935_1 144 46369 144 145 46516 +SRR014849.50939_2 135 46706 135 136 46844 +SRR014849.110027_2 131 47026 131 132 47160 +SRR014849.203935_2 144 47338 144 145 47485 +SRR014849.50939_3 135 47675 135 136 47813 +SRR014849.110027_3 131 47995 131 132 48129 +SRR014849.203935_3 144 48307 144 145 48454 diff --git a/test/faidx/test-faidx.sh b/test/faidx/test-faidx.sh new file mode 100755 index 000000000..ae501e086 --- /dev/null +++ b/test/faidx/test-faidx.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# +# Copyright (C) 2022 Genome Research Ltd. +# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing faidx..." + +bgzip="../../bgzip" +test_faidx="../test_faidx" + +test_driver $@ + +exit $? diff --git a/test/modhdr.expected.vcf b/test/modhdr.expected.vcf new file mode 100644 index 000000000..bad663c7e --- /dev/null +++ b/test/modhdr.expected.vcf @@ -0,0 +1,4 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO diff --git a/test/modhdr.vcf.gz b/test/modhdr.vcf.gz new file mode 100644 index 000000000..f97e06ab3 Binary files /dev/null and b/test/modhdr.vcf.gz differ diff --git a/test/modhdr.vcf.gz.csi b/test/modhdr.vcf.gz.csi new file mode 100644 index 000000000..61b60e79a Binary files /dev/null and b/test/modhdr.vcf.gz.csi differ diff --git a/test/sam.c b/test/sam.c index 036349f2b..28ca1bc5f 100644 --- a/test/sam.c +++ b/test/sam.c @@ -87,6 +87,15 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type) return NULL; } +static void check_aux_count(const bam1_t *aln, int expected, const char *what) +{ + const uint8_t *itr; + int n = 0; + for (itr = bam_aux_first(aln); itr; itr = bam_aux_next(aln, itr)) n++; + if (n != expected) + fail("%s has %d aux fields, expected %d", what, n, expected); +} + static void check_int_B_array(bam1_t *aln, char *tag, uint32_t nvals, int64_t *vals) { uint8_t *p; @@ -285,10 +294,30 @@ static int aux_fields1(void) if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); + check_aux_count(aln, 24, "Original record"); + bam_aux_del(aln,p); if (bam_aux_get(aln,"XA")) fail("XA field was not deleted"); + check_aux_count(aln, 23, "Record post-XA-deletion"); + + p = bam_aux_get(aln, "Y2"); + if (p == NULL || strncmp(bam_aux_tag(p), "Y2", 2) != 0 || bam_aux_type(p) != 'i') + fail("bam_aux_get() missed Y2 field"); + + p = bam_aux_next(aln, p); + if (p == NULL || strncmp(bam_aux_tag(p), "Y3", 2) != 0 || bam_aux_type(p) != 'c') + fail("bam_aux_next() missed Y3 field"); + + p = bam_aux_get(aln, "Y8"); + if (p == NULL || strncmp(bam_aux_tag(p), "Y8", 2) != 0 || bam_aux_type(p) != 'I') + fail("bam_aux_get() missed Y8 field"); + + p = bam_aux_next(aln, p); + if (p != NULL || errno != ENOENT) + fail("bam_aux_next missed the end of fields"); + if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %"PRId64", expected 37", bam_aux2i(p)); @@ -492,6 +521,16 @@ static int aux_fields1(void) if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); + + // Test field removal APIs -- after the strcmp(..., r1) check so that + // can also check the formatting of the to-be-removed fields. + + p = bam_aux_remove(aln, check_bam_aux_get(aln, "XH", 'H')); + if (bam_aux_get(aln, "XH")) + fail("XH field was not removed"); + check_aux_count(aln, 31, "Record post-XH-removal"); + if (strncmp(bam_aux_tag(p), "XB", 2) != 0 || bam_aux_type(p) != 'B') + fail("bam_aux_remove() missed XB field"); } else fail("can't read record"); @@ -2224,7 +2263,7 @@ int main(int argc, char **argv) test_text_file("test/emptyfile", 0); test_text_file("test/xx#pair.sam", 7); test_text_file("test/xx.fa", 7); - test_text_file("test/fastqs.fq", 500); + test_text_file("test/faidx/fastqs.fq", 500); check_enum1(); check_cigar_tab(); check_big_ref(0); diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index 129516b24..13d2c340e 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Genome Research Ltd. +# Copyright (C) 2020, 2022 Genome Research Ltd. # # Author: James Bonfield # @@ -42,17 +42,17 @@ P string6.out $tv -i 'filter=library=="x"' ../xx#rg.sam P string7.out $tv -i 'filter=library!="x"' ../xx#rg.sam # Integer ops -P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' -P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' +P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam | grep -E -cv '^@' +P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam | grep -E -cv '^@' # Aux tags -P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' +P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam | grep -E -cv '^@' # Functions. -P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@' -P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@' -P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@' -P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@' -P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | egrep -v '^@' -P func6.out $tv -i 'filter=rlen<50' ../realn02.sam | egrep -v '^@' -P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | egrep -v '^@' +P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | grep -E -cv '^@' +P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | grep -E -cv '^@' +P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | grep -E -cv '^@' +P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | grep -E -cv '^@' +P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | grep -E -v '^@' +P func6.out $tv -i 'filter=rlen<50' ../realn02.sam | grep -E -v '^@' +P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | grep -E -v '^@' diff --git a/test/test.pl b/test/test.pl index d6c01786a..1595557a2 100755 --- a/test/test.pl +++ b/test/test.pl @@ -398,6 +398,30 @@ sub test_bgzip { } passed($opts,$test); + # Round-trip test of text in binary mode + $test = sprintf('%s %2s threads', 'bgzip text mode round-trip', + $threads ? $threads : 'no'); + print "$test: "; + $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "$$opts{bin}/bgzip $at -d < '$compressed' > '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "cmp '$data' '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, $out ? $out : "'$data' '$uncompressed' differ"); + return; + } + passed($opts,$test); + # Extract from an offset $test = sprintf('%s %2s threads', 'bgzip -b', $threads ? $threads : 'no'); @@ -637,14 +661,6 @@ sub test_view testv $opts, "./compare_sam.pl -Baux $md $sam $jsam"; } - # embed_ref=2 mode - my $ersam = "ce#1000.sam"; - my $ercram = "ce#1000_er.tmp.cram"; - my $ersam2 = "${ercram}.sam"; - testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; - testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; - testv $opts, "./compare_sam.pl $ersam $ersam2"; - if ($test_view_failures == 0) { passed($opts, "$sam conversions"); @@ -655,6 +671,21 @@ sub test_view } } + # embed_ref=2 mode + print "test_view testing embed_ref=2:\n"; + $test_view_failures = 0; + my $ersam = "ce#1000.sam"; + my $ercram = "ce#1000_er.tmp.cram"; + my $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { + passed($opts, "embed_ref=2 tests"); + } else { + failed($opts, "embed_ref=2 tests", "$test_view_failures subtests failed"); + } + # BAM and CRAM range queries on prebuilt BAM and CRAM # The cram file has @SQ UR: set to point to an invalid location to # force the reference to be reloaded from the one given on the @@ -916,6 +947,11 @@ sub test_vcf_various cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf"); test_cmd($opts, %args, out => "vcf_meta_meta.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf_meta_meta.vcf"); + + # VCF file with contig IDX=1, simulating an edited BCF file + # See htslib issue 1534 + test_cmd($opts, %args, out => "modhdr.expected.vcf", + cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2"); } sub write_multiblock_bgzf { diff --git a/test/test_bgzf.c b/test/test_bgzf.c index 90ec167ac..a5084e6c6 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -37,6 +37,7 @@ DEALINGS IN THE SOFTWARE. #include "../htslib/bgzf.h" #include "../htslib/hfile.h" +#include "../htslib/hts_log.h" #include "../hfile_internal.h" const char *bgzf_suffix = ".gz"; @@ -159,13 +160,19 @@ static BGZF * try_bgzf_hopen(const char *name, const char *mode, return bgz; } -static int try_bgzf_close(BGZF **bgz, const char *name, const char *func) { +static int try_bgzf_close(BGZF **bgz, const char *name, const char *func, int expected_fail) { BGZF *to_close = *bgz; *bgz = NULL; if (bgzf_close(to_close) != 0) { - fprintf(stderr, "%s : bgzf_close failed on %s : %s\n", - func, name, strerror(errno)); + if (!expected_fail) + fprintf(stderr, "%s : bgzf_close failed on %s%s%s\n", + func, name, + errno ? " : " : "", + errno ? strerror(errno) : ""); return -1; + } else if (expected_fail) { + fprintf(stderr, "%s : bgzf_close worked on %s, but expected failure\n", + func, name); } return 0; } @@ -398,6 +405,7 @@ static int test_read(Files *f) { ssize_t bg_got, f_got; unsigned char bg_buf[BUFSZ], f_buf[BUFSZ]; + errno = 0; bgz = try_bgzf_open(f->src_bgzf, "r", __func__); if (!bgz) return -1; @@ -414,7 +422,7 @@ static int test_read(Files *f) { } } while (bg_got > 0 && f_got > 0); - if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) return -1; + if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) return -1; if (try_fseek_start(f->f_plain, f->src_plain, __func__) != 0) return -1; return 0; @@ -449,7 +457,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method, bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; switch (method) { case USE_BGZF_DOPEN: @@ -491,7 +499,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method, goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -521,7 +529,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { bg_put = try_bgzf_write(bgz, f->text, half, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; // Write second half. Append mode, so an EOF block should be in the @@ -535,7 +543,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; // Try reading pos = 0; @@ -564,7 +572,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -601,7 +609,7 @@ static int test_index_load_dump(Files *f) { } while (got_src > 0 && got_dest > 0); if (try_fclose(&fdest, f->tmp_idx, __func__) != 0) goto fail; - if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -624,7 +632,7 @@ static int test_check_EOF(char *name, int expected) { return -1; } - return try_bgzf_close(&bgz, name, __func__); + return try_bgzf_close(&bgz, name, __func__, 0); } static int test_index_useek_getc(Files *f, const char *mode, @@ -651,7 +659,7 @@ static int test_index_useek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -710,7 +718,7 @@ static int test_index_useek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -741,7 +749,7 @@ static int test_tell_seek_getc(Files *f, const char *mode, if (bg_put < 0) goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -811,7 +819,7 @@ static int test_tell_seek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -841,7 +849,7 @@ static int test_tell_read(Files *f, const char *mode) { if (bg_put < 0) goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -859,7 +867,7 @@ static int test_tell_read(Files *f, const char *mode) { } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; free(bg_buf); return 0; @@ -885,11 +893,13 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + for (pos = 0; pos < f->ltext; ) { const char *end = strchr(text + pos, '\n'); size_t l = end ? end - (text + pos) : f->ltext - pos; @@ -909,12 +919,13 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { "Got : %.*s\n", __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos, (int) str.l, str.s); + goto fail; } pos += l + 1; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; free(ks_release(&str)); return 0; @@ -924,6 +935,98 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { return -1; } +static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int nthreads) { + BGZF* bgz = NULL; + ssize_t bg_put; + size_t pos; + kstring_t str = { 0, 0, NULL }; + const char *text = (const char *) f->text; + + // Turn off bgzf errors as they're expected. + enum htsLogLevel lvl = hts_get_log_level(); + hts_set_log_level(HTS_LOG_OFF); + + bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); + if (!bgz) goto fail; + + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + + const char *text_line2 = strchr(text, '\n') + 1; + bg_put = try_bgzf_write(bgz, text, text_line2 - text, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + if (bgzf_flush(bgz) < 0) goto fail; + int64_t block2_start = bgz->block_address; + + const char *text_line3 = strchr(text_line2, '\n') + 1; + bg_put = try_bgzf_write(bgz, text_line2, text_line3 - text_line2, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + if (bgzf_flush(bgz) < 0) goto fail; + int64_t block3_start = bgz->block_address; + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; + + int64_t newsize; + for(newsize = block3_start - 1; newsize > block2_start; newsize--) { + //fprintf(stderr, "test_bgzf_getline_on_truncated_file : size truncated to %" PRId64 " with threads %d\n", newsize, nthreads); + + if (truncate(f->tmp_bgzf, newsize) != 0) goto fail; + + bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); + if (!bgz) goto fail; + + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + + for (pos = 0; pos < f->ltext; ) { + const char *end = strchr(text + pos, '\n'); + size_t l = end ? end - (text + pos) : f->ltext - pos; + + int res = bgzf_getline(bgz, '\n', &str); + if (res < -1) { + // ok, we expect error from truncated file + break; + } else if (res == -1) { + // truncated file should never return EOF since we do not truncate at block boundary + fprintf(stderr, "%s : %s from bgzf_getline on %s\n", + __func__, "Unexpected EOF", + f->tmp_bgzf); + goto fail; + } + + if (str.l != l || memcmp(text + pos, str.s, l) != 0) { + fprintf(stderr, + "%s : Unexpected data from bgzf_getline on %s\n" + "Expected : %.*s\n" + "Got : %.*s\n", + __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos, + (int) str.l, str.s); + goto fail; + } + pos += l + 1; + } + + // verify error is persistent + int k; + for(k = 0; k < 3; k++) { + int res = bgzf_getline(bgz, '\n', &str); + if (res > -2) { + fprintf(stderr, "%s : unexpected bgzf_getline result %d\n", __func__, res); + goto fail; + } + } + // closing a stream with error returns error + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 1) == 0) goto fail; + } + free(ks_release(&str)); + hts_set_log_level(lvl); + return 0; + + fail: + hts_set_log_level(lvl); + if (bgz) bgzf_close(bgz); + free(ks_release(&str)); + return -1; +} + int main(int argc, char **argv) { Files f = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0 }; int retval = EXIT_FAILURE; @@ -1000,6 +1103,10 @@ int main(int argc, char **argv) { if (test_bgzf_getline(&f, "w", 1) != 0) goto out; if (test_bgzf_getline(&f, "w", 2) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 0) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 1) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 2) != 0) goto out; + retval = EXIT_SUCCESS; out: diff --git a/test/test_faidx.c b/test/test_faidx.c new file mode 100644 index 000000000..566149071 --- /dev/null +++ b/test/test_faidx.c @@ -0,0 +1,516 @@ +/* test/test_fadix.c -- Test faidx interfaces + + Copyright (C) 2022 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include + +#include "../htslib/faidx.h" + +int file_compare(const char *file1, const char *file2) { + FILE *f1 = NULL; + FILE *f2 = NULL; + unsigned int lno = 1; + size_t got1, got2, i; + char buf1[1024], buf2[1024]; + int ret = -1; + + f1 = fopen(file1, "rb"); + if (!f1) { + perror(file1); + goto out; + } + f2 = fopen(file2, "rb"); + if (!f2) { + perror(file2); + goto out; + } + + do { + got1 = fread(buf1, 1, sizeof(buf1), f1); + got2 = fread(buf2, 1, sizeof(buf2), f2); + + for (i = 0; i < got1 && i < got2 && buf1[i] == buf2[i]; i++) + lno += (buf1[i] == '\n'); + if (i < got1 || i < got2) { + fprintf(stderr, "%s and %s differ at line %u\n", + file1, file2, lno); + goto out; + } + } while (got1 > 0 && got2 > 0); + + if (ferror(f1)) { + perror(file1); + goto out; + } + if (ferror(f2)) { + perror(file2); + goto out; + } + + if (got1 > 0 || got2 > 0) { + fprintf(stderr, "EOF on %s at line %u\n", + got1 ? file2 : file1, lno); + goto out; + } + + ret = 0; + out: + if (f1) fclose(f1); + if (f2) fclose(f2); + return ret; +} + +faidx_t * load_index(const char *fn, const char *fnfai, const char *fngzi, + int flags, enum fai_format_options format) { + faidx_t *fai = fai_load3_format(fn, fnfai, fngzi, flags, format); + if (!fai) { + fprintf(stderr, "Failed: fai_load3(%s, %s, %s, %d, %d)\n", + fn, fnfai ? fnfai : "NULL", fngzi ? fngzi : "NULL", flags, + (int) format); + return NULL; + } + return fai; +} + +int do_retrieval(const char *fn, const char *fnfai, const char *fngzi, + int flags, enum fai_format_options format, const char *fnout, + const char *interface, int nreg, char **regions) { + int i, use_64bit = 1, use_parse_reg = 0, use_adjust_reg = 0; + faidx_t *fai = NULL; + FILE *out = stdout; + + if (interface) { + if (strcmp(interface, "fai_fetch") == 0) { + use_64bit = 0; + } else if (strcmp(interface, "faidx_fetch_seq") == 0) { + use_64bit = 0; + use_parse_reg = 1; + } else if (strcmp(interface, "faidx_fetch_seq64") == 0 + || strcmp(interface, "fai_parse_region") == 0) { + use_parse_reg = 1; + } else if (strcmp(interface, "fai_adjust_region") == 0) { + use_parse_reg = 1; + use_adjust_reg = 1; + } + } + + if (fnout) { + out = fopen(fnout, "wb"); + if (!out) { + perror(fnout); + return -1; + } + } + + fai = load_index(fn, fnfai, fngzi, flags, format); + if (!fai) + goto fail; + + for (i = 0; i < nreg; i++) { + hts_pos_t len = 0, pos, beg = 0, end = 0; + int tid = 0; + char *seq = NULL; + size_t l; + + if (use_parse_reg) { + const char *e = fai_parse_region(fai, regions[i], + &tid, &beg, &end, 0); + if (e == NULL) { + fprintf(stderr, "Failed: " + "fai_parse_region(fai, %s, &tid, &beg, &end, 0)\n", + regions[i]); + goto fail; + } + if (use_adjust_reg) { + hts_pos_t orig_beg = beg, orig_end = end; + int r = fai_adjust_region(fai, tid, &beg, &end); + if (r < 0 + || (((r & 1) != 0) ^ (beg != orig_beg)) + || (((r & 2) != 0) ^ (end != orig_end))) { + fprintf(stderr, "Failed: fai_adjust_region(fai, %d, " + "%"PRIhts_pos", %"PRIhts_pos") returned %d\n" + "After: beg = %"PRIhts_pos" end = %"PRIhts_pos"\n", + tid, orig_beg, orig_end, r, beg, end); + goto fail; + } + } + if (use_64bit) { + seq = faidx_fetch_seq64(fai, faidx_iseq(fai, tid), + beg, end - 1, &len); + } else { + int ilen = 0; + seq = faidx_fetch_seq(fai, faidx_iseq(fai, tid), + beg, end - 1, &ilen); + len = ilen; + } + if (!seq) { + fprintf(stderr, "Failed: faidx_fetch_seq%s(fai, %s, " + "%"PRIhts_pos", %"PRIhts_pos", &len)\n", + use_64bit ? "64" : "", faidx_iseq(fai, tid), beg, end); + goto fail; + } + } else { + if (use_64bit) { + seq = fai_fetch64(fai, regions[i], &len); + } else { + int ilen = 0; + seq = fai_fetch(fai, regions[i], &ilen); + len = ilen; + } + if (!seq) { + fprintf(stderr, "Failed: fai_fetch%s(fai, %s, &len)\n", + use_64bit ? "64" : "", regions[i]); + goto fail; + } + } + + l = strlen(seq); + fprintf(out, "%c%s length: %"PRIhts_pos"\n", + format == FAI_FASTQ ? '@' : '>', regions[i], len); + for (pos = 0; pos < l; pos += 50) { + fprintf(out, "%.*s\n", 50, seq + pos); + } + free(seq); + if (format == FAI_FASTQ) { + hts_pos_t qual_len = 0; + char *qual; + if (use_parse_reg) { + if (use_64bit) { + qual = faidx_fetch_qual64(fai, faidx_iseq(fai, tid), + beg, end - 1, &qual_len); + } else { + int ilen = 0; + qual = faidx_fetch_qual(fai, faidx_iseq(fai, tid), + beg, end - 1, &ilen); + qual_len = ilen; + } + } else { + if (use_64bit) { + qual = fai_fetchqual64(fai, regions[i], &qual_len); + } else { + int ilen = 0; + qual = fai_fetchqual(fai, regions[i], &ilen); + qual_len = ilen; + } + if (!qual) { + fprintf(stderr, "Failed: fai_fetchqual64(fai, %s, &len)\n", + regions[i]); + goto fail; + } + } + if (qual_len != len) { + fprintf(stderr, + "Sequence and quality lengths differ for %s %s\n", + fn, regions[i]); + free(qual); + goto fail; + } + fprintf(out, "+\n"); + l = strlen(qual); + for (pos = 0; pos < l; pos+=50) { + fprintf(out, "%.*s\n", 50, qual + pos); + } + free(qual); + } + } + + fai_destroy(fai); + + if (fnout) { + if (fclose(out) != 0) { + perror(fnout); + return -1; + } + } + return 0; + + fail: + if (fai) + fai_destroy(fai); + if (fnout) + fclose(out); + + return -1; +} + +int test_fai_line_length(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *reg) { + hts_pos_t found_len; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = fai_line_length(fai, reg); + fai_destroy(fai); + if (expected) { + long long exp_len = strtoll(expected, NULL, 10); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %"PRIhts_pos" from " + "fai_line_length, expected %s\n", found_len, expected); + return -1; + } + } else { + printf("%"PRIhts_pos"\n", found_len); + } + return 0; +} + +int test_faidx_has_seq(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + int res; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + res = faidx_has_seq(fai, seq); + fai_destroy(fai); + if (expected) { + long exp_res = strtol(expected, NULL, 10); + if (res != exp_res) { + fprintf(stderr, "Unexpected result %d from faidx_has_seq(%s) " + "expected %s\n", res, seq, expected); + return -1; + } + } else { + printf("%d\n", res); + } + return 0; +} + +int test_faidx_iseq(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *index) { + const char *found_name = NULL; + int idx = atoi(index); + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_name = faidx_iseq(fai, idx); + + if (expected) { + if (!found_name || strcmp(found_name, expected) != 0) { + fprintf(stderr, "Unexpected result %s from faidx_iseq(fai, %d), " + "expected %s\n", found_name ? found_name : "(null)", + idx, expected); + fai_destroy(fai); + return -1; + } + } else { + printf("%s\n", found_name ? found_name : "(null)"); + } + + fai_destroy(fai); + return 0; +} + +int test_faidx_seq_len(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + int found_len; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = faidx_seq_len(fai, seq); + fai_destroy(fai); + + if (expected) { + int exp_len = atoi(expected); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %d from faidx_seq_len(fai, %s) " + "expected %s\n", found_len, seq, expected); + return -1; + } + } else { + printf("%d\n", found_len); + } + + return 0; +} + +int test_faidx_seq_len64(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + hts_pos_t found_len; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = faidx_seq_len(fai, seq); + fai_destroy(fai); + + if (expected) { + long long exp_len = strtoll(expected, NULL, 10); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %"PRIhts_pos + " from fai_seq_len64(fai, %s) expected %s\n", + found_len, seq, expected); + return -1; + } + } else { + printf("%"PRIhts_pos"\n", found_len); + } + + return 0; +} + +void usage(FILE *out, const char *arg0) { + fprintf(out, + "Usage: %s [-c] -i fasta/q [-f fai_file] [-g gzi_file] [-e expected_fai]\n" + " %s [-cQ] -i fasta/q [-f fai_file] [-g gzi_file] [region]\n" + " %s -t FUNC -i fasta/q [-f fai_file] [-g gzi_file] [-e expected] \n" + " %s -h\n", + arg0, arg0, arg0, arg0); +} + +void help(FILE *out, const char *arg0) { + usage(out, arg0); + fprintf(out, + "Options:\n" + " -i FILE Input file\n" + " -f FILE Fasta/q index file name\n" + " -g FILE Bgzip index file name\n" + " -o FILE Output file name\n" + " -e FILE|STR Expected output\n" + " -c Set FAI_CREATE flag\n" + " -Q Output fastq format\n" + " -t FUNC Test function\n" + " -h Print this help\n" + "\n" + "Expected output is compared to the FAI file in indexing mode;" + " the output file\n" + "in retrieval mode; " + "expected output for various -t function tests.\n" + "\n" + "Unit tests (-t option):\n" + " fai_line_length, faidx_has_seq, faidx_iseq, faidx_seq_len, faidx_seq_len64\n" + "In retrieval mode, -t can change the functions used to fetch data:\n" + " fai_fetch, fai_fetch64, faidx_fetch_seq, faidx_fetch_seq64,\n" + " fai_parse_region, fai_adjust_region\n" + "\n"); +} + +int main(int argc, char **argv) { + int opt; + const char *fn = NULL; + const char *fnout = NULL; + const char *fnfai = NULL; + const char *fngzi = NULL; + const char *expected = NULL; + const char *func = ""; + int flags = 0; + enum fai_format_options format = FAI_FASTA; + int res; + + while ((opt = getopt(argc, argv, "i:f:g:o:e:t:cQh")) > 0) { + switch (opt) { + case 'i': + fn = optarg; + break; + case 'f': + fnfai = optarg; + break; + case 'g': + fngzi = optarg; + break; + case 'o': + fnout = optarg; + break; + case 'e': + expected = optarg; + break; + case 'c': + flags |= FAI_CREATE; + break; + case 'Q': + format = FAI_FASTQ; + break; + case 't': + func = optarg; + break; + case 'h': + help(stdout, argv[0]); + return EXIT_SUCCESS; + default: + usage(stderr, argv[0]); + return EXIT_FAILURE; + } + } + + if (!fn) { + usage(stderr, argv[0]); + return EXIT_FAILURE; + } + + if (optind == argc) { + // Index building mode + res = fai_build3(fn, fnfai, fngzi); + if (res) { + fprintf(stderr, "Failed: fai_build3(%s, %s, %s)\n", + fn, fnfai ? fnfai : "NULL", fngzi ? fngzi : "NULL"); + } else if (expected) { + res = file_compare(fnfai, expected); + } + } else { + if (strcmp(func, "fai_line_length") == 0) { + res = test_fai_line_length(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_has_seq") == 0) { + res = test_faidx_has_seq(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_iseq") == 0) { + res = test_faidx_iseq(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_seq_len") == 0) { + res = test_faidx_seq_len(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_seq_len64") == 0) { + res = test_faidx_seq_len64(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else { + res = do_retrieval(fn, fnfai, fngzi, flags, format, fnout, + func, argc - optind, &argv[optind]); + if (res == 0 && fnout && expected) { + res = file_compare(fnout, expected); + } + } + } + return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/test/test_view.c b/test/test_view.c index 416ffe98d..f33c1cdf0 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -224,7 +224,8 @@ int vcf_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, hts_itr_t *iter; if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); - continue; + exit_code = 1; + break; } while ((r = bcf_itr_next(in, iter, b)) >= 0) { if (!opts->benchmark && bcf_write1(out, h, b) < 0) { diff --git a/vcf.c b/vcf.c index 56af63054..59d433c19 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. Author: Heng Li @@ -51,6 +51,10 @@ DEALINGS IN THE SOFTWARE. */ KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) typedef khash_t(vdict) vdict_t; +KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*) +typedef khash_t(hdict) hdict_t; + + #include "htslib/kseq.h" HTSLIB_EXPORT uint32_t bcf_float_missing = 0x7F800001; @@ -79,6 +83,22 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N #define BCF_IS_64BIT (1<<30) +// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI. +// Note that this preserving API and ABI requires that the first element is vdict_t struct +// rather than a pointer, as user programs may (and in some cases do) access the dictionary +// directly as (vdict_t*)hdr->dict. +typedef struct +{ + vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT + hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields +} +bcf_hdr_aux_t; + +static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) +{ + return (bcf_hdr_aux_t *)hdr->dict[0]; +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -93,9 +113,6 @@ static char *find_chrom_header_line(char *s) static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) { - if ( !s ) return 0; - if (len == 0) len = strlen(s); - const char *ss = s; while ( *ss && isspace_c(*ss) && ss - s < len) ss++; if ( !*ss || ss - s == len) @@ -140,7 +157,12 @@ static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) { - return bcf_hdr_add_sample_len(h, s, 0); + if (!s) { + // Allowed for backwards-compatibility, calling with s == NULL + // used to trigger bcf_hdr_sync(h); + return 0; + } + return bcf_hdr_add_sample_len(h, s, strlen(s)); } int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str) @@ -864,8 +886,45 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } +int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) +{ + // currently only for bcf_hdr_set_version + assert( hrec->type==BCF_HL_GEN ); + int ret; + khint_t k; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + for (k=kh_begin(aux->gen); kgen); k++) + { + if ( !kh_exist(aux->gen,k) ) continue; + if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue; + break; + } + assert( kgen) ); // something went wrong, should never happen + free((char*)kh_key(aux->gen,k)); + kh_del(hdict,aux->gen,k); + kstring_t str = {0,0,0}; + if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 ) + { + free(str.s); + return -1; + } + k = kh_put(hdict, aux->gen, str.s, &ret); + if ( ret<0 ) + { + free(str.s); + return -1; + } + free(hrec->value); + hrec->value = strdup(tmp->value); + if ( !hrec->value ) return -1; + return 0; +} + int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) { + kstring_t str = {0,0,0}; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + int res; if ( !hrec ) return 0; @@ -883,19 +942,49 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } // Is one of the generic fields and already present? - int i; - for (i=0; inhrec; i++) + if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 ) + { + free(str.s); + return -1; + } + khint_t k = kh_get(hdict, aux->gen, str.s); + if ( k != kh_end(aux->gen) ) + { + // duplicate record + bcf_hrec_destroy(hrec); + free(str.s); + return 0; + } + } + + int i; + if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 ) + { + if ( ksprintf(&str, "##%s=", hrec->key,hrec->vals[i]) < 0 ) { - if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue; - if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break; - if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break; + free(str.s); + return -1; } - if ( inhrec ) + khint_t k = kh_get(hdict, aux->gen, str.s); + if ( k != kh_end(aux->gen) ) { + // duplicate record bcf_hrec_destroy(hrec); + free(str.s); return 0; } } + if ( str.s ) + { + khint_t k = kh_put(hdict, aux->gen, str.s, &res); + if ( res<0 ) + { + bcf_hrec_destroy(hrec); + free(str.s); + return -1; + } + kh_val(aux->gen,k) = hrec; + } // New record, needs to be added int n = hdr->nhrec + 1; @@ -909,29 +998,47 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return hrec->type==BCF_HL_GEN ? 0 : 1; } -/* - * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed), - * the STR,GEN lines are searched for linearly in a linked list of all header lines. - * This may become a problem for VCFs with huge headers, we might need to build a - * dictionary for these lines as well. - */ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) { int i; if ( type==BCF_HL_GEN ) { + // e.g. ##fileformat=VCFv4.2 + // ##source=GenomicsDBImport + // ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364 + if ( value ) + { + kstring_t str = {0,0,0}; + ksprintf(&str, "##%s=%s", key,value); + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k = kh_get(hdict, aux->gen, str.s); + free(str.s); + if ( k == kh_end(aux->gen) ) return NULL; + return kh_val(aux->gen, k); + } for (i=0; inhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; if ( strcmp(hdr->hrec[i]->key,key) ) continue; - if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i]; + return hdr->hrec[i]; } return NULL; } else if ( type==BCF_HL_STR ) { - if (!str_class) - return NULL; + // e.g. ##GATKCommandLine= + // ##ALT= + if (!str_class) return NULL; + if ( !strcmp("ID",key) ) + { + kstring_t str = {0,0,0}; + ksprintf(&str, "##%s=<%s=%s>",str_class,key,value); + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k = kh_get(hdict, aux->gen, str.s); + free(str.s); + if ( k == kh_end(aux->gen) ) return NULL; + return kh_val(aux->gen, k); + } for (i=0; inhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; @@ -1068,6 +1175,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) bcf_hrec_t *hrec; if ( !key ) { + // no key, remove all entries of this type while ( inhrec ) { if ( hdr->hrec[i]->type!=type ) { i++; continue; } @@ -1183,14 +1291,19 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) { int len; kstring_t str = {0,0,0}; - ksprintf(&str,"##fileformat=%s", version); + if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1; hrec = bcf_hdr_parse_line(hdr, str.s, &len); free(str.s); } else { - free(hrec->value); - hrec->value = strdup(version); + bcf_hrec_t *tmp = bcf_hrec_dup(hrec); + if ( !tmp ) return -1; + free(tmp->value); + tmp->value = strdup(version); + if ( !tmp->value ) return -1; + bcf_hdr_update_hrec(hdr, hrec, tmp); + bcf_hrec_destroy(tmp); } hdr->dirty = 1; return 0; // FIXME: check for errs in this function (return < 0 if so) @@ -1204,6 +1317,14 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) if (!h) return NULL; for (i = 0; i < 3; ++i) if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail; + + bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t)); + if ( !aux ) goto fail; + if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } + aux->dict = *((vdict_t*)h->dict[0]); + free(h->dict[0]); + h->dict[0] = aux; + if ( strchr(mode,'w') ) { bcf_hdr_append(h, "##fileformat=VCFv4.2"); @@ -1229,6 +1350,13 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if (d == 0) continue; for (k = kh_begin(d); k != kh_end(d); ++k) if (kh_exist(d, k)) free((char*)kh_key(d, k)); + if ( i==0 ) + { + bcf_hdr_aux_t *aux = get_hdr_aux(h); + for (k=kh_begin(aux->gen); kgen); k++) + if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); + kh_destroy(hdict, aux->gen); + } kh_destroy(vdict, d); free(h->id[i]); } @@ -1974,7 +2102,8 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) // header. At this point, the header must have been printed, // proceeding would lead to a broken BCF file. Errors must be checked // and cleared by the caller before we can proceed. - hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos, v->errcode, bcf_seqname_safe(h,v), v->pos+1); + char errdescription[1024] = ""; + hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1); return -1; } bcf1_sync(v); // check if the BCF record was modified @@ -2211,20 +2340,44 @@ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n) { vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; - int tid, m = kh_size(d); + int i, tid, m = kh_size(d); const char **names = (const char**) calloc(m,sizeof(const char*)); + if ( !names ) + { + hts_log_error("Failed to allocate memory"); + *n = 0; + return NULL; + } khint_t k; for (k=kh_begin(d); k= m ) + { + // This can happen after a contig has been removed from BCF header via bcf_hdr_remove() + if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 ) + { + hts_log_error("Failed to allocate memory"); + *n = 0; + free(names); + return NULL; + } + m = tid + 1; + } names[tid] = kh_key(d,k); } - // sanity check: there should be no gaps - for (tid=0; tid 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, t); - v->errcode = BCF_ERR_TAG_UNDEF; + v->errcode |= BCF_ERR_TAG_UNDEF; if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; @@ -2915,7 +3068,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (res < 0) bcf_hrec_destroy(hrec); if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, key); - v->errcode = BCF_ERR_TAG_UNDEF; + v->errcode |= BCF_ERR_TAG_UNDEF; if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; @@ -3410,7 +3563,7 @@ int vcf_write_line(htsFile *fp, kstring_t *line) int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) { - int ret; + ssize_t ret; fp->line.l = 0; if (vcf_format1(h, v, &fp->line) != 0) return -1; @@ -3815,7 +3968,8 @@ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) int i; if ( line->errcode ) { - hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_seqname_safe(src_hdr,line), line->pos+1); + char errordescription[1024] = ""; + hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)), bcf_seqname_safe(src_hdr,line), line->pos+1); exit(1); } if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id @@ -5047,3 +5201,75 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v #undef BRANCH return nsmpl*fmt->n; } + +//error description structure definition +typedef struct err_desc { + int errorcode; + const char *description; +}err_desc; + +// error descriptions +static const err_desc errdesc_bcf[] = { + { BCF_ERR_CTG_UNDEF, "Contig not defined in header"}, + { BCF_ERR_TAG_UNDEF, "Tag not defined in header" }, + { BCF_ERR_NCOLS, "Incorrect number of columns" }, + { BCF_ERR_LIMITS, "Limits reached" }, + { BCF_ERR_CHAR, "Invalid character" }, + { BCF_ERR_CTG_INVALID, "Invalid contig" }, + { BCF_ERR_TAG_INVALID, "Invalid tag" }, +}; + +/// append given description to buffer based on available size and add ... when not enough space + /** @param buffer buffer to which description to be appended + @param offset offset at which to be appended + @param maxbuffer maximum size of the buffer + @param description the description to be appended +on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site +on success returns 0 + */ +static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) { + + if (!description || !buffer || !offset || (maxbuffer < 4)) + return -1; + + size_t rembuffer = maxbuffer - *offset; + if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) { //add description with optionally required ',' + *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description); + } else { //not enough space for description, put ... + size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset; + snprintf(buffer + tmppos, 4, "..."); //ignore offset update + return -1; + } + return 0; +} + +//get description for given error code. return NULL on error +const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { + size_t usedup = 0; + int ret = 0; + int idx; + + if (!buffer || maxbuffer < 4) + return NULL; //invalid / insufficient buffer + + if (!errorcode) { + buffer[0] = '\0'; //no error, set null + return buffer; + } + + for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) { + if (errorcode & errdesc_bcf[idx].errorcode) { //error is set, add description + ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description); + if (ret < 0) + break; //not enough space, ... added, no need to continue + + errorcode &= ~errdesc_bcf[idx].errorcode; //reset the error + } + } + + if (errorcode && (ret >= 0)) { //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§ + add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error"); + } + return buffer; +} + diff --git a/version.sh b/version.sh index 89d57fcf6..65d1ccae6 100755 --- a/version.sh +++ b/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.16 +VERSION=1.17 # If we have a git clone, then check against the current tag srcdir=${0%/version.sh}