diff --git a/.cirrus.yml b/.cirrus.yml index 83e36af9c..3a7b910a5 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -72,7 +72,7 @@ gcc_task: USE_CONFIG: no - environment: USE_CONFIG: yes - CFLAGS: -std=c99 -pedantic + CFLAGS: -std=c99 -pedantic -Wformat=2 USE_LIBDEFLATE: yes << : *LIBDEFLATE @@ -84,8 +84,8 @@ gcc_task: ubuntu_task: name: ubuntu-clang container: - #image: ubuntu:latest # use << : *LIBDEFLATE - image: ubuntu:devel + image: ubuntu:latest + # image: ubuntu:devel cpu: 2 memory: 1G @@ -121,7 +121,7 @@ ubuntu_task: rocky_task: name: rockylinux-gcc container: - image: rockylinux:latest + image: rockylinux:9 cpu: 2 memory: 1G @@ -133,9 +133,36 @@ rocky_task: # NB: we could consider building a docker image with these # preinstalled and specifying that instead, to speed up testing. install_script: | - yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \ - bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \ - diffutils git + yum install -y autoconf automake make gcc perl-Data-Dumper perl-FindBin \ + zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel \ + ncurses-devel diffutils git + + << : *COMPILE + << : *TEST + +# Arm Linux +arm_ubuntu_task: + name: ubuntu-arm + arm_container: + image: ubuntu:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + DO_UNTRACKED_FILE_CHECK: yes + USE_CONFIG: yes + CFLAGS: -g -Wall -O3 -std=c99 -pedantic + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates clang libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + libdeflate-dev << : *COMPILE << : *TEST diff --git a/.gitignore b/.gitignore index 1573a5bf7..6b58e8439 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ shlib-exports-*.txt /test/tabix/FAIL* /test/test-bcf-sr /test/test-bcf-translate +/test/test-bcf_set_variant_type /test/test_bgzf /test/test_expr /test/test_index @@ -67,6 +68,7 @@ shlib-exports-*.txt /test/test_realn /test/test-regidx /test/test_str2int +/test/test_time_funcs /test/test-vcf-api /test/test-vcf-sweep /test/test_view diff --git a/Makefile b/Makefile index 0871580ce..374141898 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,7 @@ CPPFLAGS = #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 CFLAGS = -g -Wall -O2 -fvisibility=hidden EXTRA_CFLAGS_PIC = -fpic +TARGET_CFLAGS = LDFLAGS = -fvisibility=hidden LIBS = $(htslib_default_libs) @@ -85,6 +86,7 @@ BUILT_TEST_PROGRAMS = \ test/test_realn \ test/test-regidx \ test/test_str2int \ + test/test_time_funcs \ test/test_view \ test/test_index \ test/test-vcf-api \ @@ -93,7 +95,8 @@ BUILT_TEST_PROGRAMS = \ test/fuzz/hts_open_fuzzer.o \ test/test-bcf-translate \ test/test-parse-reg \ - test/test_introspection + test/test_introspection \ + test/test-bcf_set_variant_type BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -114,10 +117,16 @@ ALL_CPPFLAGS = -I. $(CPPFLAGS) htscodecs.mk: echo '# Default htscodecs.mk generated by Makefile' > $@ echo 'include $$(HTSPREFIX)htscodecs_bundled.mk' >> $@ + $(srcdir)/hts_probe_cc.sh '$(CC)' '$(CFLAGS) $(CPPFLAGS)' '$(LDFLAGS)' >> $@ srcdir = . srcprefix = HTSPREFIX = + +HTS_CFLAGS_AVX2 = +HTS_CFLAGS_AVX512 = +HTS_CFLAGS_SSE4 = + include htslib_vars.mk include htscodecs.mk @@ -131,8 +140,8 @@ LIBHTS_SOVERSION = 3 # is not strictly necessary and should be removed the next time # LIBHTS_SOVERSION is bumped (see #1144 and # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) -MACH_O_COMPATIBILITY_VERSION = 3.1.15 -MACH_O_CURRENT_VERSION = 3.1.15 +MACH_O_COMPATIBILITY_VERSION = 3.1.16 +MACH_O_CURRENT_VERSION = 3.1.16 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. @@ -161,10 +170,10 @@ config_vars.h: .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ @@ -226,6 +235,7 @@ bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h) header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h) hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h) hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) +hts_time_funcs_h = hts_time_funcs.h sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) @@ -253,6 +263,20 @@ config.h: echo '#endif' >> $@ echo '#define HAVE_DRAND48 1' >> $@ echo '#define HAVE_LIBCURL 1' >> $@ + if [ "x$(HTS_CFLAGS_SSE4)" != "x" ] ; then \ + echo '#define HAVE_POPCNT 1' >> $@ ; \ + echo '#define HAVE_SSE4_1 1' >> $@ ; \ + echo '#define HAVE_SSSE3 1' >> $@ ; \ + echo '#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0' >> $@ ; \ + echo '#define UBSAN 1' >> $@ ; \ + echo '#endif' >> $@ ; \ + fi + if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \ + echo '#define HAVE_AVX2 1' >> $@ ; \ + fi + if [ "x$(HTS_CFLAGS_AVX512)" != "x" ] ; then \ + echo '#define HAVE_AVX512 1' >> $@ ; \ + fi # And similarly for htslib.pc.tmp ("pkg-config template"). No dependency # on htslib.pc.in listed, as if that file is newer the usual way to regenerate @@ -293,6 +317,9 @@ endif BUILT_PLUGINS = $(PLUGIN_OBJS:.o=$(PLUGIN_EXT)) +ifneq "$(BUILT_PLUGINS)" "" +plugins: lib-shared +endif plugins: $(BUILT_PLUGINS) @@ -302,6 +329,10 @@ libhts.a: $(LIBHTS_OBJS) -$(RANLIB) $@ print-config: + @echo HTS_CFLAGS_AVX2 = $(HTS_CFLAGS_AVX2) + @echo HTS_CFLAGS_AVX512 = $(HTS_CFLAGS_AVX512) + @echo HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSE4) + @echo HTS_HAVE_NEON = $(HTS_HAVE_NEON) @echo LDFLAGS = $(LDFLAGS) @echo LIBHTS_OBJS = $(LIBHTS_OBJS) @echo LIBS = $(LIBS) @@ -399,9 +430,9 @@ hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) -hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) +hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(hts_time_funcs_h) hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) -hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) +hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) @@ -423,7 +454,7 @@ textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstr cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) -cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) +cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) @@ -435,14 +466,24 @@ cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/st thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) -htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: htscodecs/htscodecs/htscodecs.c $(htscodecs_htscodecs_h) $(htscodecs_version_h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) -htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr.o htscodecs/htscodecs/rANS_static32x16pr.pico: htscodecs/htscodecs/rANS_static32x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: htscodecs/htscodecs/rANS_static32x16pr_avx2.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_permute_h) +htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: htscodecs/htscodecs/rANS_static32x16pr_avx512.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_neon.o htscodecs/htscodecs/rANS_static32x16pr_neon.pico: htscodecs/htscodecs/rANS_static32x16pr_neon.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: htscodecs/htscodecs/rANS_static32x16pr_sse4.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) $(htscodecs_rANS_static32x16pr_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h) -htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) +htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/utils.o htscodecs/htscodecs/utils.pico: htscodecs/htscodecs/utils.c config.h $(htscodecs_utils_h) +# Extra CFLAGS for specific files +htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX2) +htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX512) +htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: TARGET_CFLAGS = $(HTS_CFLAGS_SSE4) bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread @@ -525,16 +566,21 @@ SRC = $(srcprefix) # # If using MSYS, avoid poor shell expansion via: # MSYS2_ARG_CONV_EXCL="*" make check -check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODECS_TEST_TARGETS) +check test: all $(HTSCODECS_TEST_TARGETS) test/hts_endian test/test_expr test/test_kfunc test/test_kstring test/test_str2int + test/test_time_funcs test/fieldarith test/fieldarith.sam test/hfile - HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR) - HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR) + if test "x$(BUILT_PLUGINS)" != "x"; then \ + HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR); \ + fi + if test "x$(BUILT_PLUGINS)" != "x"; then \ + HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR); \ + fi test/test_bgzf test/bgziptest.txt test/test-parse-reg -t test/colons.bam cd test/sam_filter && ./filter.sh filter.tst @@ -597,6 +643,9 @@ test/test-parse-reg: test/test-parse-reg.o libhts.a test/test_str2int: test/test_str2int.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_str2int.o libhts.a $(LIBS) -lpthread +test/test_time_funcs: test/test_time_funcs.o + $(CC) $(LDFLAGS) -o $@ test/test_time_funcs.o + test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread @@ -618,6 +667,9 @@ test/test-bcf-translate: test/test-bcf-translate.o libhts.a test/test_introspection: test/test_introspection.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread +test/test-bcf_set_variant_type: test/test-bcf_set_variant_type.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test-bcf_set_variant_type.o libhts.a $(LIBS) -lpthread + # Extra tests for bundled htscodecs test_htscodecs_rans4x8: htscodecs/tests/rans4x8 cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x8.test @@ -685,6 +737,7 @@ test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_s test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) +test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) @@ -692,6 +745,7 @@ test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) +test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c test/thrash_threads1: test/thrash_threads1.o libhts.a diff --git a/NEWS b/NEWS index 83c9978c4..7ae851a47 100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,172 @@ +Noteworthy changes in release 1.16 (18th August 2022) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Make hfile_s3 refresh AWS credentials on expiry in order to make HTSlib work + better with AWS IAM credentials, which have a limited lifespan. + (PR#1462 and PR#1474, addresses #344) + +* Allow BAM headers between 2GB and 4GB in size once more. This is not + permitted in the BAM specification but was allowed in an earlier version of + HTSlib. There is now a warning at 2GB and a hard failure at 4GB. + (PR#1421, fixes #1420 and samtools#1613. Reported by John Marshall and + R C Mueller) + +* Improve error message when failing to load an index. + (PR#1468, example of the problem samtools#1637) + +* Permit MM (base modification) tags containing "." and "?" suffixes. These + define implicit vs explicit coordinates. See the SAM tags specification for + details. + (PR#1423 and PR#1426, fixes #1418. PR#1469, fixes #1466. Reported + by cjw85) + +* Warn if spaces instead of tabs are detected in a VCF file to prevent + confusion. + (PR#1328, fixes bcftools#1575. Reported by ketkijoshi278) + +* Add an "sclen" filter expression keyword. This is the length of a soft-clip, + both left and right end. It may be combined with qlen (qlen-sclen) to obtain + the number of bases in the query sequence that have been aligned to the genome + ie it provides a way to compare local-alignment vs global-alignment length. + (PR#1441 and PR/samtools#1661, fixes #1436. Requested by Chang Y) + +* Improve error messages for CRAM reference mismatches. If the user specifies + the wrong reference, the CRAM slice header MD5sum checks fail. We now report + the SQ line M5 string too so it is possible to validate against the whole + chr in the ref.fa file. The error message has also been improved to report + the reference name instead of #num. Finally, we now hint at the likely cause, + which counters the misleading samtools supplied error of "truncated or + corrupt" file. + (PR#1427, fixes samtools#1640. Reported by Jian-Guo Zhou) + +* Expose more of the CRAM API and add new functionality to extract the reference + from a CRAM file. + (PR#1429 and PR#1442) + +* Improvements to the implementation of embedded references in CRAM where no + external reference is specified. + (PR#1449, addresses some of the issues in #1445) + +* The CRAM writer now allows alignment records with RG:Z: aux tags that + don't have a corresponding @RG ID in the file header. Previously these + tags would have been silently dropped. HTSlib will complain whenever it + has to add one though, as such tags do not conform to recommended practice + for the SAM, BAM and CRAM formats. + (PR#1480, fixes #1479. Reported by Alex Leonard) + +* Set tab delimiter in man page for tabix GFF3 sort. + (PR#1457. Thanks to Colin Diesh) + +* When using libdeflate, the 1...9 scale of BGZF compression levels is + now remapped to the 1...12 range used by libdeflate instead of being + passed directly. In particular, HTSlib levels 8 and 9 now map to + libdeflate levels 10 and 12, so it is possible to select the highest (but + slowest) compression offered by libdeflate. + (PR#1488, fixes #1477. Reported by Gert Hulselmans) + +* The VCF variant API has been extended so that it can return separate flags + for INS and DEL variants as well as the existing INDEL one. These flags + have not been added to the old bcf_get_variant_types() interface as + it could break existing users. To access them, it is necessary to use new + functions bcf_has_variant_type() and bcf_has_variant_types(). + (PR#1467) + +* The missing, but trivial, `le_to_u8()` function has been added to hts_endian. + (PR#1494, Thanks to John Marshall) + +* bcf_format_gt() now works properly on big-endian platforms. + (PR#1495, Thanks to John Marshall) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Update htscodecs to version 1.3.0 for new SIMD code + various fixes. + Updates the htscodecs submodule and adds changes necessary to make HTSlib + build the new SIMD codec implementations. + (PR#1438, PR#1489, PR#1500) + +* Fix clang builds under mingw. Under mingw, clang requires dllexport to be + applied to both function declarations and function definitions. + (PR#1435, PR#1497, PR#1498 fixes #1433. Reported by teepean) + +* Fix curl type warning with gcc 12.1 on Windows. + (PR#1443) + +* Detect ARM Neon support and only build appropriate SIMD object files. + (PR#1451, fixes #1450. Thanks to John Marshall) + +* `make print-config` now reports extra CFLAGS that are needed to build the + SIMD parts of htscodecs. These may be of use to third-party build + systems that don't use HTSlib's or htscodecs' build infrastructure. (PR#1485. + Thanks to John Marshall) + +* Fixed some Makefile dependency issues for the "check"/"test" targets + and plugins. In particular, "make check" will now build the "all" target, + if not done already, before running the tests. + (PR#1496) + +Bug fixes +--------- + +* Fix bug when reading position -1 in BCF (0 in VCF), which is used to indicate + telomeric regions. The BCF reader was incorrectly assuming the value stored + in the file was unsigned, so a VCF->BCF->VCF round-trip would change it + from 0 to 4294967296. + (PR#1476, fixes #1475 and bcftools#1753. Reported by Rodrigo Martin) + +* Various bugs and quirks have been fixed in the filter expression engine, + mostly related to the handling of absent tags, and the is_true flag. + Note that as a result of these fixes, some filter expressions may give + different results: + - Fixed and-expressions including aux tag values which could give an invalid + true result depending on the order of terms. + - The expression `![NM]` is now true if only `NM` does not exist. In + earlier versions it would also report true for tags like `NM:i:0` which + exist but have a value of zero. + - The expression `[X1] != 0` is now false when `X1` does not exist. Earlier + versions would return true for this comparison when the tag was missing. + - NULL values due to missing tags now propagate through string, bitwise + and mathematical operations. Logical operations always treat them as + false. + (PR#1463, fixes samtools#1670. Reported by Gert Hulselmans; + PR#1478, fixes samtools#1677. Reported by johnsonzcode) + +* Fix buffer overrun in bam_plp_insertion_mod. Memory now grows to the proper + size needed for base modification data. + (PR#1430, fixes samtools#1652. Reported by hd2326) + +* Remove limit of returned size from fai_retrieve(). + (PR#1446, fixes samtools#1660. Reported by Shane McCarthy) + +* Cap hts_getline() return value at INT_MAX. Prevents hts_getline() from + returning a negative number (a fail) for very long string length values. + (PR#1448. Thanks to John Marshall) + +* Fix breakend detection and test bcf_set_variant_type(). + (PR#1456, fixes #1455. Thanks to Martin Pollard) + +* Prevent arrays of BCF_BT_NULL values found in BCF files from causing + bcf_fmt_array() to call exit() as the type is unsupported. These are + now tested for and caught by bcf_record_check(), which returns an + error code instead. (PR#1486) + +* Improved detection of fasta and fastq files that have very long comments + following identifiers. (PR#1491, thanks to John Marshall. + Fixes samtools/samtools#1689, reported by cjw85) + +* Fixed a SEGV triggered by giving a SAM file to `samtools import`. + (PR#1492) + Noteworthy changes in release 1.15.1 (7th April 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Security fix: Fixed broken error reporting in the sam_cap_mapq() +* Security fix: Fixed broken error reporting in the sam_prob_realn() function, due to a missing hts_log() parameter. Prior to this fix - it was possible to abuse the log message format string by passing - a specially crafted alignment record to this function. (PR#1406) + (i.e., in HTSlib versions 1.8 to 1.15) it was possible to abuse + the log message format string by passing a specially crafted + alignment record to this function. (PR#1406) * HTSlib now uses libhtscodecs release 1.2.2. This fixes a number of bugs where invalid compressed data could trigger usage of @@ -431,7 +593,7 @@ These are compiler, configuration and makefile based changes. compiler flags. Thanks to John Marshall. (#1187) * Added 'fall through' comments to prevent warnings issued by Clang on - intentional fall through case statements, when building with + intentional fall through case statements, when building with `-Wextra flag`. Thanks to John Marshall. (#1163) * Non-configure builds now define _XOPEN_SOURCE=600 to allow them to work @@ -455,7 +617,7 @@ Bug fixes CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196) * Fixed a tabix bug that prevented setting the correct number of lines to be - skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; + skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; fixed #1186) * Made `bam_itr_next` an alias for `sam_itr_next`, to prevent it from crashing @@ -719,7 +881,7 @@ Bug fixes * Fixed potential integer overflows in the VCF parser and ensured that the total length of FORMAT fields cannot go over 2Gbytes. [fuzz] (#1044, - #1104; latter is CVE-2020-36403 affecting HTSlib versions 1.10 to 1.10.2) + #1104; latter is CVE-2020-36403 affecting all HTSlib versions up to 1.10.2) * Download index files atomically in idx_test_and_fetch(). This prevents corruption when running parallel jobs on S3 files. Thanks to John Marshall. @@ -1375,7 +1537,7 @@ Noteworthy changes in release 1.8 (3rd April 2018) Noteworthy changes in release 1.7 (26th January 2018) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* BAM: HTSlib now supports BAMs which include CIGARs with more than +* BAM: HTSlib now supports BAMs which include CIGARs with more than 65535 operations as per HTS-Specs 18th November (dab57f4 and 2f915a8). * BCF/VCF: @@ -1393,13 +1555,13 @@ Noteworthy changes in release 1.7 (26th January 2018) (#651, #653; reported by Imran Haque and @egafni via pysam). * Multi-region iterator: The new structure takes a list of regions and - iterates over all, deduplicating reads in the process, and producing a - full list of file offset intervals. This is usually much faster than + iterates over all, deduplicating reads in the process, and producing a + full list of file offset intervals. This is usually much faster than repeatedly using the old single-region iterator on a series of regions. * Curl improvements: - Add Bearer token support via HTS_AUTH_LOCATION env (#600). - - Use CURL_CA_BUNDLE environment variable to override the CA (#622; + - Use CURL_CA_BUNDLE environment variable to override the CA (#622; thanks to Garret Kelly & David Alexander). - Speed up (removal of excessive waiting) for both http(s) and ftp. - Avoid repeatedly reconnecting by removal of unnecessary seeks. @@ -1408,7 +1570,7 @@ Noteworthy changes in release 1.7 (26th January 2018) * BGZF block caching, if enabled, now performs far better (#629; reported by Ram Yalamanchili). -* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas +* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas Hickman). * Tidied up the drand48 support (intended for systems that do not @@ -1506,7 +1668,7 @@ Release 1.4 (13 March 2017) * HTSlib now links against libbz2 and liblzma by default. To remove these dependencies, run configure with options --disable-bz2 and --disable-lzma, - but note that this may make some CRAM files produced elsewhere unreadable. + but note that this may make some CRAM files produced elsewhere unreadable. * Added a thread pool interface and replaced the bgzf multi-threading code to use this pool. BAM and CRAM decoding is now multi-threaded diff --git a/bgzf.c b/bgzf.c index e72ed566d..a969b1567 100644 --- a/bgzf.c +++ b/bgzf.c @@ -574,6 +574,8 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le } else { level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default // NB levels go up to 12 here. + int lvl_map[] = {0,1,2,3,5,6,7,8,10,12}; + level = lvl_map[level>9 ?9 :level]; struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); if (!z) return -1; diff --git a/bgzip.1 b/bgzip.1 index c2a319156..1ada36630 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -1,4 +1,4 @@ -.TH bgzip 1 "7 April 2022" "htslib-1.15.1" "Bioinformatics tools" +.TH bgzip 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools" .SH NAME .PP bgzip \- Block compression/decompression utility diff --git a/config.mk.in b/config.mk.in index f8decf0a2..82af49850 100644 --- a/config.mk.in +++ b/config.mk.in @@ -112,3 +112,9 @@ LDFLAGS += $(noplugin_LDFLAGS) LIBS += $(noplugin_LIBS) endif + +# Extra CFLAGS for specific files +HTS_CFLAGS_AVX2 = @hts_cflags_avx2@ +HTS_CFLAGS_AVX512 = @hts_cflags_avx512@ +HTS_CFLAGS_SSE4 = @hts_cflags_sse4@ +HTS_HAVE_NEON = @hts_have_neon@ diff --git a/configure.ac b/configure.ac index 1216ecc21..7d40948a6 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2021 Genome Research Ltd. +# Copyright (C) 2015-2022 Genome Research Ltd. # # Author: John Marshall # @@ -30,6 +30,7 @@ AC_CONFIG_SRCDIR(hts.c) AC_CONFIG_HEADERS(config.h) m4_include([m4/hts_prog_cc_warnings.m4]) +m4_include([m4/ax_check_compile_flag.m4]) m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) @@ -69,6 +70,80 @@ dnl Flags to treat warnings as errors. These need to be applied to CFLAGS dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) HTS_PROG_CC_WERROR(hts_late_cflags) +dnl Check for various compiler flags to enable SIMD features +dnl Options for rANS32x16 sse4.1 version +AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [ + hts_cflags_sse4="-mssse3 -mpopcnt -msse4.1" + AC_SUBST([hts_cflags_sse4]) + AC_DEFINE([HAVE_SSSE3],1, + [Defined to 1 if the compiler can issue SSSE3 instructions.]) + AC_DEFINE([HAVE_POPCNT],1, + [Defined to 1 if the compiler can issue popcnt instructions.]) + AC_DEFINE([HAVE_SSE4_1],1, + [Defined to 1 if the compiler can issue SSE4.1 instructions.]) +dnl Propagate HTSlib's unaligned access preference to htscodecs + AH_VERBATIM([UBSAN],[ +/* Prevent unaligned access in htscodecs SSE4 rANS codec */ +#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0 +#undef UBSAN +#endif + ]) + AC_DEFINE([UBSAN],1,[]) + ], [], [], [AC_LANG_PROGRAM([[ + #include "x86intrin.h" + ]],[[ + unsigned int i = _mm_popcnt_u32(1); + __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + b = _mm_shuffle_epi8(a, c); + return *((char *) &b); + ]])]) + +dnl Options for rANS32x16 avx2 version +AX_CHECK_COMPILE_FLAG([-mavx2], [ + hts_cflags_avx2="-mavx2" + AC_SUBST([hts_cflags_avx2]) + AC_DEFINE([HAVE_AVX2],1, + [Defined to 1 if the compiler can issue AVX2 instructions.]) + ], [], [], [AC_LANG_PROGRAM([[ + #include "x86intrin.h" + ]],[[ + __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + __m256i b = _mm256_add_epi32(a, a); + long long c = _mm256_extract_epi64(b, 0); + return (int) c; + ]])]) + +dnl Options for rANS32x16 avx512 version +AX_CHECK_COMPILE_FLAG([-mavx512f], [ + hts_cflags_avx512="-mavx512f" + AC_SUBST([hts_cflags_avx512]) + AC_DEFINE([HAVE_AVX512],1, + [Defined to 1 if the compiler can issue AVX512 instructions.]) + ], [], [], [AC_LANG_PROGRAM([[ + #include "x86intrin.h" + ]],[[ + __m512i a = _mm512_set1_epi32(1); + __m512i b = _mm512_add_epi32(a, a); + return *((char *) &b); + ]])]) + +dnl Detect ARM Neon availability +AC_CACHE_CHECK([whether C compiler supports ARM Neon], [hts_cv_have_neon], [ + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([[ + #include "arm_neon.h" + ]], [[ + int32x4_t a = vdupq_n_s32(1); + int32x4_t b = vaddq_s32(a, a); + return *((char *) &b); + ]])], [hts_cv_have_neon=yes], [hts_cv_have_neon=no])]) +if test "$hts_cv_have_neon" = yes; then + hts_have_neon=yes + AC_SUBST([hts_have_neon]) +fi + + dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check dnl for pkg-config... diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 9f112863e..33e1b5bf8 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -2030,10 +2030,10 @@ static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) { int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz); if (!(b->data = malloc(out_sz))) return -1; - rle_decode(lit_dat, lit_sz, - len_dat+nb, len_sz-nb, - rle_syms, rle_nsyms, - b->data, &out_sz); + hts_rle_decode(lit_dat, lit_sz, + len_dat+nb, len_sz-nb, + rle_syms, rle_nsyms, + b->data, &out_sz); b->uncomp_size = out_sz; return 0; @@ -2200,10 +2200,10 @@ int cram_xrle_encode_flush(cram_codec *c) { int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size); - out_lit = rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size, - out_len+nb, &out_len_size, - rle_syms, &rle_nsyms, - NULL, &out_lit_size); + out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size, + out_len+nb, &out_len_size, + rle_syms, &rle_nsyms, + NULL, &out_lit_size); out_len_size += nb; diff --git a/cram/cram_decode.c b/cram/cram_decode.c index b352fc633..1f8d60f12 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2037,12 +2037,30 @@ static int cram_decode_aux(cram_fd *fd, m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); if (!m) return -1; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); if (!m->codec) return -1; r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); if (r) break; cr->aux_size += out_sz + 3; + + // cF CRAM flags. + if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) { + // Remove cF tag + uint8_t cF = BLOCK_END(s->aux_blk)[-1]; + BLOCK_SIZE(s->aux_blk) -= out_sz+3; + cr->aux_size -= out_sz+3; + + // bit 1 => don't auto-decode MD. + // Pretend MD is present verbatim, so we don't auto-generate + if ((cF & 1) && has_MD && *has_MD == 0) + *has_MD = 1; + + // bit 1 => don't auto-decode NM + if ((cF & 2) && has_NM && *has_NM == 0) + *has_NM = 1; + } } } @@ -2423,10 +2441,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if ((!s->ref && s->hdr->ref_base_id < 0) || memcmp(digest, s->hdr->md5, 16) != 0) { char M[33]; - hts_log_error("MD5 checksum reference mismatch at #%d:%d-%d", - ref_id, s->ref_start, s->ref_end); - hts_log_error("CRAM: %s", md5_print(s->hdr->md5, M)); - hts_log_error("Ref : %s", md5_print(digest, M)); + const char *rname = sam_hdr_tid2name(sh, ref_id); + if (!rname) rname="?"; // cannot happen normally + hts_log_error("MD5 checksum reference mismatch at %s:%d-%d", + rname, s->ref_start, s->ref_end); + hts_log_error("CRAM : %s", md5_print(s->hdr->md5, M)); + hts_log_error("Ref : %s", md5_print(digest, M)); + kstring_t ks = KS_INITIALIZE; + if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0) + hts_log_error("@SQ M5: %s", ks.s); + hts_log_error("Please check the reference given is correct"); + ks_free(&ks); return -1; } } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index d35643a92..1ba1988f4 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -42,12 +42,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram.h" #include "os.h" #include "../sam_internal.h" // for nibble2base #include "../htslib/hts.h" #include "../htslib/hts_endian.h" +#include "../textutils_internal.h" KHASH_MAP_INIT_STR(m_s2u64, uint64_t) @@ -58,7 +60,8 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t) static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum, kstring_t *MD); + bam_seq_t *b, int rnum, kstring_t *MD, + int embed_ref); /* * Returns index of val into key. @@ -78,7 +81,8 @@ static int sub_idx(char *key, char val) { * NULL on failure */ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h) { + cram_block_compression_hdr *h, + int embed_ref) { cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); int i, mc, r = 0; @@ -158,7 +162,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, kh_val(h->preservation_map, k).i = h->qs_seq_orient; } - if (fd->no_ref || fd->embed_ref) { + if (fd->no_ref || embed_ref>0) { // Reference Required == No k = kh_put(map, h->preservation_map, "RR", &r); if (-1 == r) return NULL; @@ -1075,14 +1079,12 @@ static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) { * -1 on failure */ static int cram_encode_slice(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h, cram_slice *s) { + cram_block_compression_hdr *h, cram_slice *s, + int embed_ref) { int rec, r = 0; int64_t last_pos; - int embed_ref; enum cram_DS_ID id; - embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0; - /* * Slice external blocks: * ID 0 => base calls (insertions, soft-clip) @@ -1095,7 +1097,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, */ /* Create cram slice header */ - s->hdr->ref_base_id = embed_ref && s->hdr->ref_seq_span > 0 + s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0 ? DS_ref : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1); s->hdr->record_counter = c->num_records + c->record_counter; @@ -1123,7 +1125,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, } // Embedded reference - if (embed_ref) { + if (embed_ref>0) { if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref))) return -1; s->ref_id = DS_ref; // needed? @@ -1400,6 +1402,292 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, // CRAM version >= 3.1 #define CRAM_ge31(v) ((v) >= 0x301) +// Returns the next cigar op code: one of the BAM_C* codes, +// or -1 if no more are present. +static inline +int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos, + uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) { + for(;;) { + while (*cig_len == 0) { + if (*cig_ind < ncigar) { + *cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK; + *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT; + (*cig_ind)++; + } else { + return -1; + } + } + + if (skip[*cig_op]) { + *spos += (bam_cigar_type(*cig_op)&1) * *cig_len; + *cig_len = 0; + continue; + } + + (*cig_len)--; + break; + } + + return *cig_op; +} + +// Ensure ref and hist are large enough. +static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, + hts_pos_t ref_start, hts_pos_t *ref_end) { + if (pos < ref_start) + return -1; + if (pos < *ref_end) + return 0; + + // realloc + hts_pos_t old_end = *ref_end ? *ref_end : ref_start; + hts_pos_t new_end = *ref_end = ref_start + 1000 + (pos-ref_start)*1.5; + + char *tmp = realloc(*ref, *ref_end-ref_start); + if (!tmp) + return -1; + *ref = tmp; + + uint32_t (*tmp5)[5] = realloc(**hist, + (*ref_end - ref_start)*sizeof(**hist)); + if (!tmp5) + return -1; + *hist = tmp5; + *ref_end = new_end; + + // initialise + old_end -= ref_start; + new_end -= ref_start; + memset(&(*ref)[old_end], 0, new_end-old_end); + memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist)); + + return 0; +} + +// Walk through MD + seq to generate ref +static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], + hts_pos_t ref_start, hts_pos_t *ref_end, + const uint8_t *MD) { + uint8_t *seq = bam_get_seq(b); + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + uint32_t cig_op = 0, cig_len = 0, cig_ind = 0; + + int iseq = 0, next_op; + hts_pos_t iref = b->core.pos - ref_start; + + // Skip INS, REF_SKIP, *CLIP, PAD. and BACK. + static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1}; + while (iseq < b->core.l_qseq && *MD) { + if (isdigit(*MD)) { + // match + int overflow = 0; + int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow); + if (overflow || + extend_ref(ref, hist, iref+ref_start + len, + ref_start, ref_end) < 0) + return -1; + while (iseq < b->core.l_qseq && len) { + // rewrite to have internal loops? + if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && + next_op != BAM_CEQUAL) { + hts_log_info("MD:Z and CIGAR are incompatible for " + "record %s", bam_get_qname(b)); + return -1; + } + + // Short-cut loop over same cigar op for efficiency + cig_len++; + do { + cig_len--; + (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)]; + iseq++; + len--; + } while (cig_len && iseq < b->core.l_qseq && len); + } + if (len > 0) + return -1; // MD is longer than seq + } else if (*MD == '^') { + // deletion + MD++; + while (isalpha(*MD)) { + if (extend_ref(ref, hist, iref+ref_start, ref_start, + ref_end) < 0) + return -1; + if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CDEL) { + hts_log_info("MD:Z and CIGAR are incompatible"); + return -1; + } + + (*ref)[iref++] = *MD++ & ~0x20; + } + } else { + // substitution + if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0) + return -1; + if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) { + hts_log_info("MD:Z and CIGAR are incompatible"); + return -1; + } + + (*ref)[iref++] = *MD++ & ~0x20; + iseq++; + } + } + + return 1; +} + +// Append a sequence to a ref/consensus structure. +// We maintain both an absolute refefence (ACGTN where MD:Z is +// present) and a 5-way frequency array for when no MD:Z is known. +// We then subsequently convert the 5-way frequencies to a consensus +// ref in a second pass. +// +// Returns >=0 on success, +// -1 on failure (eg inconsistent data) +static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], + hts_pos_t ref_start, hts_pos_t *ref_end) { + const uint8_t *MD = bam_aux_get(b, "MD"); + int ret = 0; + if (MD && *MD == 'Z') { + // We can use MD to directly compute the reference + int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1); + + if (ret > 0) + return ret; + } + + // Otherwise we just use SEQ+CIGAR and build a consensus which we later + // turn into a fake reference + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + uint32_t i, j; + hts_pos_t iseq = 0, iref = b->core.pos - ref_start; + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < ncigar; i++) { + switch (bam_cigar_op(cigar[i])) { + case BAM_CSOFT_CLIP: + case BAM_CINS: + iseq += bam_cigar_oplen(cigar[i]); + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: { + int len = bam_cigar_oplen(cigar[i]); + // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4 + static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4}; + + if (extend_ref(ref, hist, iref+ref_start + len, + ref_start, ref_end) < 0) + return -1; + if (iseq + len <= b->core.l_qseq) { + // Nullify failed MD:Z if appropriate + if (ret < 0) + memset(&(*ref)[iref], 0, len); + + for (j = 0; j < len; j++, iref++, iseq++) + (*hist)[iref][L16[bam_seqi(seq, iseq)]]++; + } else { + // Probably a 2ndary read with seq "*" + iseq += len; + iref += len; + } + break; + } + + case BAM_CDEL: + case BAM_CREF_SKIP: + iref += bam_cigar_oplen(cigar[i]); + } + } + + return 1; +} + +// Automatically generates the reference and stashed it in c->ref, also +// setting c->ref_start and c->ref_end. +// +// If we have MD:Z tags then we use them to directly infer the reference, +// along with SEQ + CIGAR. Otherwise we use SEQ/CIGAR only to build up +// a consensus and then assume the reference as the majority rule. +// +// In this latter scenario we need to be wary of auto-generating MD and NM +// during decode, but that's handled elsewhere via an additional aux tag. +// +// Returns 0 on success, +// -1 on failure +static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { + // TODO: if we can find an external reference then use it, even if the + // user told us to do embed_ref=2. + char *ref = NULL; + uint32_t (*hist)[5] = NULL; + hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0; + + // initial allocation + if (extend_ref(&ref, &hist, + c->bams[r1 + s->hdr->num_records-1]->core.pos + + c->bams[r1 + s->hdr->num_records-1]->core.l_qseq, + ref_start, &ref_end) < 0) + return -1; + + // Add each bam file to the reference/consensus arrays + int r2; + hts_pos_t last_pos = -1; + for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) { + if (c->bams[r1]->core.pos < last_pos) { + hts_log_error("Cannot build reference with unsorted data"); + goto err; + } + last_pos = c->bams[r1]->core.pos; + if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0) + goto err; + } + + // Compute the consensus + hts_pos_t i; + for (i = 0; i < ref_end-ref_start; i++) { + if (!ref[i]) { + int max_v = 0, max_j = 4, j; + for (j = 0; j < 4; j++) + // don't call N (j==4) unless no coverage + if (max_v < hist[i][j]) + max_v = hist[i][j], max_j = j; + ref[i] = "ACGTN"[max_j]; + } + } + free(hist); + + // Put the reference in place so it appears to be an external + // ref file. + c->ref = ref; + c->ref_start = ref_start+1; + c->ref_end = ref_end+1; + + return 0; + + err: + free(ref); + free(hist); + return -1; +} + /* * Encodes all slices in a container into blocks. * Returns 0 on success @@ -1410,7 +1698,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { cram_block_compression_hdr *h = c->comp_hdr; cram_block *c_hdr; int multi_ref = 0; - int r1, r2, sn, nref; + int r1, r2, sn, nref, embed_ref; spare_bams *spares; if (CRAM_MAJOR_VERS(fd->version) == 1) @@ -1422,6 +1710,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Cache references up-front if we have unsorted access patterns */ pthread_mutex_lock(&fd->ref_lock); nref = fd->refs->nref; + embed_ref = fd->embed_ref; pthread_mutex_unlock(&fd->ref_lock); if (!fd->no_ref && c->refs_used) { @@ -1438,19 +1727,41 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { goto_err; bam_seq_t *b = c->bams[0]; - char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); - if (!ref && bam_ref(b) >= 0) { - hts_log_error("Failed to load reference #%d", bam_ref(b)); - return -1; - } - if ((c->ref_id = bam_ref(b)) >= 0) { - c->ref_seq_id = c->ref_id; - c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; - c->ref_start = 1; - c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + if (embed_ref <= 1) { + char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); + if (!ref && bam_ref(b) >= 0) { + if (c->multi_seq || embed_ref == 0 || !c->pos_sorted) { + hts_log_error("Failed to load reference #%d", bam_ref(b)); + return -1; + } + hts_log_warning("Failed to load reference #%d", bam_ref(b)); + hts_log_warning("Enabling embed_ref=2 mode to auto-generate" + " reference"); + if (embed_ref <= 0) + hts_log_warning("NOTE: the CRAM file will be bigger than" + " using an external reference"); + pthread_mutex_lock(&fd->ref_lock); + embed_ref = fd->embed_ref = 2; + pthread_mutex_unlock(&fd->ref_lock); + goto auto_ref; + } + if ((c->ref_id = bam_ref(b)) >= 0) { + c->ref_seq_id = c->ref_id; + c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; + c->ref_start = 1; + c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + } } else { - c->ref_seq_id = c->ref_id; // FIXME remove one var! + auto_ref: + // Auto-embed ref. + // This starts as 'N' and is amended on-the-fly as we go + // based on MD:Z tags. + if ((c->ref_id = bam_ref(b)) >= 0) { + c->ref_free = 1; + c->ref = NULL; + } } + c->ref_seq_id = c->ref_id; } else { c->ref_id = bam_ref(c->bams[0]); cram_ref_incr(fd->refs, c->ref_id); @@ -1477,6 +1788,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // is done within process_one_read(). kstring_t MD = {0}; + // Embed consensus / MD-generated ref + if (embed_ref == 2) { + if (cram_generate_reference(c, s, r1) < 0) { + hts_log_error("Failed to build reference"); + return -1; + } + } + // Iterate through records creating the cram blocks for some // fields and just gathering stats for others. for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) { @@ -1504,7 +1823,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (process_one_read(fd, c, s, cr, b, r2, &MD) != 0) { + if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref) != 0) { free(MD.s); return -1; } @@ -1515,6 +1834,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (last_base < cr->aend) last_base = cr->aend; } + free(MD.s); // Process_one_read doesn't add read names as it can change @@ -1891,7 +2211,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { for (i = 0; i < c->curr_slice; i++) { hts_log_info("Encode slice %d", i); - if (cram_encode_slice(fd, c, h, c->slices[i]) != 0) + int local_embed_ref = + embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0; + if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0) return -1; } @@ -1906,7 +2228,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->AP_delta = c->pos_sorted; memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); - if (!(c_hdr = cram_encode_compression_header(fd, c, h))) + if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref))) return -1; } @@ -2212,15 +2534,17 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, * Encodes auxiliary data. Largely duplicated from above, but done so to * keep it simple and avoid a myriad of version ifs. * - * Returns the read-group parsed out of the BAM aux fields on success + * Returns the RG header line pointed to by the BAM aux fields on success, * NULL on failure or no rg present, also sets "*err" to non-zero */ -static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr, - int verbatim_NM, int verbatim_MD, - int NM, kstring_t *MD, - int *err) { - char *aux, *orig, *rg = NULL; +static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, + cram_container *c, + cram_slice *s, cram_record *cr, + int verbatim_NM, int verbatim_MD, + int NM, kstring_t *MD, int cf_tag, + int *err) { + char *aux, *orig; + sam_hrec_rg_t *brg = NULL; int aux_size = bam_get_l_aux(b); cram_block *td_b = c->comp_hdr->TD_blk; int TD_blk_size = BLOCK_SIZE(td_b), new; @@ -2231,17 +2555,41 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, orig = aux = (char *)bam_aux(b); + + // cF:i => Extra CRAM bit flags. + // 1: Don't auto-decode MD (may be invalid) + // 2: Don't auto-decode NM (may be invalid) + if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) { + // Temporary copy of aux so we can ammend it. + aux = malloc(aux_size+4); + if (!aux) + return NULL; + + memcpy(aux, orig, aux_size); + aux[aux_size++] = 'c'; + aux[aux_size++] = 'F'; + aux[aux_size++] = 'C'; + aux[aux_size++] = cf_tag; + orig = aux; + } + // Copy aux keys to td_b and aux values to slice aux blocks while (aux - orig < aux_size && aux[0] != 0) { int r; // RG:Z if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - if (CRAM_MAJOR_VERS(fd->version) >= 4) - BLOCK_APPEND(td_b, "RG*", 3); - continue; + char *rg = &aux[3]; + brg = sam_hrecs_find_rg(fd->header->hrecs, rg); + if (brg) { + while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "RG*", 3); + continue; + } else { + // RG:Z tag will be stored verbatim + hts_log_warning("Missing @RG header for RG \"%s\"", rg); + } } // MD:Z @@ -2593,11 +2941,17 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0) goto block_err; + if (orig != (char *)bam_aux(b)) + free(orig); + if (err) *err = 0; - return rg; + + return brg; err: block_err: + if (orig != (char *)bam_aux(b)) + free(orig); return NULL; } @@ -2722,6 +3076,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { return c; } + /* * Converts a single bam record into a cram record. * Possibly used within a thread. @@ -2731,9 +3086,10 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { */ static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum, kstring_t *MD) { + bam_seq_t *b, int rnum, kstring_t *MD, + int embed_ref) { int i, fake_qual = -1, NM = 0; - char *cp, *rg; + char *cp; char *ref, *seq, *qual; // Any places with N in seq and/or reference can lead to ambiguous @@ -2746,16 +3102,24 @@ static int process_one_read(cram_fd *fd, cram_container *c, // FIXME: multi-ref containers - ref = c->ref; cr->flags = bam_flag(b); cr->len = bam_seq_len(b); - if (!bam_aux_get(b, "MD")) + uint8_t *md; + if (!(md = bam_aux_get(b, "MD"))) MD = NULL; else MD->l = 0; + int cf_tag = 0; + + if (embed_ref == 2) { + cf_tag = MD ? 0 : 1; // No MD + cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM + } + //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg); + ref = c->ref ? c->ref - (c->ref_start-1) : NULL; cr->ref_id = bam_ref(b); if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0) goto block_err; @@ -3069,14 +3433,15 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); int err = 0; - rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); + sam_hrec_rg_t *brg = + cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, + cf_tag, &err); if (err) goto block_err; /* Read group, identified earlier */ - if (rg) { - sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, rg); - cr->rg = brg ? brg->id : -1; + if (brg) { + cr->rg = brg->id; } else if (CRAM_MAJOR_VERS(fd->version) == 1) { sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN"); if (!brg) goto block_err; @@ -3381,9 +3746,12 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { * The multi_seq var here refers to our intention for the next slice. * This slice has already been encoded so we output as-is. */ + pthread_mutex_lock(&fd->ref_lock); + int embed_ref = fd->embed_ref; + pthread_mutex_unlock(&fd->ref_lock); if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && fd->last_slice && fd->last_slice < c->max_rec/4+10 && - !fd->embed_ref) { + embed_ref<=0) { if (!c->multi_seq) hts_log_info("Multi-ref enabled for next container"); multi_seq = 1; @@ -3441,8 +3809,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->slice_rec = c->curr_rec; // Have we seen this reference before? - if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && - !fd->unsorted && multi_seq) { + if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && + embed_ref<=0 && !fd->unsorted && multi_seq) { if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); diff --git a/cram/cram_encode.h b/cram/cram_encode.h index 7cccae9af..03b8054e8 100644 --- a/cram/cram_encode.h +++ b/cram/cram_encode.h @@ -74,7 +74,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b); * NULL on failure */ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h); + cram_block_compression_hdr *h, + int embed_ref); /*! INTERNAL: * Encodes a slice compression header. diff --git a/cram/cram_external.c b/cram/cram_external.c index 314826932..e88ff838b 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2020 Genome Research Ltd. +Copyright (c) 2015, 2018-2020, 2022 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -188,6 +188,19 @@ int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) { return hdr->num_blocks; } +int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h) { + return h->ref_base_id; +} + +void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, + int *refid, hts_pos_t *start, hts_pos_t *span) { + if (refid) + *refid = h->ref_seq_id; + if (start) + *start = h->ref_seq_start; + if (span) + *span = h->ref_seq_span; +} /* *----------------------------------------------------------------------------- @@ -318,7 +331,7 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, return -1; if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0) return -1; - n_blk = cram_encode_compression_header(in, c, ch); + n_blk = cram_encode_compression_header(in, c, ch, in->embed_ref); cram_free_compression_header(ch); /* diff --git a/cram/cram_io.c b/cram/cram_io.c index c9dcb5014..5d01e1318 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1722,7 +1722,7 @@ int cram_uncompress_block(cram_block *b) { case TOK3: { uint32_t out_len; - uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); + uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len); if (!cp) return -1; b->orig_method = TOK3; @@ -1875,7 +1875,7 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, int lev = level; if (method == TOK3 && lev > 3) lev = 3; - uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL); + uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL); *out_size = out_len; return (char *)cp; } @@ -3561,7 +3561,7 @@ int cram_load_reference(cram_fd *fd, char *fn) { if (fn) { fd->refs = refs_load_fai(fd->refs, fn, - !(fd->embed_ref && fd->mode == 'r')); + !(fd->embed_ref>0 && fd->mode == 'r')); fn = fd->refs ? fd->refs->fn : NULL; if (!fn) ret = -1; @@ -3639,6 +3639,7 @@ cram_container *cram_new_container(int nrec, int nslice) { if (!(c->tags_used = kh_init(m_tagmap))) goto err; c->refs_used = 0; + c->ref_free = 0; return c; @@ -3711,6 +3712,9 @@ void cram_free_container(cram_container *c) { kh_destroy(m_tagmap, c->tags_used); } + if (c->ref_free) + free(c->ref); + free(c); } @@ -4820,7 +4824,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { } /* Fix M5 strings */ - if (fd->refs && !fd->no_ref) { + if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { int i; for (i = 0; i < hdr->hrecs->nref; i++) { sam_hrec_type_t *ty; @@ -4841,11 +4845,25 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { return -1; } rlen = fd->refs->ref_id[i]->length; - if (!(md5 = hts_md5_init())) - return -1; ref = cram_get_ref(fd, i, 1, rlen); - if (NULL == ref) return -1; + if (NULL == ref) { + if (fd->embed_ref == -1) { + // auto embed-ref + hts_log_warning("No M5 tags present and could not " + "find reference"); + hts_log_warning("Enabling embed_ref=2 option"); + hts_log_warning("NOTE: the CRAM file will be bigger " + "than using an external reference"); + pthread_mutex_lock(&fd->ref_lock); + fd->embed_ref = 2; + pthread_mutex_unlock(&fd->ref_lock); + break; + } + return -1; + } rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ + if (!(md5 = hts_md5_init())) + return -1; hts_md5_update(md5, ref, rlen); hts_md5_final(buf, md5); hts_md5_destroy(md5); @@ -5245,7 +5263,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->seqs_per_slice = SEQS_PER_SLICE; fd->bases_per_slice = BASES_PER_SLICE; fd->slices_per_container = SLICE_PER_CNT; - fd->embed_ref = 0; + fd->embed_ref = -1; // automatic selection fd->no_ref = 0; fd->ap_delta = 0; fd->ignore_md5 = 0; @@ -5392,7 +5410,7 @@ int cram_write_eof_block(cram_fd *fd) { // block CRC cram_block_compression_hdr ch; memset(&ch, 0, sizeof(ch)); - c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch); + c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0); c.length = c.comp_hdr_block->byte // Landmark[0] + 5 // block struct diff --git a/cram/cram_structs.h b/cram/cram_structs.h index ce27bc1a4..16739c2c6 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -473,6 +473,7 @@ struct cram_container { uint64_t s_num_bases; // number of bases in this slice uint32_t n_mapped; // Number of mapped reads + int ref_free; // whether 'ref' is owned by us and must be freed. }; /* diff --git a/faidx.c b/faidx.c index 4b25d3918..f3be5e57c 100644 --- a/faidx.c +++ b/faidx.c @@ -731,7 +731,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } s[l] = '\0'; - *len = l < INT_MAX ? l : INT_MAX; + *len = l; return s; } @@ -784,7 +784,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) { hts_pos_t len64; char *ret = fai_fetch64(fai, str, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -803,7 +803,7 @@ char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) { char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { hts_pos_t len64; char *ret = fai_fetchqual64(fai, str, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -876,7 +876,7 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p { hts_pos_t len64; char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -897,7 +897,7 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int { hts_pos_t len64; char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } diff --git a/hfile_internal.h b/hfile_internal.h index 70cc99c57..2e365ae7d 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -90,11 +90,13 @@ struct hFILE_backend { /* May be called by hopen_*() functions to decode a fopen()-style mode into open(2)-style flags. */ +HTSLIB_EXPORT int hfile_oflags(const char *mode); /* Must be called by hopen_*() functions to allocate the hFILE struct and set up its base. Capacity is a suggested buffer size (e.g., via fstat(2)) or 0 for a default-sized buffer. */ +HTSLIB_EXPORT hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity); /* Alternative to hfile_init() for in-memory backends for which the base @@ -107,6 +109,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode, /* May be called by hopen_*() functions to undo the effects of hfile_init() in the event opening the stream subsequently fails. (This is safe to use even if fp is NULL. This takes care to preserve errno.) */ +HTSLIB_EXPORT void hfile_destroy(hFILE *fp); @@ -138,10 +141,13 @@ struct hFILE_scheme_handler { }; /* May be used as an isremote() function in simple cases. */ +HTSLIB_EXPORT extern int hfile_always_local (const char *fname); +HTSLIB_EXPORT extern int hfile_always_remote(const char *fname); /* Should be called by plugins for each URL scheme they wish to handle. */ +HTSLIB_EXPORT void hfile_add_scheme_handler(const char *scheme, const struct hFILE_scheme_handler *handler); diff --git a/hfile_s3.c b/hfile_s3.c index e8e505e2a..ce83875c9 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1,6 +1,6 @@ /* hfile_s3.c -- Amazon S3 backend for low-level file streams. - Copyright (C) 2015-2017, 2019-2021 Genome Research Ltd. + Copyright (C) 2015-2017, 2019-2022 Genome Research Ltd. Author: John Marshall @@ -40,6 +40,7 @@ DEALINGS IN THE SOFTWARE. */ #endif #include "htslib/hts.h" // for hts_version() and hts_verbose #include "htslib/kstring.h" +#include "hts_time_funcs.h" typedef struct s3_auth_data { kstring_t id; @@ -49,6 +50,8 @@ typedef struct s3_auth_data { kstring_t canonical_query_string; kstring_t user_query_string; kstring_t host; + kstring_t profile; + time_t creds_expiry_time; char *bucket; kstring_t auth_hdr; time_t auth_time; @@ -57,11 +60,12 @@ typedef struct s3_auth_data { char date_short[9]; kstring_t date_html; char mode; - char *headers[4]; + char *headers[5]; int refcount; } s3_auth_data; -#define AUTH_LIFETIME 60 +#define AUTH_LIFETIME 60 // Regenerate auth headers if older than this +#define CREDENTIAL_LIFETIME 60 // Seconds before expiry to reread credentials #if defined HAVE_COMMONCRYPTO @@ -235,7 +239,10 @@ static void parse_ini(const char *fname, const char *section, ...) va_start(args, section); while ((akey = va_arg(args, const char *)) != NULL) { kstring_t *avar = va_arg(args, kstring_t *); - if (strcmp(key, akey) == 0) { kputs(value, avar); break; } + if (strcmp(key, akey) == 0) { + avar->l = 0; + kputs(value, avar); + break; } } va_end(args); } @@ -270,17 +277,37 @@ static void parse_simple(const char *fname, kstring_t *id, kstring_t *secret) static int copy_auth_headers(s3_auth_data *ad, char ***hdrs) { char **hdr = &ad->headers[0]; + int idx = 0; *hdrs = hdr; - *hdr = strdup(ad->date); - if (!*hdr) return -1; - hdr++; + + hdr[idx] = strdup(ad->date); + if (!hdr[idx]) return -1; + idx++; + + if (ad->token.l) { + kstring_t token_hdr = KS_INITIALIZE; + kputs("X-Amz-Security-Token: ", &token_hdr); + kputs(ad->token.s, &token_hdr); + if (token_hdr.s) { + hdr[idx++] = token_hdr.s; + } else { + goto fail; + } + } + if (ad->auth_hdr.l) { - *hdr = strdup(ad->auth_hdr.s); - if (!*hdr) { free(ad->headers[0]); return -1; } - hdr++; + hdr[idx] = strdup(ad->auth_hdr.s); + if (!hdr[idx]) goto fail; + idx++; } - *hdr = NULL; + + hdr[idx] = NULL; return 0; + + fail: + for (--idx; idx >= 0; --idx) + free(hdr[idx]); + return -1; } static void free_auth_data(s3_auth_data *ad) { @@ -288,6 +315,7 @@ static void free_auth_data(s3_auth_data *ad) { --ad->refcount; return; } + free(ad->profile.s); free(ad->id.s); free(ad->token.s); free(ad->secret.s); @@ -301,6 +329,67 @@ static void free_auth_data(s3_auth_data *ad) { free(ad); } +static time_t parse_rfc3339_date(kstring_t *datetime) +{ + int offset = 0; + time_t when; + int num; + char should_be_t = '\0', timezone[10] = { '\0' }; + unsigned int year, mon, day, hour, min, sec; + + if (!datetime->s) + return 0; + + // It should be possible to do this with strptime(), but it seems + // to not get on with our feature definitions. + num = sscanf(datetime->s, "%4u-%2u-%2u%c%2u:%2u:%2u%9s", + &year, &mon, &day, &should_be_t, &hour, &min, &sec, timezone); + if (num < 8) + return 0; + if (should_be_t != 'T' && should_be_t != 't' && should_be_t != ' ') + return 0; + struct tm parsed = { sec, min, hour, day, mon - 1, year - 1900, 0, 0, 0 }; + + switch (timezone[0]) { + case 'Z': + case 'z': + case '\0': + break; + case '+': + case '-': { + unsigned hr_off, min_off; + if (sscanf(timezone + 1, "%2u:%2u", &hr_off, &min_off)) { + if (hr_off < 24 && min_off <= 60) { + offset = ((hr_off * 60 + min_off) + * (timezone[0] == '+' ? -60 : 60)); + } + } + break; + } + default: + return 0; + } + + when = hts_time_gm(&parsed); + return when >= 0 ? when + offset : 0; +} + +static void refresh_auth_data(s3_auth_data *ad) { + // Basically a copy of the AWS_SHARED_CREDENTIALS_FILE part of + // setup_auth_data(), but this only reads the authorisation parts. + const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); + kstring_t expiry_time = KS_INITIALIZE; + parse_ini(v? v : "~/.aws/credentials", ad->profile.s, + "aws_access_key_id", &ad->id, + "aws_secret_access_key", &ad->secret, + "aws_session_token", &ad->token, + "expiry_time", &expiry_time); + if (expiry_time.l) { + ad->creds_expiry_time = parse_rfc3339_date(&expiry_time); + } + ks_free(&expiry_time); +} + static int auth_header_callback(void *ctx, char ***hdrs) { s3_auth_data *ad = (s3_auth_data *) ctx; @@ -320,7 +409,10 @@ static int auth_header_callback(void *ctx, char ***hdrs) { return 0; } - if (now - ad->auth_time < AUTH_LIFETIME) { + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } else if (now - ad->auth_time < AUTH_LIFETIME) { // Last auth string should still be valid *hdrs = NULL; return 0; @@ -499,7 +591,6 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, s3_auth_data *ad = calloc(1, sizeof(*ad)); const char *bucket, *path; char *escaped = NULL; - kstring_t profile = { 0, 0, NULL }; size_t url_path_pos; ptrdiff_t bucket_len; int is_https = 1, dns_compliant; @@ -532,7 +623,7 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if (*path == '@') { const char *colon = strpbrk(bucket, ":@"); if (*colon != ':') { - urldecode_kput(bucket, colon - bucket, &profile); + urldecode_kput(bucket, colon - bucket, &ad->profile); } else { const char *colon2 = strpbrk(&colon[1], ":@"); @@ -554,9 +645,9 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if ((v = getenv("AWS_DEFAULT_REGION")) != NULL) kputs(v, &ad->region); if ((v = getenv("HTS_S3_HOST")) != NULL) kputs(v, &ad->host); - if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile); - else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile); - else kputs("default", &profile); + if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &ad->profile); + else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &ad->profile); + else kputs("default", &ad->profile); if ((v = getenv("HTS_S3_ADDRESS_STYLE")) != NULL) { if (strcasecmp(v, "virtual") == 0) { @@ -569,13 +660,15 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if (ad->id.l == 0) { kstring_t url_style = KS_INITIALIZE; + kstring_t expiry_time = KS_INITIALIZE; const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); - parse_ini(v? v : "~/.aws/credentials", profile.s, + parse_ini(v? v : "~/.aws/credentials", ad->profile.s, "aws_access_key_id", &ad->id, "aws_secret_access_key", &ad->secret, "aws_session_token", &ad->token, "region", &ad->region, "addressing_style", &url_style, + "expiry_time", &expiry_time, NULL); if (url_style.l) { @@ -587,14 +680,23 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, address_style = s3_auto; } } + if (expiry_time.l) { + // Not a real part of the AWS configuration file, but it allows + // support for short-term credentials like those for the IAM + // service. The botocore library uses the key "expiry_time" + // internally for this purpose. + // See https://github.com/boto/botocore/blob/develop/botocore/credentials.py + ad->creds_expiry_time = parse_rfc3339_date(&expiry_time); + } ks_free(&url_style); + ks_free(&expiry_time); } if (ad->id.l == 0) { kstring_t url_style = KS_INITIALIZE; const char *v = getenv("HTS_S3_S3CFG"); - parse_ini(v? v : "~/.s3cfg", profile.s, "access_key", &ad->id, + parse_ini(v? v : "~/.s3cfg", ad->profile.s, "access_key", &ad->id, "secret_key", &ad->secret, "access_token", &ad->token, "host_base", &ad->host, "bucket_location", &ad->region, @@ -699,13 +801,11 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, *query_start = 0; } - free(profile.s); free(escaped); return ad; error: - free(profile.s); free(escaped); free_auth_data(ad); return NULL; @@ -713,23 +813,13 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) { - char *header_list[4], **header = header_list; - kstring_t url = { 0, 0, NULL }; - kstring_t token_hdr = { 0, 0, NULL }; s3_auth_data *ad = setup_auth_data(s3url, mode, 2, &url); if (!ad) return NULL; - if (ad->token.l > 0) { - kputs("X-Amz-Security-Token: ", &token_hdr); - kputs(ad->token.s, &token_hdr); - *header++ = token_hdr.s; - } - - *header = NULL; - hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -738,12 +828,10 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) if (!fp) goto fail; free(url.s); - free(token_hdr.s); return fp; fail: free(url.s); - free(token_hdr.s); free_auth_data(ad); return NULL; } @@ -895,9 +983,8 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten } -static int update_time(s3_auth_data *ad) { +static int update_time(s3_auth_data *ad, time_t now) { int ret = -1; - time_t now = time(NULL); #ifdef HAVE_GMTIME_R struct tm tm_buffer; struct tm *tm = gmtime_r(&now, &tm_buffer); @@ -988,6 +1075,7 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co kstring_t *token, int uqs) { s3_auth_data *ad = (s3_auth_data *)auth; char content_hash[HASH_LENGTH_SHA256]; + time_t now; if (request == NULL) { // signal to free auth data @@ -995,9 +1083,15 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co return 0; } - if (update_time(ad)) { + now = time(NULL); + + if (update_time(ad, now)) { return -1; } + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } if (content) { hash_string(content->s, content->l, content_hash); @@ -1045,19 +1139,29 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co static int v4_auth_header_callback(void *ctx, char ***hdrs) { s3_auth_data *ad = (s3_auth_data *) ctx; char content_hash[HASH_LENGTH_SHA256]; - kstring_t content = {0, 0, NULL}; - kstring_t authorisation = {0, 0, NULL}; + kstring_t content = KS_INITIALIZE; + kstring_t authorisation = KS_INITIALIZE; + kstring_t token_hdr = KS_INITIALIZE; char *date_html = NULL; + time_t now; + int idx; if (!hdrs) { // Closing connection free_auth_data(ad); return 0; } - if (update_time(ad)) { + now = time(NULL); + + if (update_time(ad, now)) { return -1; } + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } + if (!ad->id.l || !ad->secret.l) { return copy_auth_headers(ad, hdrs); } @@ -1083,18 +1187,27 @@ static int v4_auth_header_callback(void *ctx, char ***hdrs) { ksprintf(&content, "x-amz-content-sha256: %s", content_hash); date_html = strdup(ad->date_html.s); + if (ad->token.l > 0) { + kputs("X-Amz-Security-Token: ", &token_hdr); + kputs(ad->token.s, &token_hdr); + } + if (content.l == 0 || date_html == NULL) { ksfree(&authorisation); ksfree(&content); + ksfree(&token_hdr); free(date_html); return -1; } *hdrs = &ad->headers[0]; - ad->headers[0] = ks_release(&authorisation); - ad->headers[1] = date_html; - ad->headers[2] = ks_release(&content); - ad->headers[3] = NULL; + idx = 0; + ad->headers[idx++] = ks_release(&authorisation); + ad->headers[idx++] = date_html; + ad->headers[idx++] = ks_release(&content); + if (token_hdr.s) + ad->headers[idx++] = ks_release(&token_hdr); + ad->headers[idx++] = NULL; return 0; } @@ -1167,9 +1280,7 @@ static int http_status_errno(int status) static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { kstring_t url = { 0, 0, NULL }; - kstring_t token_hdr = { 0, 0, NULL }; - char *header_list[4], **header = header_list; s3_auth_data *ad = setup_auth_data(s3url, mode, 4, &url); hFILE *fp = NULL; @@ -1180,14 +1291,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { if (ad->mode == 'r') { long http_response = 0; - if (ad->token.l > 0) { - kputs("x-amz-security-token: ", &token_hdr); - kputs(ad->token.s, &token_hdr); - *header++ = token_hdr.s; - } - - *header = NULL; - fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", v4_auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -1204,7 +1308,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { goto error; } hclose_abruptly(fp); - fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", v4_auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -1237,7 +1341,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { } free(url.s); - free(token_hdr.s); return fp; @@ -1245,7 +1348,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { if (fp) hclose_abruptly(fp); free(url.s); - free(token_hdr.s); free_auth_data(ad); return NULL; diff --git a/hfile_s3_write.c b/hfile_s3_write.c index eec56696b..d54945839 100644 --- a/hfile_s3_write.c +++ b/hfile_s3_write.c @@ -321,7 +321,7 @@ static int complete_upload(hFILE_s3_write *fp, kstring_t *resp) { curl_easy_reset(fp->curl); curl_easy_setopt(fp->curl, CURLOPT_POST, 1L); curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDS, fp->completion_message.s); - curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, fp->completion_message.l); + curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, (long) fp->completion_message.l); curl_easy_setopt(fp->curl, CURLOPT_WRITEFUNCTION, response_callback); curl_easy_setopt(fp->curl, CURLOPT_WRITEDATA, (void *)resp); curl_easy_setopt(fp->curl, CURLOPT_URL, url.s); diff --git a/hts.c b/hts.c index 431d337a8..8b437f2b9 100644 --- a/hts.c +++ b/hts.c @@ -1,6 +1,6 @@ /* hts.c -- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2021 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2022 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. Author: Heng Li @@ -168,7 +168,7 @@ const char *hts_test_feature(unsigned int id) { // to find the configuration parameters. const char *hts_feature_string(void) { static char config[1200]; - const char *fmt= + const char *flags= #ifdef PACKAGE_URL "build=configure " @@ -176,12 +176,6 @@ const char *hts_feature_string(void) { "build=Makefile " #endif -#ifdef ENABLE_PLUGINS - "plugins=yes, plugin-path=%.1000s " -#else - "plugins=no " -#endif - #ifdef HAVE_LIBCURL "libcurl=yes " #else @@ -218,13 +212,21 @@ const char *hts_feature_string(void) { "bzip2=no " #endif - "htscodecs=%.40s"; +// "plugins=" must stay at the end as it is followed by "plugin-path=" +#ifdef ENABLE_PLUGINS + "plugins=yes"; +#else + "plugins=no"; +#endif #ifdef ENABLE_PLUGINS - snprintf(config, sizeof(config), fmt, - hts_plugin_path(), htscodecs_version()); + snprintf(config, sizeof(config), + "%s plugin-path=%.1000s htscodecs=%.40s", + flags, hts_plugin_path(), htscodecs_version()); #else - snprintf(config, sizeof(config), fmt, htscodecs_version()); + snprintf(config, sizeof(config), + "%s htscodecs=%.40s", + flags, htscodecs_version()); #endif return config; } @@ -415,12 +417,17 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim) return 1; } -static int -secondline_is_bases(const unsigned char *u, const unsigned char *ulim) +static int is_fastaq(const unsigned char *u, const unsigned char *ulim) { - // Skip to second line, returning false if there isn't one - u = memchr(u, '\n', ulim - u); - if (u == NULL || ++u == ulim) return 0; + const unsigned char *eol = memchr(u, '\n', ulim - u); + + // Check that the first line is entirely textual + if (! is_text_only(u, eol? eol : ulim)) return 0; + + // If the first line is very long, consider the file to indeed be FASTA/Q + if (eol == NULL) return 1; + + u = eol+1; // Now points to the first character of the second line // Scan over all base-encoding letters (including 'N' but not SEQ's '=') while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) { @@ -676,12 +683,12 @@ int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt) fmt->format = hts_crypt4gh_format; return 0; } - else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) { + else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) { fmt->category = sequence_data; fmt->format = fasta_format; return 0; } - else if (len >= 1 && s[0] == '@' && secondline_is_bases(s, &s[len])) { + else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) { fmt->category = sequence_data; fmt->format = fastq_format; return 0; @@ -1347,6 +1354,8 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode, int ret = -1; if (fn2_len > sizeof(fn_buf)) { + if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning + return -1; fn2 = malloc(fn2_len); if (!fn2) return -1; } @@ -1897,7 +1906,7 @@ int hts_getline(htsFile *fp, int delimiter, kstring_t *str) case no_compression: str->l = 0; ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile); - if (ret >= 0) ret = str->l; + if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX; else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile); else ret = -1; break; @@ -4586,7 +4595,9 @@ hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) hts_idx_t *idx = idx_read(fnidx); if (!idx && !(flags & HTS_IDX_SILENT_FAIL)) - hts_log_error("Could not load local index file '%s'", fnidx); + hts_log_error("Could not load local index file '%s'%s%s", fnidx, + errno ? " : " : "", errno ? strerror(errno) : ""); + free(local_fnidx); diff --git a/hts_expr.c b/hts_expr.c index 599d7a54a..5e5a132ea 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -1,6 +1,6 @@ /* hts_expr.c -- filter expression parsing and processing. - Copyright (C) 2020-2021 Genome Research Ltd. + Copyright (C) 2020-2022 Genome Research Ltd. Author: James Bonfield @@ -23,7 +23,6 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // TODO: -// - add maths functions. pow, sqrt, log, ? // - ?: operator for conditionals? #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h @@ -39,6 +38,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "htslib/hts_expr.h" +#include "htslib/hts_log.h" #include "textutils_internal.h" // Could also cache hts_expr_val_t stack here for kstring reuse? @@ -162,10 +162,52 @@ static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } break; + case 'd': + if (strncmp(str, "default(", 8) == 0) { + if (expression(filt, data, fn, str+8, end, res)) return -1; + if (**end != ',') + return -1; + (*end)++; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + if (expression(filt, data, fn, ws(*end), end, &val)) return -1; + func_ok = 1; + if (!hts_expr_val_existsT(res)) { + kstring_t swap = res->s; + *res = val; + val.s = swap; + hts_expr_val_free(&val); + } + } + break; + + case 'e': + if (strncmp(str, "exists(", 7) == 0) { + if (expression(filt, data, fn, str+7, end, res)) return -1; + func_ok = 1; + res->is_true = res->d = hts_expr_val_existsT(res); + res->is_str = 0; + } else if (strncmp(str, "exp(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + res->d = exp(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); + } + + break; + case 'l': if (strncmp(str, "length(", 7) == 0) { if (expression(filt, data, fn, str+7, end, res)) return -1; func_ok = expr_func_length(res); + } else if (strncmp(str, "log(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + res->d = log(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); } break; @@ -178,6 +220,44 @@ static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, func_ok = expr_func_max(res); } break; + + case 'p': + if (strncmp(str, "pow(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + + if (**end != ',') + return -1; + (*end)++; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + if (expression(filt, data, fn, ws(*end), end, &val)) return -1; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else if (res->is_str || val.is_str) { + hts_expr_val_free(&val); // arith on strings + return -1; + } else { + func_ok = 1; + res->d = pow(res->d, val.d); + hts_expr_val_free(&val); + res->is_str = 0; + } + + if (isnan(res->d)) + hts_expr_val_undef(res); + } + break; + + case 's': + if (strncmp(str, "sqrt(", 5) == 0) { + if (expression(filt, data, fn, str+5, end, res)) return -1; + func_ok = 1; + res->d = sqrt(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); + } + break; } if (func_ok < 0) @@ -285,30 +365,46 @@ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { int err; str = ws(str); - if (*str == '+') { + if (*str == '+' || *str == '-') { err = simple_expr(filt, data, fn, str+1, end, res); - err |= res->is_str; - res->is_true = res->d != 0; - } else if (*str == '-') { - err = simple_expr(filt, data, fn, str+1, end, res); - err |= res->is_str; - res->d = -res->d; - res->is_true = res->d != 0; + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + err |= res->is_str; + if (*str == '-') + res->d = -res->d; + res->is_true = res->d != 0; + } } else if (*str == '!') { err = unary_expr(filt, data, fn, str+1, end, res); - if (res->is_str) { - res->is_str = 0; - res->d = 0; - res->is_true = !res->is_true; + if (res->is_true) { + // Any explicitly true value becomes false + res->d = res->is_true = 0; + } else if (!hts_expr_val_exists(res)) { + // We can also still negate undef values by toggling the + // is_true override value. + res->d = res->is_true = !res->is_true; + } else if (res->is_str) { + // !null = true, !"foo" = false, NOTE: !"" = false also + res->d = res->is_true = (res->s.s == NULL); } else { res->d = !(int64_t)res->d; res->is_true = res->d != 0; } + res->is_str = 0; } else if (*str == '~') { err = unary_expr(filt, data, fn, str+1, end, res); - err |= res->is_str; - res->d = ~(int64_t)res->d; - res->is_true = res->d != 0; + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + err |= res->is_str; + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + res->d = ~(int64_t)res->d; + res->is_true = res->d != 0; + } + } } else { err = simple_expr(filt, data, fn, str, end, res); } @@ -335,7 +431,9 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, str = ws(str); if (*str == '*' || *str == '/' || *str == '%') { if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; - if (val.is_str || res->is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else if (val.is_str || res->is_str) { hts_expr_val_free(&val); return -1; // arith on strings } @@ -345,13 +443,18 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, res->d *= val.d; else if (*str == '/') res->d /= val.d; - else if (*str == '%') - res->d = (int64_t)res->d % (int64_t)val.d; - else + else if (*str == '%') { + if (val.d) + res->d = (int64_t)res->d % (int64_t)val.d; + else + hts_expr_val_undef(res); + } else break; + res->is_true = hts_expr_val_exists(res) && (res->d != 0); str = *end; } + hts_expr_val_free(&val); return 0; @@ -373,9 +476,12 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); + int undef = 0; if (*str == '+' || *str == '-') { if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; - if (val.is_str || res->is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (val.is_str || res->is_str) { hts_expr_val_free(&val); return -1; // arith on strings } @@ -388,8 +494,14 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, else break; + if (undef) + hts_expr_val_undef(res); + else + res->is_true = res->d != 0; + str = *end; } + hts_expr_val_free(&val); return 0; @@ -405,11 +517,14 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (add_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; for (;;) { str = ws(*end); if (*str == '&' && str[1] != '&') { if (add_expr(filt, data, fn, str+1, end, &val)) return -1; - if (res->is_str || val.is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } @@ -419,6 +534,8 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } } hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); return 0; } @@ -433,11 +550,14 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (bitand_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; for (;;) { str = ws(*end); if (*str == '^') { if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; - if (res->is_str || val.is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } @@ -447,6 +567,8 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } } hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); return 0; } @@ -461,11 +583,14 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (bitxor_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; for (;;) { str = ws(*end); if (*str == '|' && str[1] != '|') { if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; - if (res->is_str || val.is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } @@ -475,6 +600,8 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } } hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); return 0; } @@ -493,33 +620,60 @@ static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, str = ws(*end); hts_expr_val_t val = HTS_EXPR_VAL_INIT; - int err = 0; + int err = 0, cmp_done = 0; if (*str == '>' && str[1] == '=') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+2, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? strcmp(res->s.s, val.s.s) >= 0 - : !res->is_str && !val.is_str && res->d >= val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) >= 0 + : !res->is_str && !val.is_str && res->d >= val.d; + res->is_str = 0; + } } else if (*str == '>') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+1, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? strcmp(res->s.s, val.s.s) > 0 - : !res->is_str && !val.is_str && res->d > val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) > 0 + : !res->is_str && !val.is_str && res->d > val.d; + res->is_str = 0; + } } else if (*str == '<' && str[1] == '=') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+2, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? strcmp(res->s.s, val.s.s) <= 0 - : !res->is_str && !val.is_str && res->d <= val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) <= 0 + : !res->is_str && !val.is_str && res->d <= val.d; + res->is_str = 0; + } } else if (*str == '<') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+1, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? strcmp(res->s.s, val.s.s) < 0 - : !res->is_str && !val.is_str && res->d < val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) < 0 + : !res->is_str && !val.is_str && res->d < val.d; + res->is_str = 0; + } } + + if (cmp_done && (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res))) + hts_expr_val_undef(res); hts_expr_val_free(&val); return err ? -1 : 0; @@ -539,34 +693,45 @@ static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, str = ws(*end); - int err = 0; + int err = 0, eq_done = 0; hts_expr_val_t val = HTS_EXPR_VAL_INIT; // numeric vs numeric comparison is as expected // string vs string comparison is as expected // numeric vs string is false if (str[0] == '=' && str[1] == '=') { + eq_done = 1; if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { res->is_true = res->d = 0; } else { - res->is_true = res->d = res->is_str - ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s)==0 : 0) - : !res->is_str && !val.is_str && res->d == val.d; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s)==0 :0) + : !res->is_str && !val.is_str && res->d == val.d; + } } res->is_str = 0; } else if (str[0] == '!' && str[1] == '=') { + eq_done = 1; if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { res->is_true = res->d = 0; } else { - res->is_true = res->d = res->is_str - ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s) != 0 : 1) - : res->is_str != val.is_str || res->d != val.d; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s) != 0 :1) + : res->is_str != val.is_str || res->d != val.d; + } } res->is_str = 0; } else if ((str[0] == '=' && str[1] == '~') || (str[0] == '!' && str[1] == '~')) { + eq_done = 1; err = eq_expr(filt, data, fn, str+2, end, &val); if (!val.is_str || !res->is_str) { hts_expr_val_free(&val); @@ -607,6 +772,9 @@ static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } res->is_str = 0; } + + if (eq_done && ((!hts_expr_val_exists(&val)) || !hts_expr_val_exists(res))) + hts_expr_val_undef(res); hts_expr_val_free(&val); return err ? -1 : 0; @@ -622,26 +790,47 @@ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (eq_expr(filt, data, fn, str, end, res)) return -1; - hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { + hts_expr_val_t val = HTS_EXPR_VAL_INIT; str = ws(*end); if (str[0] == '&' && str[1] == '&') { if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; - res->is_true = res->d = - (res->is_true || (res->is_str && res->s.s) || res->d) && - (val.is_true || (val.is_str && val.s.s) || val.d); - res->is_str = 0; + if (!hts_expr_val_existsT(res) || !hts_expr_val_existsT(&val)) { + hts_expr_val_undef(res); + res->d = 0; + } else { + res->is_true = res->d = + (res->is_true || (res->is_str && res->s.s) || res->d) && + (val.is_true || (val.is_str && val.s.s) || val.d); + res->is_str = 0; + } } else if (str[0] == '|' && str[1] == '|') { if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; - res->is_true = res->d = - res->is_true || (res->is_str && res->s.s) || res->d || - val.is_true || (val.is_str && val.s.s ) || val.d; - res->is_str = 0; + if (!hts_expr_val_existsT(res) && !hts_expr_val_existsT(&val)) { + // neither defined + hts_expr_val_undef(res); + res->d = 0; + } else if (!hts_expr_val_existsT(res) && + !(val.is_true || (val.is_str && val.s.s ) || val.d)) { + // LHS undef and RHS false + hts_expr_val_undef(res); + res->d = 0; + } else if (!hts_expr_val_existsT(&val) && + !(res->is_true || (res->is_str && res->s.s) || res->d)){ + // RHS undef and LHS false + hts_expr_val_undef(res); + res->d = 0; + } else { + res->is_true = res->d = + res->is_true || (res->is_str && res->s.s) || res->d || + val.is_true || (val.is_str && val.s.s ) || val.d; + res->is_str = 0; + } } else { break; } + hts_expr_val_free(&val); } - hts_expr_val_free(&val); return 0; } @@ -677,13 +866,11 @@ void hts_filter_free(hts_filter_t *filt) { free(filt); } -int hts_filter_eval(hts_filter_t *filt, - void *data, hts_expr_sym_func *fn, - hts_expr_val_t *res) { +static int hts_filter_eval_(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { char *end = NULL; - memset(res, 0, sizeof(*res)); - filt->curr_regex = 0; if (expression(filt, data, fn, filt->str, &end, res)) return -1; @@ -694,12 +881,41 @@ int hts_filter_eval(hts_filter_t *filt, } // Strings evaluate to true. An empty string is also true, but an - // absent (null) string is false. An empty string has kstring length - // of zero, but a pointer as it's nul-terminated. - if (res->is_str) - res->is_true = res->d = res->s.s != NULL; - else + // absent (null) string is false, unless overriden by is_true. An + // empty string has kstring length of zero, but a pointer as it's + // nul-terminated. + if (res->is_str) { + res->is_true |= res->s.s != NULL; + res->d = res->is_true; + } else if (hts_expr_val_exists(res)) { res->is_true |= res->d != 0; + } return 0; } + +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { + if (res->s.l != 0 || res->s.m != 0 || res->s.s != NULL) { + // As *res is cleared below, it's not safe to call this function + // with res->s.s set, as memory would be leaked. It's also not + // possible to know is res was initialised correctly, so in + // either case we fail. + hts_log_error("Results structure must be cleared before calling this function"); + return -1; + } + + memset(res, 0, sizeof(*res)); + + return hts_filter_eval_(filt, data, fn, res); +} + +int hts_filter_eval2(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { + ks_free(&res->s); + memset(res, 0, sizeof(*res)); + + return hts_filter_eval_(filt, data, fn, res); +} diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh new file mode 100755 index 000000000..37d6bae7e --- /dev/null +++ b/hts_probe_cc.sh @@ -0,0 +1,114 @@ +#!/bin/sh + +# Check compiler options for non-configure builds and create Makefile fragment +# +# Copyright (C) 2022 Genome Research Ltd. +# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Arguments are: +# 1. C compiler command +# 2. Initial CFLAGS +# 3. LDFLAGS + +CC=$1 +CFLAGS=$2 +LDFLAGS=$3 + +# Try running the compiler. Uses the same contest.* names as +# configure for temporary files. +run_compiler () +{ + "$CC" $CFLAGS $1 $LDFLAGS -o conftest conftest.c 2> conftest.err + retval=$? + rm -f conftest.err conftest + return $retval +} + +echo "# Compiler probe results, generated by $0" + +# Check for sse4.1 etc. support + +rm -f conftest conftest.err conftest.c +cat - <<'EOF' > conftest.c +#include "x86intrin.h" +int main(int argc, char **argv) { + unsigned int i = _mm_popcnt_u32(1); + __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + b = _mm_shuffle_epi8(a, c); + return *((char *) &b); +} +EOF +FLAGS="-mpopcnt -msse4.1 -mssse3" +if run_compiler "$FLAGS" ; then + echo "HTS_CFLAGS_SSE4 = $FLAGS" +fi + +# Check for avx2 + +rm -f conftest.c +cat - <<'EOF' > conftest.c +#include "x86intrin.h" +int main(int argc, char **argv) { + __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + __m256i b = _mm256_add_epi32(a, a); + long long c = _mm256_extract_epi64(b, 0); + return (int) c; +} +EOF +FLAGS="-mavx2" +if run_compiler "$FLAGS" ; then + echo "HTS_CFLAGS_AVX2 = $FLAGS" +fi + +# Check for avx512 + +rm -f conftest.c +cat - <<'EOF' > conftest.c +#include "x86intrin.h" +int main(int argc, char **argv) { + __m512i a = _mm512_set1_epi32(1); + __m512i b = _mm512_add_epi32(a, a); + return *((char *) &b); +} +EOF +FLAGS="-mavx512f" +if run_compiler "$FLAGS" ; then + echo "HTS_CFLAGS_AVX512 = $FLAGS" +fi + +# Check for neon + +rm -f conftest.c +cat - <<'EOF' > conftest.c +#include "arm_neon.h" +int main(int argc, char **argv) { + int32x4_t a = vdupq_n_s32(1); + int32x4_t b = vaddq_s32(a, a); + return *((char *) &b); +} +EOF +if run_compiler "" ; then + echo "HTS_HAVE_NEON = yes" +fi + +rm -f conftest.c diff --git a/hts_time_funcs.h b/hts_time_funcs.h new file mode 100644 index 000000000..2a0508412 --- /dev/null +++ b/hts_time_funcs.h @@ -0,0 +1,170 @@ +/* hts_time_funcs.h -- Implementations of non-standard time functions + + Copyright (C) 2022 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + This mainly exists because timegm() is not a standard function, and so + Cannot be used in portable code. Unfortunately the standard one (mktime) + always takes the local timezone into accout so doing a UTC conversion + with it involves changing the TZ environment variable, which is rather + messy and not likely to go well with threaded code. + + The code here is a much simplified version of the BSD timegm() implementation. + It currently rejects dates before 1970, avoiding problems with -ve time_t. + It also works strictly in UTC, so doesn't have to worry about tm_isdst + which makes the calculation much easier. + + Some of this is derived from BSD sources, for example + https://github.com/NetBSD/src/blob/trunk/lib/libc/time/localtime.c + which state: + + ** This file is in the public domain, so clarified as of + ** 1996-06-05 by Arthur David Olson. + + Non-derived code is copyright as above. +*/ + +#include +#include +#include +#include + +static inline int hts_time_normalise(int *tens, int *units, int base) { + if (*units < 0 || *units >= base) { + int delta = *units >= 0 ? *units / base : (-1 - (-1 - *units) / base); + int64_t tmp = (int64_t) (*tens) + delta; + if (tmp < INT_MIN || tmp > INT_MAX) return 1; + *tens = tmp; + *units -= delta * base; + } + return 0; +} + +static inline int hts_year_is_leap(int64_t year) { + return ((year % 4 == 0) && (year % 100 != 0)) || (year % 400 == 0); +} + +// Number of leap years to start of year +// Only works for year >= 1. +static inline int64_t hts_leaps_to_year_start(int64_t year) { + --year; + return year / 4 - year / 100 + year / 400; +} + +static inline int hts_time_normalise_tm(struct tm *t) +{ + const int days_per_mon[2][12] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } + }; + const int year_days[2] = { 365, 366 }; + int overflow = 0; + int64_t year; + + if (t->tm_sec > 62) { + overflow |= hts_time_normalise(&t->tm_min, &t->tm_sec, 60); + } + overflow |= hts_time_normalise(&t->tm_hour, &t->tm_min, 60); + overflow |= hts_time_normalise(&t->tm_mday, &t->tm_hour, 24); + overflow |= hts_time_normalise(&t->tm_year, &t->tm_mon, 12); + if (overflow) + return 1; + + year = (int64_t) t->tm_year + 1900LL; + while (t->tm_mday <= 0) { + --year; + t->tm_mday += year_days[hts_year_is_leap(year + (1 < t->tm_mon))]; + } + while (t->tm_mday > 366) { + t->tm_mday -= year_days[hts_year_is_leap(year + (1 < t->tm_mon))]; + ++year; + } + for (;;) { + int mdays = days_per_mon[hts_year_is_leap(year)][t->tm_mon]; + if (t->tm_mday <= mdays) + break; + t->tm_mday -= mdays; + t->tm_mon++; + if (t->tm_mon >= 12) { + year++; + t->tm_mon = 0; + } + } + year -= 1900; + if (year != t->tm_year) { + if (year < INT_MIN || year > INT_MAX) + return 1; + t->tm_year = year; + } + return 0; +} + +/** + * Convert broken-down time to an equivalent time_t value + * @param target Target broken-down time structure + * @return Equivalent time_t value on success; -1 on failure + * + * This function first normalises the time in @p target so that the + * structure members are in the valid range. It then calculates the + * number of seconds (ignoring leap seconds) between midnight Jan 1st 1970 + * and the target date. + * + * If @p target is outside the range that can be represented in a time_t, + * or tm_year is less than 70 (which would return a negative value) then + * it returns -1 and sets errno to EOVERFLOW. + */ + +static inline time_t hts_time_gm(struct tm *target) +{ + int month_start[2][12] = { + { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 }, + { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335 } + }; + int years_from_epoch, leaps, days; + int64_t secs; + + if (hts_time_normalise_tm(target) != 0) + goto overflow; + + if (target->tm_year < 70) + goto overflow; + + years_from_epoch = target->tm_year - 70; + leaps = (hts_leaps_to_year_start(target->tm_year + 1900) + - hts_leaps_to_year_start(1970)); + days = ((365 * (years_from_epoch - leaps) + 366 * leaps) + + month_start[hts_year_is_leap(target->tm_year + 1900)][target->tm_mon] + + target->tm_mday - 1); + secs = ((int64_t) days * 86400LL + + target->tm_hour * 3600 + + target->tm_min * 60 + + target->tm_sec); + if (sizeof(time_t) < 8 && secs > INT_MAX) + goto overflow; + + return (time_t) secs; + + overflow: + errno = EOVERFLOW; + return (time_t) -1; +} diff --git a/htscodecs b/htscodecs index 1395d7306..3ef17f6fb 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 1395d730651fdfa39cd916be3b3ef4dd9b1ab895 +Subproject commit 3ef17f6fb5b8b6b0ad2d4c1c562165664f0703f8 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 7242e210b..91a9c39e9 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -1,6 +1,6 @@ # Makefile fragment to add settings needed when bundling htscodecs functions # -# Copyright (C) 2021 Genome Research Ltd. +# Copyright (C) 2021-2022 Genome Research Ltd. # # Author: Rob Davies # @@ -28,9 +28,16 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ + $(if $(HTS_CFLAGS_AVX2),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c) \ + $(if $(HTS_CFLAGS_AVX512),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c) \ + $(if $(HTS_CFLAGS_SSE4),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c) \ + $(if $(HTS_HAVE_NEON),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c) \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ $(HTSPREFIX)htscodecs/htscodecs/rle.c \ - $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c + $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c \ + $(HTSPREFIX)htscodecs/htscodecs/utils.c + HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) @@ -49,8 +56,11 @@ htscodecs_varint_h = htscodecs/htscodecs/varint.h htscodecs_htscodecs_endian_h = htscodecs/htscodecs/htscodecs_endian.h htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) +htscodecs_permute_h = htscodecs/htscodecs/permute.h htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_rANS_static16_int_h = htscodecs/htscodecs/rANS_static16_int.h $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs_rANS_static32x16pr_h = htscodecs/htscodecs/rANS_static32x16pr.h htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h $(htscodecs_htscodecs_endian_h) htscodecs_utils_h = htscodecs/htscodecs/utils.h htscodecs_version_h = htscodecs/htscodecs/version.h diff --git a/htsfile.1 b/htsfile.1 index 37d783a6d..cf87bef3d 100644 --- a/htsfile.1 +++ b/htsfile.1 @@ -1,4 +1,4 @@ -.TH htsfile 1 "7 April 2022" "htslib-1.15.1" "Bioinformatics tools" +.TH htsfile 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools" .SH NAME htsfile \- identify high-throughput sequencing data files .\" diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 279661053..9f77cb585 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -1,8 +1,8 @@ -.TH htslib-s3-plugin 7 "7 April 2022" "htslib-1.15.1" "Bioinformatics tools" +.TH htslib-s3-plugin 7 "18 August 2022" "htslib-1.16" "Bioinformatics tools" .SH NAME s3 plugin \- htslib AWS S3 plugin .\" -.\" Copyright (C) 2021 Genome Research Ltd. +.\" Copyright (C) 2021-2022 Genome Research Ltd. .\" .\" Author: Andrew Whitwham .\" @@ -24,6 +24,21 @@ s3 plugin \- htslib AWS S3 plugin .\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER .\" DEALINGS IN THE SOFTWARE. .\" +. +.\" For code blocks and examples (cf groff's Ultrix-specific man macros) +.de EX + +. in +\\$1 +. nf +. ft CR +.. +.de EE +. ft +. fi +. in + +.. + .SH DESCRIPTION The S3 plugin allows htslib file functions to communicate with servers that use the AWS S3 protocol. Files are identified by their bucket and object key in a @@ -114,14 +129,73 @@ files will be used. The default file locations are either \fI~/.aws/credentials\fR or \fI~/.s3cfg\fR (in that order). Entries used in aws style credentials file are aws_access_key_id, -aws_secret_access_key, aws_session_token, region and addressing_style. Only the -first two are usually needed. +aws_secret_access_key, aws_session_token, region, addressing_style and +expiry_time (unofficial, see SHORT-LIVED CREDENTIALS below). +Only the first two are usually needed. Entries used in s3cmd style config files are access_key, secret_key, access_token, host_base, bucket_location and host_bucket. Again only the first two are usually needed. The host_bucket option is only used to set a path-style URL, see below. +.SH SHORT-LIVED CREDENTIALS + +Some cloud identity and access management (IAM) systems can make short-lived +credentials that allow access to resources. +These credentials will expire after a time and need to be renewed to +give continued access. +To enable this, the S3 plugin allows an \fIexpiry_time\fR entry to be set in the +\fI.aws/credentials\fR file. +The value for this entry should be the time when the token expires, +following the format in RFC3339 section 5.6, which takes the form: + + 2012-04-29T05:20:48Z + +That is, year - month - day, the letter "T", hour : minute : second. +The time can be followed by the letter "Z", indicating the UTC timezone, +or an offset from UTC which is a "+" or "-" sign followed by two digits for +the hours offset, ":", and two digits for the minutes. + +The S3 plugin will attempt to re-read the credentials file up to 1 minute +before the given expiry time, which means the file needs to be updated with +new credentials before then. +As the exact way of doing this can vary between services and IAM providers, +the S3 plugin expects this to be done by an external user-supplied process. +This may be achieved by running a program that replaces the file as new +credentials become available. +The following script shows how it might be done for AWS instance credentials: +.EX 2 +#!/bin/sh +instance='http://169.254.169.254' +tok_url="$instance/latest/api/token" +ttl_hdr='X-aws-ec2-metadata-token-ttl-seconds: 10' +creds_url="$instance/latest/meta-data/iam/security-credentials" +key1='aws_access_key_id = \(rs(.AccessKeyId)\(rsn' +key2='aws_secret_access_key = \(rs(.SecretAccessKey)\(rsn' +key3='aws_session_token = \(rs(.Token)\(rsn' +key4='expiry_time = \(rs(.Expiration)\(rsn' +while true; do + token=`curl -X PUT -H "$ttl_hdr" "$tok_url"` + tok_hdr="X-aws-ec2-metadata-token: $token" + role=`curl -H "$tok_hdr" "$creds_url/"` + expires='now' + ( curl -H "$tok_hdr" "$creds_url/$role" \(rs + | jq -r "\(rs"${key1}${key2}${key3}${key4}\(rs"" > credentials.new ) \(rs + && mv -f credentials.new credentials \(rs + && expires=`grep expiry_time credentials | cut -d ' ' -f 3-` + if test $? -ne 0 ; then break ; fi + expiry=`date -d "$expires - 3 minutes" '+%s'` + now=`date '+%s'` + test "$expiry" -gt "$now" && sleep $((($expiry - $now) / 2)) + sleep 30 +done +.EE + +Note that the \fIexpiry_time\fR key is currently only supported for the +\fI.aws/credentials\fR file (or the file referred to in the +.B AWS_SHARED_CREDENTIALS_FILE +environment variable). + .SH NOTES In most cases this plugin transforms the given URL into a virtual host-style format e.g. \fIhttps://bucket.host/path/to/file\fR. A path-style format is used @@ -136,4 +210,6 @@ host_bucket must \fBnot\fR include the \fB%(bucket).s\fR string. .BR htsfile (1) .BR samtools (1) .PP +RFC 3339: +.PP htslib website: diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 24d787bdf..c4ba85679 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -3,7 +3,7 @@ /* Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013, 2014, 2017, 2018-2019 Genome Research Ltd + Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/htslib/cram.h b/htslib/cram.h index dab666345..8dc6fe1b3 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2020 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd. Author: James Bonfield @@ -247,6 +247,46 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); +/* + *----------------------------------------------------------------------------- + * cram slice interrogation + */ + +/* + * Returns the number of cram blocks within this slice. + */ +HTSLIB_EXPORT +int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr); + +/* + * Returns the block content_id for the block containing an embedded reference + * sequence. If none is present, -1 is returned. + */ +HTSLIB_EXPORT +int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h); + +/* + * Returns slice reference ID, start and span (length) coordinates. + * Return parameters may be NULL in which case they are ignored. + */ +HTSLIB_EXPORT +void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, + int *refid, hts_pos_t *start, hts_pos_t *span); + +/* + * Decodes a slice header from a cram block. + * Returns the opaque cram_block_slice_hdr pointer on success, + * NULL on failure. + */ +HTSLIB_EXPORT +cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b); + +/* + * Frees a cram_block_slice_hdr structure. + */ +HTSLIB_EXPORT +void cram_free_slice_header(cram_block_slice_hdr *hdr); + /* *----------------------------------------------------------------------------- * cram_io basics diff --git a/htslib/hfile.h b/htslib/hfile.h index 038591cbc..6e3a2a22a 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -1,7 +1,7 @@ /// @file htslib/hfile.h /// Buffered low-level input/output streams. /* - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: John Marshall @@ -158,6 +158,7 @@ static inline off_t htell(hFILE *fp) */ static inline int hgetc(hFILE *fp) { + HTSLIB_EXPORT extern int hgetc2(hFILE *); return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp); } @@ -229,6 +230,7 @@ or I/O errors. static inline ssize_t HTS_RESULT_USED hread(hFILE *fp, void *buffer, size_t nbytes) { + HTSLIB_EXPORT extern ssize_t hread2(hFILE *, void *, size_t, size_t); size_t n = fp->end - fp->begin; @@ -243,6 +245,7 @@ hread(hFILE *fp, void *buffer, size_t nbytes) */ static inline int hputc(int c, hFILE *fp) { + HTSLIB_EXPORT extern int hputc2(int, hFILE *); if (fp->begin < fp->limit) *(fp->begin++) = c; else c = hputc2(c, fp); @@ -254,6 +257,7 @@ static inline int hputc(int c, hFILE *fp) */ static inline int hputs(const char *text, hFILE *fp) { + HTSLIB_EXPORT extern int hputs2(const char *, size_t, size_t, hFILE *); size_t nbytes = strlen(text), n = fp->limit - fp->begin; @@ -271,7 +275,9 @@ In the absence of I/O errors, the full _nbytes_ will be written. static inline ssize_t HTS_RESULT_USED hwrite(hFILE *fp, const void *buffer, size_t nbytes) { + HTSLIB_EXPORT extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t); + HTSLIB_EXPORT extern int hfile_set_blksize(hFILE *fp, size_t bufsiz); if (!fp->mobile) { diff --git a/htslib/hts.h b/htslib/hts.h index 89ff7ea8f..38246cd9c 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1,7 +1,7 @@ /// @file htslib/hts.h /// Format-neutral I/O, indexing, and iterator API functions. /* - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Copyright (C) 2010, 2012 Broad Institute. Portions copyright (C) 2003-2006, 2008-2010 by Heng Li @@ -456,16 +456,19 @@ The input character may be either an IUPAC ambiguity code, '=' for 0, or '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 for A/C/G/T or combinations of these bits for ambiguous bases. */ +HTSLIB_EXPORT extern const unsigned char seq_nt16_table[256]; /*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC ambiguity code letter (or '=' when given 0). */ +HTSLIB_EXPORT extern const char seq_nt16_str[]; /*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). */ +HTSLIB_EXPORT extern const int seq_nt16_int[]; /*! @@ -486,7 +489,7 @@ const char *hts_version(void); // Immediately after release, bump ZZ to 90 to distinguish in-development // Git repository builds from the release; you may wish to increment this // further when significant features are merged. -#define HTS_VERSION 101501 +#define HTS_VERSION 101600 /*! @abstract Introspection on the features enabled in htslib * @@ -673,7 +676,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...); @param fp The file handle @param delimiter Unused, but must be '\n' (or KS_SEP_LINE) @param str The line (not including the terminator) is written here - @return Length of the string read; + @return Length of the string read (capped at INT_MAX); -1 on end-of-file; <= -2 on error */ HTSLIB_EXPORT diff --git a/htslib/hts_endian.h b/htslib/hts_endian.h index 790d2d5c6..30ad8055d 100644 --- a/htslib/hts_endian.h +++ b/htslib/hts_endian.h @@ -113,6 +113,14 @@ typedef uint64_t uint64_u; # endif #endif +/// Get a uint8_t value from an unsigned byte array +/** @param buf Pointer to source byte, may be unaligned + * @return An 8-bit unsigned integer + */ +static inline uint8_t le_to_u8(const uint8_t *buf) { + return *buf; +} + /// Get a uint16_t value from an unsigned byte array /** @param buf Pointer to source byte, may be unaligned * @return A 16 bit unsigned integer diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h index d66a8edd8..43da89d6a 100644 --- a/htslib/hts_expr.h +++ b/htslib/hts_expr.h @@ -1,6 +1,6 @@ /* expr.c -- filter expression parsing and processing. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020, 2022 Genome Research Ltd. Author: James Bonfield @@ -25,18 +25,30 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTS_EXPR_H #define HTS_EXPR_H +#include #include "kstring.h" #include "hts_defs.h" /// Holds a filter variable. This is also used to return the results. /** - * Note we cope with zero-but-true in order to implement a basic - * "exists(something)" check where "something" may even be zero. + * The expression language has 3-states of string, numeric, and unknown. + * The unknown state is either a NaN numeric or a null string, with both + * internally considered to have the same "unknown" meaning. * - * Eg in the aux tag searching syntax, "[NM]" should return true if - * NM tag exists even if zero. - * Take care when negating this. "[NM] != 0" will be true when - * [NM] is absent, thus consider "[NM] && [NM] != 0". + * These largely match the IEE 754 semantics for NaN comparisons: <, >, ==, + * != all fail, (even NaN == NaN). Similarly arithmetic (+,-,/,*,%) with + * unknown values are still unknown (and false). + * + * The departure from NaN semantics though is that our unknown/null state is + * considered to be false while NaN in C is true. Similarly the false nature + * of our unknown state meants !val becomes true, !!val is once again false, + * val && 1 is false, val || 0 is false, and val || 1 is true along with + * !val || 0 and !val && 1. + * + * Note it is possible for empty strings and zero numbers to also be true. + * An example of this is the aux string '[NM]' which returns true if the + * NM tag is found, regardless of whether it is also zero. However the + * better approach added in 1.16 is 'exists([NM])'. */ typedef struct hts_expr_val_t { char is_str; // Use .s vs .d @@ -45,6 +57,29 @@ typedef struct hts_expr_val_t { double d; // otherwise this } hts_expr_val_t; +/// Returns true if an hts_expr_val_t is defined. +/* An example usage of this is in the SAM expression filter where an + * [X0] aux tag will be the value of X0 (string or numeric) if set, or + * a false nul-string (not the same as an empty one) when not set. + */ +static inline int hts_expr_val_exists(hts_expr_val_t *v) { + return v && !(v->is_str == 1 && v->s.s == NULL) + && !(v->is_str == 0 && isnan(v->d)); +} + +/// Returns true if an hts_expr_val_t is defined or is undef-but-true +static inline int hts_expr_val_existsT(hts_expr_val_t *v) { + return (v && v->is_true) || hts_expr_val_exists(v); +} + +/// Set a value to be undefined (nan). +static inline void hts_expr_val_undef(hts_expr_val_t *v) { + ks_clear(&v->s); + v->is_true = 0; + v->is_str = 0; + v->d = NAN; +} + /// Frees a hts_expr_val_t type. static inline void hts_expr_val_free(hts_expr_val_t *f) { ks_free(&f->s); @@ -87,11 +122,31 @@ typedef int (hts_expr_sym_func)(void *data, char *str, char **end, * the is_str member. It can also be explicitly defined to be true even * for a null value. This may be used to check for the existence of * something, irrespective of whether that something evaluates to zero. + * + * @p res must be initialized using HTS_EXPR_VAL_INIT before passing it + * to this function for the first time. + */ +HTSLIB_EXPORT +int hts_filter_eval2(hts_filter_t *filt, + void *data, hts_expr_sym_func *sym_func, + hts_expr_val_t *res); + +/// Evaluate a filter expression (derecated API) +/** + * @copydetails hts_filter_eval2() + * + * If calling this function more than once with the same @p res + * parameter, hts_expr_val_free(res) must be used between invocations + * to clear any allocated memory prior to reuse. + * + * @deprecated This function has been replaced by hts_filter_eval2(), + * which clears @p res properly itself. */ HTSLIB_EXPORT int hts_filter_eval(hts_filter_t *filt, void *data, hts_expr_sym_func *sym_func, - hts_expr_val_t *res); + hts_expr_val_t *res) + HTS_DEPRECATED("Please use hts_filter_eval2 instead"); #endif /* HTS_EXPR_H */ diff --git a/htslib/hts_log.h b/htslib/hts_log.h index b2336a4df..f6a50b333 100644 --- a/htslib/hts_log.h +++ b/htslib/hts_log.h @@ -58,6 +58,7 @@ enum htsLogLevel hts_get_log_level(void); * One of the HTS_LOG_* values. The default is HTS_LOG_WARNING. * \note Avoid direct use of this variable. Use hts_set_log_level and hts_get_log_level instead. */ +HTSLIB_EXPORT extern int hts_verbose; /*! Logs an event. diff --git a/htslib/knetfile.h b/htslib/knetfile.h index cfddd6b67..0f2adec83 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2012, 2014, 2021 Genome Research Ltd (GRL). + Copyright (c) 2008, 2012, 2014, 2021-2022 Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining diff --git a/htslib/ksort.h b/htslib/ksort.h index 755010951..7857d4c77 100644 --- a/htslib/ksort.h +++ b/htslib/ksort.h @@ -64,6 +64,7 @@ #include #include +#include "hts_defs.h" #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) @@ -81,6 +82,7 @@ extern "C" { // problems on Windows. Don't include htslib/hts_os.h for this as it // may not get on with older attempts to fix this in code that includes // this file. +HTSLIB_EXPORT extern double hts_drand48(void); typedef struct { @@ -88,7 +90,7 @@ typedef struct { int depth; } ks_isort_stack_t; -#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } +#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; } #define KSORT_INIT(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, , type_t, __sort_lt) #define KSORT_INIT_STATIC(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, static klib_unused, type_t, __sort_lt) diff --git a/htslib/kstring.h b/htslib/kstring.h index 09bc9e3d9..53a19806d 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2014, 2016, 2018-2020 Genome Research Ltd. + Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/htslib/sam.h b/htslib/sam.h index ea585a45a..5f8c0a554 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2021 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2022 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -118,6 +118,7 @@ typedef sam_hdr_t bam_hdr_t; Result is operator code or -1. Be sure to cast the index if it is a plain char: int op = bam_cigar_table[(unsigned char) ch]; */ +HTSLIB_EXPORT extern const int8_t bam_cigar_table[256]; #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) @@ -1524,7 +1525,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, ++s; } else if (type == 'B') { uint8_t sub_type = *(s++); - int sub_type_size; + unsigned sub_type_size; // or externalise sam.c's aux_type2size function? switch (sub_type) { @@ -1547,7 +1548,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, goto bad_aux; n = le_to_u32(s); s += 4; // now points to the start of the array - if ((end - s) / sub_type_size < n) + if ((size_t)(end - s) / sub_type_size < n) goto bad_aux; r |= kputsn_("B:", 2, ks) < 0; r |= kputc(sub_type, ks) < 0; // write the type @@ -2271,6 +2272,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, hts_base_mod *mods, int n_mods); +/// Returns data about a specific modification type for the alignment record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param code Modification code. If positive this is a character code, + * if negative it is a -ChEBI code. + * + * @param strand Boolean for top (0) or bottom (1) strand + * @param implicit Boolean for whether unlisted positions should be + * implicitly assumed to be unmodified, or require an + * explicit score and should be considered as unknown. + * Returned. + * @param canonical Canonical base type associated with this modification + * Returned. + * + * @return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +HTSLIB_EXPORT +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical); + +/// Returns the list of base modification codes provided for this +/// alignment record as an array of character codes (+ve) or ChEBI numbers +/// (negative). +/* + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param ntype Filled out with the number of array elements returned + * + * @return the type array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). + */ +HTSLIB_EXPORT +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype); + #ifdef __cplusplus } #endif diff --git a/htslib/tbx.h b/htslib/tbx.h index 9b9e111b9..3d2037cbb 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -52,6 +52,7 @@ typedef struct tbx_t { void *dict; } tbx_t; +HTSLIB_EXPORT extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf; #define tbx_itr_destroy(iter) hts_itr_destroy(iter) diff --git a/htslib/vcf.h b/htslib/vcf.h index 7a001aca6..c94bea589 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -123,6 +123,7 @@ typedef struct bcf_hdr_t { int32_t m[3]; // m: allocated size of the dictionary block in use (see n above) } bcf_hdr_t; +HTSLIB_EXPORT extern uint8_t bcf_type_shift[]; /************** @@ -137,13 +138,16 @@ extern uint8_t bcf_type_shift[]; #define BCF_BT_FLOAT 5 #define BCF_BT_CHAR 7 -#define VCF_REF 0 -#define VCF_SNP 1 -#define VCF_MNP 2 -#define VCF_INDEL 4 -#define VCF_OTHER 8 -#define VCF_BND 16 // breakend -#define VCF_OVERLAP 32 // overlapping deletion, ALT=* +#define VCF_REF 0 +#define VCF_SNP (1<<0) +#define VCF_MNP (1<<1) +#define VCF_INDEL (1<<2) +#define VCF_OTHER (1<<3) +#define VCF_BND (1<<4) // breakend +#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_INS (1<<6) // implies VCF_INDEL +#define VCF_DEL (1<<7) // implies VCF_INDEL +#define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for deletions @@ -749,15 +753,102 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); + /// Get variant types in a BCF record /** - * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc + * @param rec BCF/VCF record + * @return Types of variant present + * + * The return value will be a bitwise-or of VCF_SNP, VCF_MNP, + * VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP. If will return + * VCF_REF (i.e. 0) if none of the other types is present. + * @deprecated Please use bcf_has_variant_types() instead */ HTSLIB_EXPORT int bcf_get_variant_types(bcf1_t *rec); + /// Get variant type in a BCF record, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele to check + * @return Type of variant present + * + * The return value will be one of VCF_REF, VCF_SNP, VCF_MNP, + * VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP. + * @deprecated Please use bcf_has_variant_type() instead + */ HTSLIB_EXPORT int bcf_get_variant_type(bcf1_t *rec, int ith_allele); + /// Match mode for bcf_has_variant_types() + enum bcf_variant_match { + bcf_match_exact, ///< Types present exactly match tested for + bcf_match_overlap, ///< At least one variant type in common + bcf_match_subset, ///< Test set is a subset of types present + }; + + /// Check for presence of variant types in a BCF record + /** + * @param rec BCF/VCF record + * @param bitmask Set of variant types to test for + * @param mode Match mode + * @return >0 if the variant types are present, + * 0 if not present, + * -1 on error + * + * @p bitmask should be the bitwise-or of the variant types (VCF_SNP, + * VCF_MNP, etc.) to test for. + * + * The return value is the bitwise-and of the set of types present + * and @p bitmask. Callers that want to check for the presence of more + * than one type can avoid function call overhead by passing all the + * types to be checked for in a single call to this function, in + * bcf_match_overlap mode, and then check for them individually in the + * returned value. + * + * As VCF_REF is represented by 0 (i.e. the absence of other variants) + * it should be tested for using + * bcf_has_variant_types(rec, VCF_REF, bcf_match_exact) + * which will return 1 if no other variant type is present, otherwise 0. + */ + HTSLIB_EXPORT + int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, enum bcf_variant_match mode); + + /// Check for presence of variant types in a BCF record, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele to check + * @param bitmask Set of variant types to test for + * @return >0 if one of the variant types is present, + * 0 if not present, + * -1 on error + * + * @p bitmask should be the bitwise-or of the variant types (VCF_SNP, + * VCF_MNP, etc.) to test for, or VCF_REF on its own. + * + * The return value is the bitwise-and of the set of types present + * and @p bitmask. Callers that want to check for the presence of more + * than one type can avoid function call overhead by passing all the + * types to be checked for in a single call to this function, and then + * check for them individually in the returned value. + * + * As a special case, if @p bitmask is VCF_REF (i.e. 0), the function + * tests for an exact match. The return value will be 1 if the + * variant type calculated for the allele is VCF_REF, otherwise if + * any other type is present it will be 0. + */ + HTSLIB_EXPORT + int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask); + + /// Return the number of bases affected by a variant, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele index + * @return The number of bases affected (negative for deletions), + * or bcf_int32_missing on error. + */ + HTSLIB_EXPORT + int bcf_variant_length(bcf1_t *rec, int ith_allele); + HTSLIB_EXPORT int bcf_is_snp(bcf1_t *v); @@ -1341,7 +1432,9 @@ which works for both BCF and VCF. #define BCF_MIN_BT_INT16 (-32760) /* INT16_MIN + 8 */ #define BCF_MIN_BT_INT32 (-2147483640) /* INT32_MIN + 8 */ +HTSLIB_EXPORT extern uint32_t bcf_float_vector_end; +HTSLIB_EXPORT extern uint32_t bcf_float_missing; static inline void bcf_float_set(float *ptr, uint32_t value) { @@ -1367,21 +1460,23 @@ static inline int bcf_float_is_vector_end(float f) static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { uint32_t e = 0; - #define BRANCH(type_t, missing, vector_end) { \ - type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \ + #define BRANCH(type_t, convert, missing, vector_end) { \ + uint8_t *ptr = fmt->p + isample*fmt->size; \ int i; \ - for (i=0; in && ptr[i]!=vector_end; i++) \ + for (i=0; in; i++, ptr += sizeof(type_t)) \ { \ - if ( i ) e |= kputc("/|"[ptr[i]&1], str) < 0; \ - if ( !(ptr[i]>>1) ) e |= kputc('.', str) < 0; \ - else e |= kputw((ptr[i]>>1) - 1, str) < 0; \ + type_t val = convert(ptr); \ + if ( val == vector_end ) break; \ + if ( i ) e |= kputc("/|"[val&1], str) < 0; \ + if ( !(val>>1) ) e |= kputc('.', str) < 0; \ + else e |= kputw((val>>1) - 1, str) < 0; \ } \ if (i == 0) e |= kputc('.', str) < 0; \ } switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; case BCF_BT_NULL: e |= kputc('.', str) < 0; break; default: hts_log_error("Unexpected type %d", fmt->type); return -2; } diff --git a/htslib_vars.mk b/htslib_vars.mk index 1f4c0905a..6af71863c 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -42,7 +42,7 @@ htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h) htslib_klist_h = $(HTSPREFIX)htslib/klist.h htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h -htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h +htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_defs_h) htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h $(htslib_hts_defs_h) $(htslib_kroundup_h) htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h) htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) $(htslib_hts_endian_h) diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4 new file mode 100644 index 000000000..16bb46495 --- /dev/null +++ b/m4/ax_check_compile_flag.m4 @@ -0,0 +1,53 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) +# +# DESCRIPTION +# +# Check whether the given FLAG works with the current language's compiler +# or gives an error. (Warnings, however, are ignored) +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. +# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. +# +# INPUT gives an alternative input source to AC_COMPILE_IFELSE. +# +# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this +# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim +# Copyright (c) 2011 Maarten Bosmans +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 6 + +AC_DEFUN([AX_CHECK_COMPILE_FLAG], +[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF +AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl +AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ + ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" + AC_LINK_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[yes])], + [AS_VAR_SET(CACHEVAR,[no])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) +AS_VAR_IF(CACHEVAR,yes, + [m4_default([$2], :)], + [m4_default([$3], :)]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl AX_CHECK_COMPILE_FLAGS diff --git a/sam.c b/sam.c index 0ce3a42cb..c95d1c693 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2021 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -339,17 +339,23 @@ int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) if (h->hrecs) { if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; - if (hdr_ks.l > INT32_MAX) { + if (hdr_ks.l > UINT32_MAX) { hts_log_error("Header too long for BAM format"); free(hdr_ks.s); return -1; + } else if (hdr_ks.l > INT32_MAX) { + hts_log_warning("Header too long for BAM specification (>2GB)"); + hts_log_warning("Output file may not be portable"); } text = hdr_ks.s; l_text = hdr_ks.l; } else { - if (h->l_text > INT32_MAX) { + if (h->l_text > UINT32_MAX) { hts_log_error("Header too long for BAM format"); return -1; + } else if (h->l_text > INT32_MAX) { + hts_log_warning("Header too long for BAM specification (>2GB)"); + hts_log_warning("Output file may not be portable"); } text = h->text; l_text = h->l_text; @@ -1348,6 +1354,33 @@ static int bam_sym_lookup(void *data, char *str, char **end, res->s.l = b->core.l_qseq; res->is_str = 1; return 0; + } else if (memcmp(str, "sclen", 5) == 0) { + int sclen = 0; + uint32_t *cigar = bam_get_cigar(b); + int ncigar = b->core.n_cigar; + int left = 0; + + // left + if (ncigar > 0 + && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) + left = 0, sclen += bam_cigar_oplen(cigar[0]); + else if (ncigar > 1 + && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP + && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) + left = 1, sclen += bam_cigar_oplen(cigar[1]); + + // right + if (ncigar-1 > left + && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) + sclen += bam_cigar_oplen(cigar[ncigar-1]); + else if (ncigar-2 > left + && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP + && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) + sclen += bam_cigar_oplen(cigar[ncigar-2]); + + *end = str+5; + res->d = sclen; + return 0; } break; @@ -1421,8 +1454,8 @@ static int bam_sym_lookup(void *data, char *str, char **end, int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) { hb_pair hb = {h, b}; - hts_expr_val_t res; - if (hts_filter_eval(filt, &hb, bam_sym_lookup, &res)) { + hts_expr_val_t res = HTS_EXPR_VAL_INIT; + if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { hts_log_error("Couldn't process filter expression"); hts_expr_val_free(&res); return -1; @@ -3448,6 +3481,8 @@ static void *sam_dispatcher_write(void *vp) { i++; if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0) + goto err; if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j) goto err; } else { @@ -3487,8 +3522,69 @@ static void *sam_dispatcher_write(void *vp) { pthread_mutex_unlock(&fd->lines_m); } else { if (fp->is_bgzf) { - if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size) - goto err; + // We keep track of how much in the current block we have + // remaining => R. We look for the last newline in input + // [i] to [i+R], backwards => position N. + // + // If we find a newline, we write out bytes i to N. + // We know we cannot fit the next record in this bgzf block, + // so we flush what we have and copy input N to i+R into + // the start of a new block, and recompute a new R for that. + // + // If we don't find a newline (i==N) then we cannot extend + // the current block at all, so flush whatever is in it now + // if it ends on a newline. + // We still copy i(==N) to i+R to the next block and + // continue as before with a new R. + // + // The only exception on the flush is when we run out of + // data in the input. In that case we skip it as we don't + // yet know if the next record will fit. + // + // Both conditions share the same code here: + // - Look for newline (pos N) + // - Write i to N (which maybe 0) + // - Flush if block ends on newline and not end of input + // - write N to i+R + + int i = 0; + BGZF *fb = fp->fp.bgzf; + while (i < gl->data_size) { + // remaining space in block + int R = BGZF_BLOCK_SIZE - fb->block_offset; + int eod = 0; + if (R > gl->data_size-i) + R = gl->data_size-i, eod = 1; + + // Find last newline in input data + int N = i + R; + while (--N > i) { + if (gl->data[N] == '\n') + break; + } + + if (N != i) { + // Found a newline + N++; + if (bgzf_write(fb, &gl->data[i], N-i) != N-i) + goto err; + } + + // Flush bgzf block + int b_off = fb->block_offset; + if (!eod && b_off && + ((char *)fb->uncompressed_block)[b_off-1] == '\n') + if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0) + goto err; + + // Copy from N onwards into next block + if (i+R > N) + if (bgzf_write(fb, &gl->data[N], i+R - N) + != i+R - N) + goto err; + + i = i+R; + } } else { if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size) goto err; @@ -4005,7 +4101,7 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { fp->line.l = 0; if (ret < 0) { hts_log_warning("Parse error at line %lld", (long long)fp->lineno); - if (h->ignore_sam_err) goto err_recover; + if (h && h->ignore_sam_err) goto err_recover; } } @@ -4354,6 +4450,8 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (sam_format1(h, b, &fp->line) < 0) return -1; kputc('\n', &fp->line); if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; } else { if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; @@ -4393,6 +4491,8 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (fastq_format1(fp->state, b, &fp->line) < 0) return -1; if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) return -1; } else { @@ -5233,6 +5333,7 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod mod[256]; if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del, m, mod, 256)) > 0) { + int o_indel = indel; if (ks_resize(ins, ins->l + nm*16+3) < 0) return -1; ins->s[indel++] = '['; @@ -5256,6 +5357,7 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, qual); } ins->s[indel++] = ']'; + ins->l += indel - o_indel; // grow by amount we used } } break; @@ -6019,6 +6121,7 @@ struct hts_base_mod_state { char *MMend[MAX_BASE_MOD]; // end of pos-delta string uint8_t *ML[MAX_BASE_MOD]; // next qual int MLstride[MAX_BASE_MOD]; // bytes between quals for this type + int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? int seq_pos; // current position along sequence int nmods; // used array size (0 to MAX_BASE_MOD-1). }; @@ -6087,9 +6190,10 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { char *cp = (char *)mm+1; int mod_num = 0; + int implicit = 1; while (*cp) { for (; *cp; cp++) { - // cp should be [ACGTNU][+-][^,]*(,\d+)*; + // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; unsigned char btype = *cp++; if (btype != 'A' && btype != 'C' && @@ -6109,18 +6213,31 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { char *ms = cp, *me; // mod code start and end char *cp_end = NULL; int chebi = 0; - if (isdigit(*cp)) { + if (isdigit_c(*cp)) { chebi = strtol(cp, &cp_end, 10); cp = cp_end; ms = cp-1; } else { - while (*cp && *cp != ',' && *cp != ';') + while (*cp && isalpha_c(*cp)) cp++; if (*cp == '\0') return -1; } + me = cp; + // Optional explicit vs implicit marker + if (*cp == '.') { + // default is implicit = 1; + cp++; + } else if (*cp == '?') { + implicit = 0; + cp++; + } else if (*cp != ',' && *cp != ';') { + // parse error + return -1; + } + long delta; int n = 0; // nth symbol in a multi-mod string int stride = me-ms; @@ -6170,7 +6287,13 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { state->strand [mod_num] = (strand == '-'); state->canonical[mod_num] = btype; state->MLstride [mod_num] = stride; + state->implicit [mod_num] = implicit; + if (delta < 0) { + hts_log_error("MM tag refers to bases beyond sequence " + "length"); + return -1; + } state->MMcount [mod_num] = delta; if (b->core.flag & BAM_FREVERSE) { state->MM [mod_num] = cp+1; @@ -6386,3 +6509,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, return r; } + +/* + * Returns the list of base modification codes provided for this + * alignment record as an array of character codes (+ve) or ChEBI numbers + * (negative). + * + * Returns the array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). + */ +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { + *ntype = state->nmods; + return state->type; +} + +/* + * Returns data about a specific modification type for the alignment record. + * Code is either positive (eg 'm') or negative for ChEBI numbers. + * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical) { + // Find code entry + int i; + for (i = 0; i < state->nmods; i++) { + if (state->type[i] == code) + break; + } + if (i == state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} diff --git a/tabix.1 b/tabix.1 index 2c012442c..15ac768e7 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,4 +1,4 @@ -.TH tabix 1 "7 April 2022" "htslib-1.15.1" "Bioinformatics tools" +.TH tabix 1 "18 August 2022" "htslib-1.16" "Bioinformatics tools" .SH NAME .PP tabix \- Generic indexer for TAB-delimited genome position files @@ -169,7 +169,7 @@ The default is 3, which turns on error and warning messages; Values higher than 3 produce additional informational and debugging messages. .PP .SH EXAMPLE -(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz; +(grep "^#" in.gff; grep -v "^#" in.gff | sort -t"`printf '\(rst'`" -k1,1 -k4,4n) | bgzip > sorted.gff.gz; tabix -p gff sorted.gff.gz; diff --git a/tbx.c b/tbx.c index f0310a257..3af2c09fb 100644 --- a/tbx.c +++ b/tbx.c @@ -93,12 +93,11 @@ int tbx_name2id(tbx_t *tbx, const char *ss) int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) { - int i, b = 0, id = 1, ncols = 0; + int i, b = 0, id = 1; char *s; intv->ss = intv->se = 0; intv->beg = intv->end = -1; for (i = 0; i <= len; ++i) { if (line[i] == '\t' || line[i] == 0) { - ++ncols; if (id == conf->sc) { intv->ss = line + b; intv->se = line + i; } else if (id == conf->bc) { diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index cefdc545c..a6e7654cf 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -35,6 +35,7 @@ 34 C C+m204 C+(76792)33 35 A --- +Present: m #-76792 n 6 C C+m102 15 N N+n212 17 C C+m128 diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out index 82d086a2f..e21ae314e 100644 --- a/test/base_mods/MM-double.out +++ b/test/base_mods/MM-double.out @@ -35,6 +35,7 @@ 34 A 35 T --- +Present: m m o 1 G G-m115 7 C C+m128 12 G G-m141 diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out new file mode 100644 index 000000000..4abedc719 --- /dev/null +++ b/test/base_mods/MM-explicit-x.out @@ -0,0 +1,103 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h.10 +10 C C+m.50 C+h.170 +11 T +12 A +13 C +14 C C+m.160 C+h.20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m?200 C+h?10 +10 C C+m?50 C+h?170 +11 T +12 A +13 C C+m?10 C+h?5 +14 C C+m?160 C+h?20 +15 G +16 C C+m?10 C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h?10 +10 C C+h?170 +11 T +12 A +13 C C+h?5 +14 C C+m.160 C+h?20 +15 G +16 C C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 + +=== + diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out new file mode 100644 index 000000000..f28b25f83 --- /dev/null +++ b/test/base_mods/MM-explicit.out @@ -0,0 +1,103 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 + +=== + diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam new file mode 100644 index 000000000..e85afa293 --- /dev/null +++ b/test/base_mods/MM-explicit.sam @@ -0,0 +1,27 @@ +@CO Testing explicit vs implicit base modifications. +@CO This covers the case where a lack of a signal could be either +@CO implicitly assumed to be no-mod (default) or assumed to be +@CO unchecked and require an explicit statement to indicate it was +@CO looked at and no base modification was observed. +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r1; implicit +@CO - - .. -. - -- +@CO Mm M +@CO - - .. -. - -- +@CO hH h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r2; explicit to a small region +@CO - - ?? ?? ? -- +@CO Mm mM m +@CO - - ?? ?? ? -- +@CO hH hh h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r3; mixture +@CO - - . -. - -- +@CO M M +@CO - - ?? ?? ? -- +@CO hH hh h -- +@CO +r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20 +r2 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh?,2,0,0,0,0; Ml:B:C,200,10,50,170,10,5,160,20,10,5 +r3 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0; Ml:B:C,200,160,10,170,5,20,5 diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out index 23c98d97b..e411a81ee 100644 --- a/test/base_mods/MM-multi.out +++ b/test/base_mods/MM-multi.out @@ -35,6 +35,7 @@ 34 C C+m230 C+h6 35 A --- +Present: m h n 6 C C+m128 15 N N+n215 17 C C+m153 @@ -83,6 +84,7 @@ 34 C C+m204 C+h31 35 A --- +Present: m h n 6 C C+m77 C+h159 15 N N+n240 17 C C+m103 C+h133 diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 865a539c7..3809c0e6e 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -33,9 +33,11 @@ # samtools binary. This can be useful for testing older versions. # Test files from SAM spec -P MM-chebi.out $test_mod MM-chebi.sam -P MM-double.out $test_mod MM-double.sam -P MM-multi.out $test_mod MM-multi.sam +P MM-chebi.out $test_mod MM-chebi.sam +P MM-double.out $test_mod MM-double.sam +P MM-multi.out $test_mod MM-multi.sam +P MM-explicit.out $test_mod MM-explicit.sam +P MM-explicit-x.out $test_mod -x MM-explicit.sam # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst index 966f0ed8a..3b5fd9f4f 100644 --- a/test/fastq/fastq.tst +++ b/test/fastq/fastq.tst @@ -44,6 +44,9 @@ P minimal-q.sam $tview minimal.fa P multiline.sam $tview multiline.fq P multiline-q.sam $tview multiline.fa +# FASTQ with a very long header line +P longline.sam $tview -i fastq_aux longline.fq + # Single file, unpaired data, with / without aux tags P single_noaux.sam $tview single.fq P single_noaux-q.sam $tview single.fa diff --git a/test/fastq/longline.fq b/test/fastq/longline.fq new file mode 100644 index 000000000..09cabd1a3 --- /dev/null +++ b/test/fastq/longline.fq @@ -0,0 +1,4 @@ +@readname XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab +ATGC ++ +qqqq diff --git a/test/fastq/longline.sam b/test/fastq/longline.sam new file mode 100644 index 000000000..4dc5e8215 --- /dev/null +++ b/test/fastq/longline.sam @@ -0,0 +1 @@ +readname 4 * 0 0 * * 0 0 ATGC qqqq XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab diff --git a/test/index.vcf.gz.csi b/test/index.vcf.gz.csi index 644832d83..250339624 100644 Binary files a/test/index.vcf.gz.csi and b/test/index.vcf.gz.csi differ diff --git a/test/index.vcf.gz.tbi b/test/index.vcf.gz.tbi index 4d6e99781..e9ab7b60d 100644 Binary files a/test/index.vcf.gz.tbi and b/test/index.vcf.gz.tbi differ diff --git a/test/sam.c b/test/sam.c index cc5bfe77a..036349f2b 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1,6 +1,6 @@ /* test/sam.c -- SAM/BAM/CRAM API test cases. - Copyright (C) 2014-2020 Genome Research Ltd. + Copyright (C) 2014-2020, 2022 Genome Research Ltd. Author: John Marshall @@ -1525,7 +1525,11 @@ static void test_text_file(const char *filename, int nexp) if (in) { kstring_t str = KS_INITIALIZE; int ret, n = 0; - while ((ret = hts_getline(in, '\n', &str)) >= 0) n++; + while ((ret = hts_getline(in, '\n', &str)) >= 0) { + size_t len = strlen(str.s); + n++; + if (ret != len) fail("hts_getline read length %d (expected %zu)", ret, len); + } if (ret != -1) fail("hts_getline got an error from %s", filename); if (n != nexp) fail("hts_getline read %d lines from %s (expected %d)", n, filename, nexp); diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index effb77a26..129516b24 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -53,3 +53,6 @@ P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@' P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@' P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@' P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@' +P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | egrep -v '^@' +P func6.out $tv -i 'filter=rlen<50' ../realn02.sam | egrep -v '^@' +P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | egrep -v '^@' diff --git a/test/sam_filter/func5.out b/test/sam_filter/func5.out new file mode 100644 index 000000000..6c2e2bc64 --- /dev/null +++ b/test/sam_filter/func5.out @@ -0,0 +1,5 @@ +ERR013140.3521432 99 17 1 29 22S86M = 226 313 AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA @AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=> MD:Z:86 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR156632.12704932 163 17 1 29 36S64M = 195 293 TGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTG BFAFGFEIGFEFHHEIDKJGHHHJIIE=@KKGGKJGIBLLMFKMDIIHJKKHFELLLKFIHMHIHHIHLKJFCHFJIJAID=JHKFGHJIHKKCH:@HD? MD:Z:64 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR156632.9601178 99 17 1 29 62S38M = 279 377 CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE MD:Z:38 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.13475139 99 17 2401 60 88M20S = 2680 386 AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT @CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228% X0:i:1 X1:i:0 XC:i:88 MD:Z:88 RG:Z:rg AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/sam_filter/func6.out b/test/sam_filter/func6.out new file mode 100644 index 000000000..de091ed96 --- /dev/null +++ b/test/sam_filter/func6.out @@ -0,0 +1,2 @@ +ERR156632.9601178 99 17 1 29 62S38M = 279 377 CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE MD:Z:38 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/sam_filter/func7.out b/test/sam_filter/func7.out new file mode 100644 index 000000000..1fe2500bf --- /dev/null +++ b/test/sam_filter/func7.out @@ -0,0 +1,3 @@ +ERR013140.3521432 99 17 1 29 22S86M = 226 313 AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA @AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=> MD:Z:86 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.13475139 99 17 2401 60 88M20S = 2680 386 AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT @CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228% X0:i:1 X1:i:0 XC:i:88 MD:Z:88 RG:Z:rg AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/tabix/vcf_file.bcf b/test/tabix/vcf_file.bcf index 75a64b38c..a4aafec47 100644 Binary files a/test/tabix/vcf_file.bcf and b/test/tabix/vcf_file.bcf differ diff --git a/test/tabix/vcf_file.vcf b/test/tabix/vcf_file.vcf index de0a7c7b6..d3cf30fc8 100644 --- a/test/tabix/vcf_file.vcf +++ b/test/tabix/vcf_file.vcf @@ -35,3 +35,4 @@ 2 3199812 . G GTT,GT 82.7 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:322:26 1/2:322:26 3 3212016 . CTT C,CT 79 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:91:26 1/2:91:26 4 3258448 . TACACACAC T . PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31 +4 3258501 . C A,T,G,CA,CT,CG,CC,CAA,CAT,CAG,CAC,CTA,CTT,CTG,CTC,CGA,CGT,CGG,CGC,CCA,CCT,CCG,CCC,CAAA,CAAT,CAAG,CAAC,CATA,CATT,CATG,CATC,CAGA,CAGT,CAGG,CAGC,CACA,CACT,CACG,CACC,CTAA,CTAT,CTAG,CTAC,CTTA,CTTT,CTTG,CTTC,CTGA,CTGT,CTGG,CTGC,CTCA,CTCT,CTCG,CTCC,CGAA,CGAT,CGAG,CGAC,CGTA,CGTT,CGTG,CGTC,CGGA,CGGT,CGGG,CGGC,CGCA,CGCT,CGCG,CGCC,CCAA,CCAT,CCAG,CCAC,CCTA,CCTT,CCTG,CCTC,CCGA,CCGT,CCGG,CCGC,CCCA,CCCT,CCCG,CCCC,CAAAA,CAAAT,CAAAG,CAAAC,CAATA,CAATT,CAATG,CAATC,CAAGA,CAAGT,CAAGG,CAAGC,CAACA,CAACT,CAACG,CAACC,CATAA,CATAT,CATAG,CATAC,CATTA,CATTT,CATTG,CATTC,CATGA,CATGT,CATGG,CATGC,CATCA,CATCT,CATCG,CATCC,CAGAA,CAGAT,CAGAG,CAGAC,CAGTA,CAGTT,CAGTG,CAGTC,CAGGA,CAGGT,CAGGG,CAGGC,CAGCA,CAGCT,CAGCG,CAGCC,CACAA,CACAT,CACAG,CACAC,CACTA,CACTT,CACTG,CACTC,CACGA,CACGT,CACGG,CACGC,CACCA,CACCT,CACCG,CACCC,CTAAA,CTAAT,CTAAG,CTAAC,CTATA,CTATT,CTATG,CTATC,CTAGA,CTAGT,CTAGG,CTAGC,CTACA,CTACT,CTACG,CTACC,CTTAA,CTTAT,CTTAG,CTTAC,CTTTA,CTTTT,CTTTG,CTTTC,CTTGA,CTTGT,CTTGG,CTTGC,CTTCA,CTTCT,CTTCG,CTTCC,CTGAA,CTGAT,CTGAG,CTGAC,CTGTA,CTGTT,CTGTG,CTGTC,CTGGA,CTGGT,CTGGG,CTGGC,CTGCA,CTGCT,CTGCG,CTGCC,CTCAA,CTCAT,CTCAG,CTCAC,CTCTA,CTCTT,CTCTG,CTCTC,CTCGA,CTCGT,CTCGG,CTCGC,CTCCA,CTCCT,CTCCG,CTCCC,CGAAA,CGAAT,CGAAG,CGAAC,CGATA,CGATT,CGATG,CGATC,CGAGA,CGAGT,CGAGG,CGAGC,CGACA,CGACT,CGACG,CGACC,CGTAA,CGTAT,CGTAG,CGTAC,CGTTA,CGTTT,CGTTG,CGTTC,CGTGA,CGTGT,CGTGG,CGTGC,CGTCA,CGTCT,CGTCG,CGTCC,CGGAA,CGGAT,CGGAG,CGGAC,CGGTA,CGGTT,CGGTG,CGGTC,CGGGA,CGGGT,CGGGG,CGGGC,CGGCA,CGGCT,CGGCG,CGGCC,CGCAA,CGCAT,CGCAG,CGCAC,CGCTA,CGCTT,CGCTG,CGCTC,CGCGA,CGCGT,CGCGG,CGCGC,CGCCA,CGCCT,CGCCG,CGCCC,CCAAA,CCAAT,CCAAG,CCAAC,CCATA,CCATT,CCATG,CCATC,CCAGA,CCAGT,CCAGG,CCAGC,CCACA,CCACT,CCACG,CCACC,CCTAA,CCTAT,CCTAG,CCTAC,CCTTA,CCTTT,CCTTG,CCTTC,CCTGA,CCTGT 45 PASS AN=4;AC=2 GT 0/300 240/260 diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c new file mode 100644 index 000000000..e5092084e --- /dev/null +++ b/test/test-bcf_set_variant_type.c @@ -0,0 +1,135 @@ +/* test/test-bcf_set_variant_type.c -- bcf_set_variant_type test harness. + + Copyright (C) 2022 Genome Research Ltd. + + Author: Martin Pollard + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include + +#include "../htslib/hts.h" +#include "../vcf.c" + +void error(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + if (strrchr(format, '\n') == NULL) fputc('\n', stderr); + exit(-1); +} + +static void test_bcf_set_variant_type() +{ + // Test SNVs + bcf_variant_t var1; + bcf_set_variant_type("A", "T", &var1); + if ( var1.type != VCF_SNP) + { + error("A -> T was not detected as a SNP"); + } + + // Test INDEL + bcf_variant_t var2a; + bcf_set_variant_type("A", "AA", &var2a); + if ( var2a.type != (VCF_INDEL|VCF_INS) ) + { + error("A -> AA was not detected as an INDEL"); + } + bcf_variant_t var2b; + bcf_set_variant_type("AA", "A", &var2b); + if ( var2b.type != (VCF_INDEL|VCF_DEL) ) + { + error("AA -> A was not detected as a INDEL"); + } + + // Test breakends + bcf_variant_t var3a; + bcf_set_variant_type("N", "N]16:33625444]", &var3a); + if ( var3a.type != VCF_BND) + { + error("N]16:33625444] was not detected as a breakend"); + } + + bcf_variant_t var3b; + bcf_set_variant_type("N", "N[16:33625444[", &var3b); + if (var3b.type != VCF_BND) + { + error("N[16:33625444[ was not detected as a breakend"); + } + + bcf_variant_t var3c; + bcf_set_variant_type("N", "]16:33625444]N", &var3c); + if ( var3c.type != VCF_BND) + { + error("]16:33625444]N was not detected as a breakend"); + } + + bcf_variant_t var3d; + bcf_set_variant_type("N", "[16:33625444[N", &var3d); + if ( var3d.type != VCF_BND) + { + error("[16:33625444[N was not detected as a breakend"); + } + // Test special reference alleles + bcf_variant_t var4a; + bcf_set_variant_type("A", "", &var4a); + if ( var4a.type != VCF_REF) + { + error(" was not detected as a special reference allele"); + } + bcf_variant_t var4b; + bcf_set_variant_type("A", "<*>", &var4b); + if ( var4b.type != VCF_REF) + { + error("<*> was not detected as a special reference allele"); + } + // Test MNP + bcf_variant_t var5; + bcf_set_variant_type("AA", "TT", &var5); + if ( var5.type != VCF_MNP) + { + error("AA->TT was not detected as a MNP"); + } + // Test Overlapping allele + bcf_variant_t var6; + bcf_set_variant_type("A", "*", &var6); + if ( var6.type != VCF_OVERLAP) + { + error("A->* was not detected as an overlap"); + } + // Test . + bcf_variant_t var7; + bcf_set_variant_type("A", ".", &var7); + if ( var7.type != VCF_REF) + { + error("A->. was not detected as a special reference allele"); + } +} + +int main(int argc, char **argv) +{ + test_bcf_set_variant_type(); + return 0; +} + diff --git a/test/test-logging.pl b/test/test-logging.pl index 1040b0e47..2f22560b5 100755 --- a/test/test-logging.pl +++ b/test/test-logging.pl @@ -33,7 +33,7 @@ sub check_log_message my ($message, $filename, $line_num) = @_; $log_message_count++; - unless ($message =~ /^\"([A-Z]|%s)/) + unless ($message =~ /^\"([A-Z!-@]|%s)/) { print "$filename line $line_num:\n"; print "Log message should begin with a capital letter: $message.\n"; diff --git a/test/test.pl b/test/test.pl index ff5601b0d..d6c01786a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2021 Genome Research Ltd. +# Copyright (C) 2012-2022 Genome Research Ltd. # # Author: Petr Danecek # @@ -32,6 +32,7 @@ use IO::Handle; my $opts = parse_params(); +srand($$opts{seed}); test_bgzip($opts, 0); test_bgzip($opts, 4); @@ -59,6 +60,7 @@ test_logging($opts); test_plugin_loading($opts); test_realn($opts); +test_bcf_set_variant_type($opts); print "\nNumber of tests:\n"; printf " total .. %d\n", $$opts{nok}+$$opts{nfailed}; @@ -79,6 +81,7 @@ sub error "Usage: test.pl [OPTIONS]\n", "Options:\n", " -r, --redo-outputs Recreate expected output files.\n", + " -s, --random-seed Initialise rand() with a different seed.\n", " -t, --temp-dir When given, temporary files will not be removed.\n", " -f, --fail-fast Fail-fast mode: exit as soon as a test fails.\n", " -h, -?, --help This help message.\n", @@ -104,12 +107,13 @@ sub safe_tempdir sub parse_params { - my $opts = { keep_files=>0, nok=>0, nfailed=>0 }; + my $opts = { keep_files=>0, nok=>0, nfailed=>0, seed=>42 }; my $help; Getopt::Long::Configure('bundling'); my $ret = GetOptions ( 't|temp-dir:s' => \$$opts{keep_files}, 'r|redo-outputs' => \$$opts{redo_outputs}, + 's|random-seed=i' => \$$opts{seed}, 'f|fail-fast' => \$$opts{fail_fast}, 'h|?|help' => \$help ); @@ -614,7 +618,7 @@ sub test_view ## Experimental CRAM 4.0 support. # SAM -> CRAM40 -> SAM - my @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam" + @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam" ? (qw/fast normal small archive/) : (qw/archive/); foreach my $profile (@p) { @@ -633,6 +637,14 @@ sub test_view testv $opts, "./compare_sam.pl -Baux $md $sam $jsam"; } + # embed_ref=2 mode + my $ersam = "ce#1000.sam"; + my $ercram = "ce#1000_er.tmp.cram"; + my $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { passed($opts, "$sam conversions"); @@ -974,7 +986,7 @@ sub test_bcf_sr_sort my ($opts, %args) = @_; for (my $i=0; $i<10; $i++) { - my $seed = int(rand(time)); + my $seed = int(rand(100000000)); my $test = 'test-bcf-sr'; my $cmd = "$$opts{path}/test-bcf-sr.pl -t $$opts{tmp} -s $seed"; print "$test:\n"; @@ -1052,3 +1064,17 @@ sub test_realn { # Revert quality values (using data in ZQ tags) test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn02.fa -i $$opts{path}/realn02_exp-a.sam -o -", out => "realn02_exp.sam"); } + +sub test_bcf_set_variant_type +{ + my ($opts) = @_; + my $test = 'test-bcf_set_variant_type'; + my $cmd = "$$opts{path}/test-bcf_set_variant_type"; + print "$test:\n"; + print "\t$cmd\n"; + my ($ret,$out) = _cmd($cmd); + if ( $ret ) { + print $out; + failed($opts,$test); + } else { passed($opts,$test); } +} diff --git a/test/test_expr.c b/test/test_expr.c index 606a9b3b5..ecd1232e4 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -1,6 +1,6 @@ /* test-expr.c -- Testing: filter expression parsing and processing. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020, 2022 Genome Research Ltd. Author: James Bonfield @@ -51,16 +51,34 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { *end = str+5; res->is_str = 1; kputs("plugh", ks_clear(&res->s)); + } else if (strncmp(str, "empty-but-true", 14) == 0) { + // empty string + *end = str+14; + res->is_true = 1; + res->is_str = 1; + kputs("", ks_clear(&res->s)); } else if (strncmp(str, "empty", 5) == 0) { // empty string *end = str+5; res->is_str = 1; kputs("", ks_clear(&res->s)); + } else if (strncmp(str, "zero-but-true", 13) == 0) { + *end = str+13; + res->d = 0; + res->is_true = 1; + } else if (strncmp(str, "null-but-true", 13) == 0) { + *end = str+13; + hts_expr_val_undef(res); + res->is_true = 1; } else if (strncmp(str, "null", 4) == 0) { // null string (eg aux:Z tag is absent) *end = str+4; - res->is_str = 1; - ks_clear(&res->s); + hts_expr_val_undef(res); + } else if (strncmp(str, "nan", 3) == 0) { + // sqrt(-1), 0/0 and similar + // Semantically the same operations as null. + *end = str+3; + hts_expr_val_undef(res); } else { return -1; @@ -70,155 +88,270 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { } typedef struct { + int truth_val; double dval; char *sval; char *str; } test_ev; +static inline int strcmpnull(const char *a, const char *b) { + if (!a && !b) return 0; + if (!a && b) return -1; + if (a && !b) return 1; + return strcmp(a, b); +} + +// Compare NAN as equal, for testing we returned the correct values +static inline int cmpfloat(double d1, double d2) { + // If needs be, can use DBL_EPSILON in comparisons here. + return d1 == d2 || (isnan(d1) && isnan(d2)); +} + int test(void) { // These are all valid expressions that should work test_ev tests[] = { - { 1, NULL, "1"}, - { 1, NULL, "+1"}, - { -1, NULL, "-1"}, - { 0, NULL, "!7"}, - { 1, NULL, "!0"}, - { 1, NULL, "!(!7)"}, - { 1, NULL, "!!7"}, - - { 5, NULL, "2+3"}, - { -1, NULL, "2+-3"}, - { 6, NULL, "1+2+3"}, - { 1, NULL, "-2+3"}, - - { 6, NULL, "2*3"}, - { 6, NULL, "1*2*3"}, - { 0, NULL, "2*0"}, - - { 7, NULL, "(7)"}, - { 7, NULL, "((7))"}, - { 21, NULL, "(1+2)*(3+4)"}, - { 14, NULL, "(4*5)-(-2*-3)"}, - - { 1, NULL, "(1+2)*3==9"}, - { 1, NULL, "(1+2)*3!=8"}, - { 0, NULL, "(1+2)*3!=9"}, - { 0, NULL, "(1+2)*3==8"}, - - { 0, NULL, "1>2"}, - { 1, NULL, "1<2"}, - { 0, NULL, "3<3"}, - { 0, NULL, "3>3"}, - { 1, NULL, "9<=9"}, - { 1, NULL, "9>=9"}, - { 1, NULL, "2*4==8"}, - { 1, NULL, "16==0x10"}, - { 1, NULL, "15<0x10"}, - { 1, NULL, "17>0x10"}, - { 0, NULL, "2*4!=8"}, - { 1, NULL, "4+2<3+4"}, - { 0, NULL, "4*2<3+4"}, - { 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 - - { 1, NULL, "(1<2) == (3>2)"}, - { 1, NULL, "1<2 == 3>2"}, - - { 1, NULL, "2 && 1"}, - { 0, NULL, "2 && 0"}, - { 0, NULL, "0 && 2"}, - { 1, NULL, "2 || 1"}, - { 1, NULL, "2 || 0"}, - { 1, NULL, "0 || 2"}, - { 1, NULL, "1 || 2 && 3"}, - { 1, NULL, "2 && 3 || 1"}, - { 1, NULL, "0 && 3 || 2"}, - { 0, NULL, "0 && 3 || 0"}, - - { 1, NULL, "3 & 1"}, - { 2, NULL, "3 & 2"}, - { 3, NULL, "1 | 2"}, - { 3, NULL, "1 | 3"}, - { 7, NULL, "1 | 6"}, - { 2, NULL, "1 ^ 3"}, - - { 1, NULL, "(1^0)&(4^3)"}, - { 2, NULL, "1 ^(0&4)^ 3"}, - { 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ - - { 6, NULL, "(1|0)^(4|3)"}, - { 7, NULL, "1 |(0^4)| 3"}, - { 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | - - { 1, NULL, "4 & 2 || 1"}, - { 1, NULL, "(4 & 2) || 1"}, - { 0, NULL, "4 & (2 || 1)"}, - { 1, NULL, "1 || 4 & 2"}, - { 1, NULL, "1 || (4 & 2)"}, - { 0, NULL, "(1 || 4) & 2"}, - - { 1, NULL, " (2*3)&7 > 4"}, - { 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv - { 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv - { 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, - - { 1, "plugh", "magic"}, - { 1, "", "empty"}, - { 1, NULL, "magic == \"plugh\""}, - { 1, NULL, "magic != \"xyzzy\""}, - - { 1, NULL, "\"abc\" < \"def\""}, - { 1, NULL, "\"abc\" <= \"abc\""}, - { 0, NULL, "\"abc\" < \"ab\""}, - { 0, NULL, "\"abc\" <= \"ab\""}, - - { 0, NULL, "\"abc\" > \"def\""}, - { 1, NULL, "\"abc\" >= \"abc\""}, - { 1, NULL, "\"abc\" > \"ab\""}, - { 1, NULL, "\"abc\" >= \"ab\""}, - - { 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, - { 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, - { 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, - { 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, + { 1, 1, NULL, "1"}, + { 1, 1, NULL, "+1"}, + { 1, -1, NULL, "-1"}, + { 0, 0, NULL, "!7"}, + { 1, 1, NULL, "!0"}, + { 1, 1, NULL, "!(!7)"}, + { 1, 1, NULL, "!!7"}, + + { 1, 5, NULL, "2+3"}, + { 1, -1, NULL, "2+-3"}, + { 1, 6, NULL, "1+2+3"}, + { 1, 1, NULL, "-2+3"}, + { 0, NAN, NULL, "1+null" }, + { 0, NAN, NULL, "null-1" }, + { 0, NAN, NULL, "-null" }, + + { 1, 6, NULL, "2*3"}, + { 1, 6, NULL, "1*2*3"}, + { 0, 0, NULL, "2*0"}, + + { 1, 7, NULL, "(7)"}, + { 1, 7, NULL, "((7))"}, + { 1, 21, NULL, "(1+2)*(3+4)"}, + { 1, 14, NULL, "(4*5)-(-2*-3)"}, + + { 0, NAN, NULL, "2*null"}, + { 0, NAN, NULL, "null/2"}, + { 0, NAN, NULL, "0/0"}, + + { 1, 1, NULL, "(1+2)*3==9"}, + { 1, 1, NULL, "(1+2)*3!=8"}, + { 0, 0, NULL, "(1+2)*3!=9"}, + { 0, 0, NULL, "(1+2)*3==8"}, + + { 0, 0, NULL, "1>2"}, + { 1, 1, NULL, "1<2"}, + { 0, 0, NULL, "3<3"}, + { 0, 0, NULL, "3>3"}, + { 1, 1, NULL, "9<=9"}, + { 1, 1, NULL, "9>=9"}, + { 1, 1, NULL, "2*4==8"}, + { 1, 1, NULL, "16==0x10"}, + { 1, 1, NULL, "15<0x10"}, + { 1, 1, NULL, "17>0x10"}, + { 0, 0, NULL, "2*4!=8"}, + { 1, 1, NULL, "4+2<3+4"}, + { 0, 0, NULL, "4*2<3+4"}, + { 1, 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 + + { 1, 1, NULL, "(1<2) == (3>2)"}, + { 1, 1, NULL, "1<2 == 3>2"}, + + { 0, NAN, NULL, "null <= 0" }, + { 0, NAN, NULL, "null >= 0" }, + { 0, NAN, NULL, "null < 0" }, + { 0, NAN, NULL, "null > 0" }, + { 0, NAN, NULL, "null == null" }, + { 0, NAN, NULL, "null != null" }, + { 0, NAN, NULL, "null < 10" }, + { 0, NAN, NULL, "10 > null" }, + + { 1, 1, NULL, "2 && 1"}, + { 0, 0, NULL, "2 && 0"}, + { 0, 0, NULL, "0 && 2"}, + { 1, 1, NULL, "2 || 1"}, + { 1, 1, NULL, "2 || 0"}, + { 1, 1, NULL, "0 || 2"}, + { 1, 1, NULL, "1 || 2 && 3"}, + { 1, 1, NULL, "2 && 3 || 1"}, + { 1, 1, NULL, "0 && 3 || 2"}, + { 0, 0, NULL, "0 && 3 || 0"}, + { 0, 0, NULL, " 5 - 5 && 1"}, + { 0, 0, NULL, "+5 - 5 && 1"}, + { 0, 0, NULL, "null && 1"}, // null && x == null + { 0, 0, NULL, "1 && null"}, + { 1, 1, NULL, "!null && 1"}, + { 1, 1, NULL, "1 && !null"}, + { 1, 1, NULL, "1 && null-but-true"}, + { 0, 0, NULL, "null || 0"}, // null || 0 == null + { 0, 0, NULL, "0 || null"}, + { 1, 1, NULL, "!null || 0"}, + { 1, 1, NULL, "0 || !null"}, + { 1, 1, NULL, "0 || null-but-true"}, + { 1, 1, NULL, "null || 1"}, // null || 1 == 1 + { 1, 1, NULL, "1 || null"}, + + { 1, 1, NULL, "3 & 1"}, + { 1, 2, NULL, "3 & 2"}, + { 1, 3, NULL, "1 | 2"}, + { 1, 3, NULL, "1 | 3"}, + { 1, 7, NULL, "1 | 6"}, + { 1, 2, NULL, "1 ^ 3"}, + { 0, NAN, NULL, "1 | null"}, + { 0, NAN, NULL, "null | 1"}, + { 0, NAN, NULL, "1 & null"}, + { 0, NAN, NULL, "null & 1"}, + { 0, NAN, NULL, "0 ^ null"}, + { 0, NAN, NULL, "null ^ 0"}, + { 0, NAN, NULL, "1 ^ null"}, + { 0, NAN, NULL, "null ^ 1"}, + + { 1, 1, NULL, "(1^0)&(4^3)"}, + { 1, 2, NULL, "1 ^(0&4)^ 3"}, + { 1, 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ + + { 1, 6, NULL, "(1|0)^(4|3)"}, + { 1, 7, NULL, "1 |(0^4)| 3"}, + { 1, 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | + + { 1, 1, NULL, "4 & 2 || 1"}, + { 1, 1, NULL, "(4 & 2) || 1"}, + { 0, 0, NULL, "4 & (2 || 1)"}, + { 1, 1, NULL, "1 || 4 & 2"}, + { 1, 1, NULL, "1 || (4 & 2)"}, + { 0, 0, NULL, "(1 || 4) & 2"}, + + { 1, 1, NULL, " (2*3)&7 > 4"}, + { 0, 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, + + { 1, 1, "plugh", "magic"}, + { 1, 1, "", "empty"}, + { 1, 1, NULL, "magic == \"plugh\""}, + { 1, 1, NULL, "magic != \"xyzzy\""}, + + { 1, 1, NULL, "\"abc\" < \"def\""}, + { 1, 1, NULL, "\"abc\" <= \"abc\""}, + { 0, 0, NULL, "\"abc\" < \"ab\""}, + { 0, 0, NULL, "\"abc\" <= \"ab\""}, + + { 0, 0, NULL, "\"abc\" > \"def\""}, + { 1, 1, NULL, "\"abc\" >= \"abc\""}, + { 1, 1, NULL, "\"abc\" > \"ab\""}, + { 1, 1, NULL, "\"abc\" >= \"ab\""}, + + { 0, NAN, NULL, "null == \"x\"" }, + { 0, NAN, NULL, "null != \"x\"" }, + { 0, NAN, NULL, "null < \"x\"" }, + { 0, NAN, NULL, "null > \"x\"" }, + + { 1, 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, + { 0, 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, + + { 1, 1, "", "empty-but-true" }, + { 0, 0, NULL, "!empty-but-true" }, + { 1, 1, NULL, "!!empty-but-true" }, + { 1, 1, NULL, "1 && empty-but-true && 1" }, + { 0, 0, NULL, "1 && empty-but-true && 0" }, + + { 0, NAN, NULL, "null" }, + { 1, 1, NULL, "!null" }, + { 0, 0, NULL, "!!null", }, + { 0, 0, NULL, "!\"foo\"" }, + { 1, 1, NULL, "!!\"foo\"" }, + + { 1, NAN, NULL, "null-but-true" }, + { 0, 0, NULL, "!null-but-true" }, + { 1, 1, NULL, "!!null-but-true" }, + { 1, 0, NULL, "zero-but-true" }, + { 0, 0, NULL, "!zero-but-true" }, + { 1, 1, NULL, "!!zero-but-true" }, + + { 1, log(2), NULL, "log(2)"}, + { 1, exp(9), NULL, "exp(9)"}, + { 1, 9, NULL, "log(exp(9))"}, + { 1, 8, NULL, "pow(2,3)"}, + { 1, 3, NULL, "sqrt(9)"}, + { 0, NAN, NULL, "sqrt(-9)"}, + + { 1, 2, NULL, "default(2,3)"}, + { 1, 3, NULL, "default(null,3)"}, + { 0, 0, NULL, "default(null,0)"}, + { 1, NAN, NULL, "default(null-but-true,0)"}, + { 1, NAN, NULL, "default(null-but-true,null)"}, + { 1, NAN, NULL, "default(null,null-but-true)"}, + + { 1, 1, NULL, "exists(\"foo\")"}, + { 1, 1, NULL, "exists(12)"}, + { 1, 1, NULL, "exists(\"\")"}, + { 1, 1, NULL, "exists(0)"}, + { 0, 0, NULL, "exists(null)"}, + { 1, 1, NULL, "exists(null-but-true)"}, }; - int i; - hts_expr_val_t r; + int i, res = 0; + hts_expr_val_t r = HTS_EXPR_VAL_INIT; for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { hts_filter_t *filt = hts_filter_init(tests[i].str); if (!filt) return 1; - if (hts_filter_eval(filt, NULL, lookup, &r)) { + if (hts_filter_eval2(filt, NULL, lookup, &r)) { fprintf(stderr, "Failed to parse filter string %s\n", tests[i].str); - return 1; + res = 1; + hts_filter_free(filt); + continue; } - if (r.is_str && (strcmp(r.s.s, tests[i].sval) != 0 - || r.d != tests[i].dval)) { - fprintf(stderr, "Failed test: %s == %s, got %s, %f\n", - tests[i].str, tests[i].sval, r.s.s, r.d); - return 1; - } else if (!r.is_str && r.d != tests[i].dval) { - fprintf(stderr, "Failed test: %s == %f, got %f\n", - tests[i].str, tests[i].dval, r.d); - return 1; + if (!hts_expr_val_exists(&r)) { + if (r.is_true != tests[i].truth_val || + !cmpfloat(r.d, tests[i].dval)) { + fprintf(stderr, + "Failed test: \"%s\" == \"%f\", got %s, \"%s\", %f\n", + tests[i].str, tests[i].dval, + r.is_true ? "true" : "false", r.s.s, r.d); + res = 1; + } + } else if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0 + || !cmpfloat(r.d, tests[i].dval) + || r.is_true != tests[i].truth_val)) { + fprintf(stderr, + "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n", + tests[i].str, tests[i].sval, + r.is_true ? "true" : "false", r.s.s, r.d); + res = 1; + } else if (!r.is_str && (!cmpfloat(r.d, tests[i].dval) + || r.is_true != tests[i].truth_val)) { + fprintf(stderr, "Failed test: %s == %f, got %s, %f\n", + tests[i].str, tests[i].dval, + r.is_true ? "true" : "false", r.d); + res = 1; } hts_expr_val_free(&r); hts_filter_free(filt); } - return 0; + return res; } int main(int argc, char **argv) { if (argc > 1) { - hts_expr_val_t v; + hts_expr_val_t v = HTS_EXPR_VAL_INIT; hts_filter_t *filt = hts_filter_init(argv[1]); - if (hts_filter_eval(filt, NULL, lookup, &v)) + if (hts_filter_eval2(filt, NULL, lookup, &v)) return 1; + printf("%s\t", v.is_true ? "true":"false"); + if (v.is_str) puts(v.s.s); else diff --git a/test/test_mod.c b/test/test_mod.c index aade3733c..3facf5dba 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -1,6 +1,6 @@ /* test/test_mod.c -- testing of base modification functions - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. Author: James Bonfield @@ -22,6 +22,52 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* +This tests multiple APIs. The simplest is to parse the MM/ML tags with +bam_parse_basemod and then call bam_mods_at_next_pos once for each base in +the bam sequence to check for modifications. + +Ie: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + for (i = 0; i < b->core.l_qseq; i++) { + n = bam_mods_at_next_pos(b, m, mods, 5); + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at seq pos 'i'. + // mods[j].modified_base holds the base mod itself, with + // mods[j].canonical_base, mods[j].strand and mods[j].qual + // also present in hts_base_mod struct. + // ... + } + } + hts_base_mod_state_free(m); + +The extended mode has the same loop above, but calls bam_mods_query_type +to return additional meta-data including the strand, canonical base and +whether the base modification is recorded implicitly or explicitly: + + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + +Looping over every base in the sequence is not particularly efficient +however unless this fits your natural processing order. The alternative +is to call bam_next_base_mod to iterate only over modified locations: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at sequence position 'pos' + } + } + hts_base_mod_state_free(m); + +*/ + #include #include @@ -41,6 +87,14 @@ static char *code(int id) { int main(int argc, char **argv) { char out[1024] = {0}; + int extended = 0; + + if (argc > 1 && strcmp(argv[1], "-x") == 0) { + extended = 1; + argv++; + argc--; + } + if (argc < 2) return 1; @@ -69,12 +123,31 @@ int main(int argc, char **argv) { n = bam_mods_at_next_pos(b, m, mods, 5); lp += sprintf(lp, "%d\t%c\t", i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); - for (j = 0; j < n && j < 5; j++) - lp += sprintf(lp, "%c%c%s%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - mods[j].qual); + for (j = 0; j < n && j < 5; j++) { + if (extended) { + int m_strand, m_implicit; + char m_canonical; + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + if (ret < 0 || + m_canonical != mods[j].canonical_base || + m_strand != mods[j].strand) + goto err; + lp += sprintf(lp, "%c%c%s%c%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + mods[j].qual); + } else { + lp += sprintf(lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); + } + } *lp++ = '\n'; *lp++ = 0; @@ -88,17 +161,27 @@ int main(int argc, char **argv) { bam_parse_basemod(b, m); + // List possible mod choices. + int *all_mods; + int all_mods_n = 0; + all_mods = bam_mods_recorded(m, &all_mods_n); + printf("Present:"); + for (i = 0; i < all_mods_n; i++) + printf(all_mods[i] > 0 ? " %c" : " #%d", all_mods[i]); + putchar('\n'); + int pos; while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { char line[8192]={0}, *lp = line; lp += sprintf(lp, "%d\t%c\t", pos, seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); - for (j = 0; j < n && j < 5; j++) + for (j = 0; j < n && j < 5; j++) { lp += sprintf(lp, "%c%c%s%d ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), mods[j].qual); + } *lp++ = '\n'; *lp++ = 0; diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c new file mode 100644 index 000000000..0e0512988 --- /dev/null +++ b/test/test_time_funcs.c @@ -0,0 +1,125 @@ +/* test_time_compat.c -- Test time functions + + Copyright (C) 2022 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../hts_time_funcs.h" + +int test_normalised(time_t start, time_t end, time_t incr) { + time_t i, j; + struct tm *utc; + + for (i = start; i < end; i += incr) { + utc = gmtime(&i); + j = hts_time_gm(utc); + if (i != j) { + fprintf(stderr, + "hts_time_gm() failed, got %"PRId64" expected %"PRId64"\n", + (int64_t) j, (int64_t) i); + return 1; + } + } + return 0; +} + +int test_specific(int year, int mon, int mday, int hour, int min, int sec, + time_t expected) { + struct tm utc = { sec, min, hour, mday, mon - 1, year - 1900, 0, 0, 0 }; + time_t res = hts_time_gm(&utc); + if (res != expected) { + fprintf(stderr, + "hts_time_gm() failed for %4d/%02d/%02d %02d:%02d:%02d :" + " got %"PRId64" expected %"PRId64"\n", + year, mon, mday, hour, min, sec, + (int64_t) res, (int64_t) expected); + return 1; + } + return 0; +} + +int main(int argc, char **argv) { + int res = 0; + + if (test_normalised(0, INT_MAX - 1000, 1000) != 0) + return EXIT_FAILURE; + if (sizeof(time_t) >= 8) { + if (test_normalised(INT_MAX - 1000, + (time_t)((int64_t) INT_MAX * 2), 1000) != 0) + return EXIT_FAILURE; + } + + // 2022-06-14 12:32:10 + res |= test_specific(2022, 6, 14, 12, 32, 10, 1655209930); + // 2022-06-14 12:32:10 + res |= test_specific(1993, 9, 10514, 12, 32, 10, 1655209930); + // 2022-02-28 12:00:00 + res |= test_specific(2020, 2, 28, 12, 0, 0, 1582891200); + // 2022-02-29 12:00:00 + res |= test_specific(2020, 2, 29, 12, 0, 0, 1582977600); + // 2022-03-01 12:00:00 + res |= test_specific(2020, 2, 30, 12, 0, 0, 1583064000); + // 2022-02-29 12:00:00 + res |= test_specific(2020, 3, 0, 12, 0, 0, 1582977600); + // 2020-02-01 12:00:00 + res |= test_specific(2019, 14, 1, 12, 0, 0, 1580558400); + // 2020-03-01 12:00:00 + res |= test_specific(2019, 15, 1, 12, 0, 0, 1583064000); + // 2021-03-01 12:00:00 + res |= test_specific(2019, 27, 1, 12, 0, 0, 1614600000); + // 2024-02-01 12:00:00 + res |= test_specific(2019, 62, 1, 12, 0, 0, 1706788800); + // 2024-03-01 12:00:00 + res |= test_specific(2019, 63, 1, 12, 0, 0, 1709294400); + // 2020-12-31 23:59:59 + res |= test_specific(2021, 0, 31, 23, 59, 59, 1609459199); + // 2020-03-01 12:00:00 + res |= test_specific(2021, -9, 1, 12, 0, 0, 1583064000); + // 2020-02-01 12:00:00 + res |= test_specific(2021, -10, 1, 12, 0, 0, 1580558400); + // 2019-02-01 12:00:00 + res |= test_specific(2021, -22, 1, 12, 0, 0, 1549022400); + // 1970-01-01 00:00:00 + res |= test_specific(1970, 1, 1, 0, 0, 0, 0); + // 2038-01-19 03:14:07 + res |= test_specific(1970, 1, 1, 0, 0, INT_MAX, INT_MAX); + // 2038-01-19 03:14:07 + res |= test_specific(2038, 1, 19, 3, 14, 7, INT_MAX); + if (sizeof(time_t) < 8) { + // 2038-01-19 03:14:08 + res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) -1); + } else { + // 2038-01-19 03:14:08 + res |= test_specific(2038, 1, 19, 3, 14, 8, + (time_t)((int64_t) INT_MAX + 1)); + } + + return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/textutils_internal.h b/textutils_internal.h index 4b120bdbc..1ad096494 100644 --- a/textutils_internal.h +++ b/textutils_internal.h @@ -65,9 +65,11 @@ typedef struct hts_json_token hts_json_token; /// Allocate an empty JSON token structure, for use with hts_json_* functions /** @return An empty token on success; NULL on failure */ +HTSLIB_EXPORT hts_json_token *hts_json_alloc_token(void); /// Free a JSON token +HTSLIB_EXPORT void hts_json_free_token(hts_json_token *token); /// Accessor function to get JSON token type @@ -85,6 +87,7 @@ as follows: - `!` other errors (e.g. out of memory) - `\0` terminator at end of input */ +HTSLIB_EXPORT char hts_json_token_type(hts_json_token *token); /// Accessor function to get JSON token in string form @@ -98,6 +101,7 @@ will point at the kstring_t buffer passed as the third parameter to hts_json_fnext(). In that case, the value will only be valid until the next call to hts_json_fnext(). */ +HTSLIB_EXPORT char *hts_json_token_str(hts_json_token *token); /// Read one JSON token from a string @@ -111,6 +115,7 @@ is modified by having token-terminating characters overwritten as NULs. The `state` argument records the current position within `str` after each `hts_json_snext()` call, and should be set to 0 before the first call. */ +HTSLIB_EXPORT char hts_json_snext(char *str, size_t *state, hts_json_token *token); /// Read and discard a complete JSON value from a string @@ -123,6 +128,7 @@ char hts_json_snext(char *str, size_t *state, hts_json_token *token); Skips a complete JSON value, which may be a single token or an entire object or array. */ +HTSLIB_EXPORT char hts_json_sskip_value(char *str, size_t *state, char type); struct hFILE; @@ -137,6 +143,7 @@ The `kstr` buffer is used to store the string value of the token read, so `token->str` is only valid until the next time `hts_json_fnext()` is called with the same `kstr` argument. */ +HTSLIB_EXPORT char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); /// Read and discard a complete JSON value from a file @@ -148,6 +155,7 @@ char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); Skips a complete JSON value, which may be a single token or an entire object or array. */ +HTSLIB_EXPORT char hts_json_fskip_value(struct hFILE *fp, char type); // The functions operate on ints such as are returned by fgetc(), diff --git a/vcf.c b/vcf.c index d83c30e3f..56af63054 100644 --- a/vcf.c +++ b/vcf.c @@ -930,6 +930,8 @@ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, co } else if ( type==BCF_HL_STR ) { + if (!str_class) + return NULL; for (i=0; inhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; @@ -1411,6 +1413,7 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2; v->rid = le_to_i32(x + 8); v->pos = le_to_u32(x + 12); + if ( v->pos==UINT32_MAX ) v->pos = -1; // this is for telomere coordinate, e.g. MT:0 v->rlen = le_to_i32(x + 16); v->qual = le_to_float(x + 20); v->n_info = le_to_u16(x + 24); @@ -1596,7 +1599,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { err |= BCF_ERR_TAG_UNDEF; } if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared; - if (((1 << type) & is_valid_type) == 0) { + if (((1 << type) & is_valid_type) == 0 + || (type == BCF_BT_NULL && num > 0)) { if (!reports++ || hts_verbose >= HTS_LOG_DEBUG) hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type)); err |= BCF_ERR_TAG_INVALID; @@ -1620,7 +1624,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { err |= BCF_ERR_TAG_UNDEF; } if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv; - if (((1 << type) & is_valid_type) == 0) { + if (((1 << type) & is_valid_type) == 0 + || (type == BCF_BT_NULL && num > 0)) { bcf_record_check_err(hdr, rec, "type", &reports, type); err |= BCF_ERR_TAG_INVALID; } @@ -2232,10 +2237,12 @@ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) } while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros int ret; - if ( fp->format.compression!=no_compression ) + if ( fp->format.compression!=no_compression ) { ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l); - else + if (bgzf_flush(fp->fp.bgzf) != 0) return -1; + } else { ret = hwrite(fp->fp.hfile, htxt.s, htxt.l); + } free(htxt.s); return ret<0 ? -1 : 0; } @@ -3407,10 +3414,13 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) fp->line.l = 0; if (vcf_format1(h, v, &fp->line) != 0) return -1; - if ( fp->format.compression!=no_compression ) + if ( fp->format.compression!=no_compression ) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); - else + } else { ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); + } if (fp->idx) { int tid; @@ -4172,19 +4182,26 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t return; } + // Catch "joined before" breakend case + if ( alt[0]==']' || alt[0] == '[' ) + { + var->type = VCF_BND; return; + } + + // Iterate through alt characters that match the reference const char *r = ref, *a = alt; while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed if ( *a && !*r ) { - if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } + if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend while ( *a ) a++; - var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return; } else if ( *r && !*a ) { while ( *r ) r++; - var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return; } else if ( !*r && !*a ) { @@ -4199,13 +4216,13 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t { if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } var->n = -(re-r); - if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; } + if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; } var->type = VCF_OTHER; return; } else if ( re==r ) { var->n = ae-a; - if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; } + if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; } var->type = VCF_OTHER; return; } @@ -4221,7 +4238,10 @@ static int bcf_set_variant_types(bcf1_t *b) bcf_dec_t *d = &b->d; if ( d->n_var < b->n_allele ) { - d->var = (bcf_variant_t *) realloc(d->var, sizeof(bcf_variant_t)*b->n_allele); + bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele); + if (!new_var) + return -1; + d->var = new_var; d->n_var = b->n_allele; } int i; @@ -4237,15 +4257,80 @@ static int bcf_set_variant_types(bcf1_t *b) return 0; } +// bcf_get_variant_type/bcf_get_variant_types should only return the following, +// to be compatible with callers that are not expecting newer values +// like VCF_INS, VCF_DEL. The full set is available from the newer +// vcf_has_variant_type* interfaces. +#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP) int bcf_get_variant_types(bcf1_t *rec) { - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var_type; + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) { + hts_log_error("Couldn't get variant types: %s", strerror(errno)); + exit(1); // Due to legacy API having no way to report failures + } + } + return rec->d.var_type & ORIG_VAR_TYPES; } + int bcf_get_variant_type(bcf1_t *rec, int ith_allele) { - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var[ith_allele].type; + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) { + hts_log_error("Couldn't get variant types: %s", strerror(errno)); + exit(1); // Due to legacy API having no way to report failures + } + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) { + hts_log_error("Requested allele outside valid range"); + exit(1); + } + return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; +} +#undef ORIG_VAR_TYPES + +int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask) +{ + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return -1; + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1; + if (bitmask == VCF_REF) { // VCF_REF is 0, so handled as a special case + return rec->d.var[ith_allele].type == VCF_REF; + } + return bitmask & rec->d.var[ith_allele].type; +} + +int bcf_variant_length(bcf1_t *rec, int ith_allele) +{ + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing; + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing; + return rec->d.var[ith_allele].n; +} + +int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, + enum bcf_variant_match mode) +{ + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return -1; + } + uint32_t type = rec->d.var_type; + if ( mode==bcf_match_overlap ) return bitmask & type; + + // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may + // ask for say `VCF_INS` or `VCF_INDEL` only + if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL; + else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL); + + if ( mode==bcf_match_subset ) + { + if ( ~bitmask & type ) return 0; + else return bitmask & type; + } + // mode == bcf_match_exact + return type==bitmask ? type : 0; } int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) diff --git a/version.sh b/version.sh index 6e6eff016..89d57fcf6 100755 --- a/version.sh +++ b/version.sh @@ -3,7 +3,7 @@ # # Author : James Bonfield # -# Copyright (C) 2017-2018 Genome Research Ltd. +# Copyright (C) 2017-2018, 2021 Genome Research Ltd. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.15.1 +VERSION=1.16 # If we have a git clone, then check against the current tag srcdir=${0%/version.sh}