Skip to content

Commit

Permalink
Release 1.7: long CIGAR support, multi-region iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
daviesrob committed Jan 26, 2018
2 parents 746549d + 1b2c3b4 commit 209f94b
Show file tree
Hide file tree
Showing 52 changed files with 2,536 additions and 475 deletions.
4 changes: 0 additions & 4 deletions .appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@ version: 'vers.{build}'

# branches to build
branches:
# Whitelist
only:
- develop

# Blacklist
except:
- gh-pages
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,12 @@ lib*.so.*
/test/sam
/test/tabix/*.tmp.*
/test/tabix/FAIL*
/test/test-bcf-sr
/test/test-bcf-translate
/test/test_bgzf
/test/test-regidx
/test/test-vcf-api
/test/test-vcf-sweep
/test/test-bcf-sr
/test/test_view
/test/thrash_threads[1-6]
/test/*.tmp
Expand Down
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ addons:

# For MacOSX systems
before_install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update ; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install xz; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" && "$USE_CONFIG" == "no" ]]; then HOMEBREW_NO_AUTO_UPDATE=1 brew install xz || ( brew update && brew install xz ); fi

script:
- if test "$USE_CONFIG" = "yes" ; then autoreconf && ./configure ; fi && make -e && make test
11 changes: 7 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,9 @@ cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h $(
cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h)
cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) cram/string_alloc.h $(htslib_khash_h)
cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h
hfile_internal_h = hfile_internal.h $(htslib_hfile_h)
hts_internal_h = hts_internal.h $(htslib_hts_h)
hfile_internal_h = hfile_internal.h $(htslib_hfile_h) $(textutils_internal_h)
hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h)
textutils_internal_h = textutils_internal.h $(htslib_kstring_h)
thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h)


Expand All @@ -195,6 +196,7 @@ config.h:
echo '/* Default config.h generated by Makefile */' > $@
echo '#define HAVE_LIBBZ2 1' >> $@
echo '#define HAVE_LIBLZMA 1' >> $@
echo '#define HAVE_LZMA_H 1' >> $@
echo '#define HAVE_FSEEKO 1' >> $@
echo '#define HAVE_DRAND48 1' >> $@

Expand Down Expand Up @@ -293,8 +295,9 @@ hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts
hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h)
hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h)
hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hts_internal_h) $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(hfile_internal_h) $(htslib_hfile_h) version.h $(hts_internal_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h)
hts_os.o hts_os.pico: hts_os.c config.h os/rand.c
vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
sam.o sam.pico: sam.c config.h $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) $(htslib_hts_endian_h)
tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(hts_internal_h) $(htslib_khash_h)
Expand All @@ -317,7 +320,7 @@ cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h)
cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(htslib_hts_endian_h)
cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h)
cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h)
cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h)
cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h)
cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h)
cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h)
cram/files.o cram/files.pico: cram/files.c config.h $(cram_misc_h)
Expand Down
42 changes: 42 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,45 @@
Noteworthy changes in release 1.7 (26th January 2018)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* BAM: HTSlib now supports BAMs which include CIGARs with more than
65535 operations as per HTS-Specs 18th November (dab57f4 and 2f915a8).

* BCF/VCF:
- Removed the need for long double in pileup calculations.
- Sped up the synced reader in some situations.
- Bug fixing: removed memory leak in bcf_copy.

* CRAM:
- Added support for HTS_IDX_START in cram iterators.
- Easier to build when lzma header files are absent.
- Bug fixing: a region query with REQUIRED_FIELDS option to
disable sequence retrieval now gives correct results.
- Bug fixing: stop queries to regions starting after the last
read on a chromosome from incorrectly reporting errors
(#651, #653; reported by Imran Haque and @egafni via pysam).

* Multi-region iterator: The new structure takes a list of regions and
iterates over all, deduplicating reads in the process, and producing a
full list of file offset intervals. This is usually much faster than
repeatedly using the old single-region iterator on a series of regions.

* Curl improvements:
- Add Bearer token support via HTS_AUTH_LOCATION env (#600).
- Use CURL_CA_BUNDLE environment variable to override the CA (#622;
thanks to Garret Kelly & David Alexander).
- Speed up (removal of excessive waiting) for both http(s) and ftp.
- Avoid repeatedly reconnecting by removal of unnecessary seeks.
- Bug fixing: double free when libcurl_open fails.

* BGZF block caching, if enabled, now performs far better (#629; reported
by Ram Yalamanchili).

* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas
Hickman).

* Tidied up the drand48 support (intended for systems that do not
provide this function).

Noteworthy changes in release 1.6 (28th September 2017)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
55 changes: 34 additions & 21 deletions bcf_sr_sort.c
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,20 @@ char *grp_create_key(sr_sort_t *srt)
}
return ret;
}
int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
{
hts_expand(int,idx+1,srt->mactive,srt->active);
srt->nactive = 1;
srt->active[srt->nactive - 1] = idx;
return 0;
}
int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
{
hts_expand(int,idx+1,srt->mactive,srt->active);
srt->nactive++;
srt->active[srt->nactive - 1] = idx;
return 0;
}
static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
{
if ( !srt->grp_str2int )
Expand Down Expand Up @@ -357,11 +371,11 @@ static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
memset(&grp,0,sizeof(grp_t));

// group VCFs into groups, each with a unique combination of variants in the duplicate lines
int ireader,ivar,irec,igrp,ivset;
for (ireader=0; ireader<readers->nreaders; ireader++)
int ireader,ivar,irec,igrp,ivset,iact;
for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
for (iact=0; iact<srt->nactive; iact++)
{
srt->vcf_buf[ireader].nrec = 0;

ireader = srt->active[iact];
bcf_sr_t *reader = &readers->readers[ireader];
int rid = bcf_hdr_name2id(reader->header, chr);
grp.nvar = 0;
Expand Down Expand Up @@ -546,23 +560,8 @@ static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
{
int i,j;
if ( readers->nreaders == 1 )
{
srt->nsr = 1;
if ( !srt->msr )
{
srt->vcf_buf = (vcf_buf_t*) calloc(1,sizeof(vcf_buf_t)); // first time here
srt->msr = 1;
}
bcf_sr_t *reader = &readers->readers[0];
assert( reader->buffer[1]->pos==min_pos );
bcf1_t *tmp = reader->buffer[0];
for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
reader->buffer[ reader->nbuffer ] = tmp;
reader->nbuffer--;
readers->has_line[0] = 1;
return 1;
}
assert( srt->nactive>0 );

if ( srt->nsr != readers->nreaders )
{
srt->sr = readers;
Expand All @@ -575,6 +574,19 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int mi
srt->nsr = readers->nreaders;
srt->chr = NULL;
}
if ( srt->nactive == 1 )
{
if ( readers->nreaders>1 )
memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
bcf_sr_t *reader = &readers->readers[srt->active[0]];
assert( reader->buffer[1]->pos==min_pos );
bcf1_t *tmp = reader->buffer[0];
for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
reader->buffer[ reader->nbuffer ] = tmp;
reader->nbuffer--;
readers->has_line[srt->active[0]] = 1;
return 1;
}
if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);

if ( !srt->vcf_buf[0].nrec ) return 0;
Expand Down Expand Up @@ -629,6 +641,7 @@ sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
}
void bcf_sr_sort_destroy(sr_sort_t *srt)
{
free(srt->active);
if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
int i;
Expand Down
3 changes: 3 additions & 0 deletions bcf_sr_sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,14 @@ typedef struct
const char *chr;
int pos, nsr, msr;
int pair;
int nactive, mactive, *active; // list of readers with lines at the current pos
}
sr_sort_t;

sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt);
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int pos);
int bcf_sr_sort_set_active(sr_sort_t *srt, int i);
int bcf_sr_sort_add_active(sr_sort_t *srt, int i);
void bcf_sr_sort_destroy(sr_sort_t *srt);
void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i);

Expand Down
70 changes: 52 additions & 18 deletions bgzf.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,16 @@ typedef struct {
uint8_t *block;
int64_t end_offset;
} cache_t;

#include "htslib/khash.h"
KHASH_MAP_INIT_INT64(cache, cache_t)
#endif

struct bgzf_cache_t {
khash_t(cache) *h;
khint_t last_pos;
};

#ifdef BGZF_MT

typedef struct bgzf_job {
Expand Down Expand Up @@ -215,7 +221,16 @@ static BGZF *bgzf_read_init(hFILE *hfpr)
fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b);
fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;
#ifdef BGZF_CACHE
fp->cache = kh_init(cache);
if (!(fp->cache = malloc(sizeof(*fp->cache)))) {
free(fp);
return NULL;
}
if (!(fp->cache->h = kh_init(cache))) {
free(fp->cache);
free(fp);
return NULL;
}
fp->cache->last_pos = 0;
#endif
return fp;
}
Expand Down Expand Up @@ -524,27 +539,27 @@ static int check_header(const uint8_t *header)
static void free_cache(BGZF *fp)
{
khint_t k;
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
if (fp->is_write) return;
khash_t(cache) *h = fp->cache->h;
for (k = kh_begin(h); k < kh_end(h); ++k)
if (kh_exist(h, k)) free(kh_val(h, k).block);
kh_destroy(cache, h);
free(fp->cache);
}

static int load_block_from_cache(BGZF *fp, int64_t block_address)
{
khint_t k;
cache_t *p;

khash_t(cache) *h = (khash_t(cache)*)fp->cache;
khash_t(cache) *h = fp->cache->h;
k = kh_get(cache, h, block_address);
if (k == kh_end(h)) return 0;
p = &kh_val(h, k);
if (fp->block_length != 0) fp->block_offset = 0;
fp->block_address = block_address;
fp->block_length = p->size;
// FIXME: why BGZF_MAX_BLOCK_SIZE and not p->size?
memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE);
memcpy(fp->uncompressed_block, p->block, p->size);
if ( hseek(fp->fp, p->end_offset, SEEK_SET) < 0 )
{
// todo: move the error up
Expand All @@ -557,29 +572,48 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address)
static void cache_block(BGZF *fp, int size)
{
int ret;
khint_t k;
khint_t k, k_orig;
uint8_t *block = NULL;
cache_t *p;
//fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address);
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
khash_t(cache) *h = fp->cache->h;
if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
if (fp->block_length < 0 || fp->block_length > BGZF_MAX_BLOCK_SIZE) return;
if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
/* A better way would be to remove the oldest block in the
* cache, but here we remove a random one for simplicity. This
* should not have a big impact on performance. */
for (k = kh_begin(h); k < kh_end(h); ++k)
if (kh_exist(h, k)) break;
if (k < kh_end(h)) {
free(kh_val(h, k).block);
/* Remove uniformly from any position in the hash by a simple
* round-robin approach. An alternative strategy would be to
* remove the least recently accessed block, but the round-robin
* removal is simpler and is not expected to have a big impact
* on performance */
if (fp->cache->last_pos >= kh_end(h)) fp->cache->last_pos = kh_begin(h);
k_orig = k = fp->cache->last_pos;
if (++k >= kh_end(h)) k = kh_begin(h);
while (k != k_orig) {
if (kh_exist(h, k))
break;
if (++k == kh_end(h))
k = kh_begin(h);
}
fp->cache->last_pos = k;

if (k != k_orig) {
block = kh_val(h, k).block;
kh_del(cache, h, k);
}
} else {
block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
}
if (!block) return;
k = kh_put(cache, h, fp->block_address, &ret);
if (ret == 0) return; // if this happens, a bug!
if (ret <= 0) { // kh_put failed, or in there already (shouldn't happen)
free(block);
return;
}
p = &kh_val(h, k);
p->size = fp->block_length;
p->end_offset = fp->block_address + size;
p->block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
p->block = block;
memcpy(p->block, fp->uncompressed_block, p->size);
}
#else
static void free_cache(BGZF *fp) {}
Expand Down Expand Up @@ -1489,7 +1523,7 @@ int bgzf_close(BGZF* fp)

void bgzf_set_cache_size(BGZF *fp, int cache_size)
{
if (fp) fp->cache_size = cache_size;
if (fp && fp->cache) fp->cache_size = cache_size;
}

int bgzf_check_EOF(BGZF *fp) {
Expand Down
4 changes: 2 additions & 2 deletions bgzip.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* bgzip.c -- Block compression/decompression utility.
Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
Copyright (C) 2010, 2013-2017 Genome Research Ltd.
Copyright (C) 2010, 2013-2018 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -130,7 +130,7 @@ int main(int argc, char **argv)
case 1:
printf(
"bgzip (htslib) %s\n"
"Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
"Copyright (C) 2018 Genome Research Ltd.\n", hts_version());
return EXIT_SUCCESS;
case 'h':
case '?': return bgzip_main_usage();
Expand Down
Loading

0 comments on commit 209f94b

Please sign in to comment.