Skip to content

Commit 9913ead

Browse files
Merge pull request #9 from jeromekelleher/handle-full-missing-fields
Handle full missing fields
2 parents 8db9718 + 69bd48e commit 9913ead

File tree

6 files changed

+323
-98
lines changed

6 files changed

+323
-98
lines changed

lib/tests.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -121,16 +121,16 @@ test_variant_encoder_minimal(void)
121121
const char alt_data[] = "TC";
122122
const int32_t qual_data[] = { 1000, 12 };
123123
const char filter_data[] = "PASSPASS";
124-
const int32_t an_data[] = { 8, 9 };
125-
const char* aa_data = "GT";
124+
const int32_t an_data[] = { -1, 9 };
125+
const char* aa_data = "G.";
126126
const int32_t gt_data[] = { 0, 0, 0, 1, 1, 1, 1, 0 };
127127
const int8_t gt_phased_data[] = { 0, 1, 1, 0 };
128-
const int32_t hq_data[] = { 10, 15, 7, 12, 8, 9, 10, 11 };
128+
const int32_t hq_data[] = { 10, 15, 7, 12, -1, -1, -1, -1};
129129
int ret, j;
130130
vcz_variant_encoder_t writer;
131131
const char *expected[] = {
132-
"X\t123\tRS1\tA\tT\t1000\tPASS\tAN=8;AA=G\tFORMAT=GT:HQ\t0/0:10,15\t0|1:7,12",
133-
"YY\t45678\tRS2\tG\tC\t12\tPASS\tAN=9;AA=T\tFORMAT=GT:HQ\t1|1:8,9\t1/0:10,11",
132+
"X\t123\tRS1\tA\tT\t1000\tPASS\tAA=G\tGT:HQ\t0/0:10,15\t0|1:7,12",
133+
"YY\t45678\tRS2\tG\tC\t12\tPASS\tAN=9\tGT\t1|1\t1/0",
134134
};
135135
char buf[1000];
136136

@@ -149,7 +149,7 @@ test_variant_encoder_minimal(void)
149149

150150
printf("\n");
151151
vcz_variant_encoder_print_state(&writer, _devnull);
152-
vcz_variant_encoder_print_state(&writer, stdout);
152+
/* vcz_variant_encoder_print_state(&writer, stdout); */
153153

154154
for (j = 0; j < num_rows; j++) {
155155
ret = vcz_variant_encoder_write_row(&writer, j, buf, 1000);

lib/vcf_encoder.c

Lines changed: 210 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,32 @@ vcz_itoa(int32_t value, char *buf)
5555
return p;
5656
}
5757

58+
static bool
59+
int32_all_missing(const int32_t *restrict data, size_t n)
60+
{
61+
size_t j;
62+
63+
for (j = 0; j < n; j++) {
64+
if (data[j] != VCZ_INT_FILL && data[j] != VCZ_INT_MISSING) {
65+
return false;
66+
}
67+
}
68+
return true;
69+
}
70+
71+
static bool
72+
string_all_missing(const char *restrict data, size_t item_size, size_t n)
73+
{
74+
size_t j;
75+
76+
for (j = 0; j < n * item_size; j++) {
77+
if (data[j] != VCZ_STRING_FILL && data[j] != VCZ_STRING_MISSING) {
78+
return false;
79+
}
80+
}
81+
return true;
82+
}
83+
5884
static int64_t
5985
string_field_write_entry(
6086
const vcz_field_t *self, const void *data, char *dest, size_t buflen, int64_t offset)
@@ -144,36 +170,46 @@ vcz_field_write(
144170
return vcz_field_write_entry(self, data, dest, buflen, offset);
145171
}
146172

147-
void
148-
vcz_field_print_state(const vcz_field_t *self, FILE *out)
173+
static bool
174+
vcz_info_field_is_missing(const vcz_field_t *self, size_t variant)
149175
{
150-
fprintf(out, "\t%s\ttype:%d\titem_size=%d\tnum_columns=%d\tdata=%p\n", self->name,
151-
self->type, (int) self->item_size, (int) self->num_columns, self->data);
176+
177+
size_t row_size = self->num_columns * self->item_size;
178+
const void *data = self->data + variant * row_size;
179+
180+
if (self->type == VCZ_TYPE_INT) {
181+
if (self->item_size == 4) {
182+
return int32_all_missing(data, self->num_columns);
183+
}
184+
} else if (self->type == VCZ_TYPE_STRING) {
185+
return string_all_missing(data, self->item_size, self->num_columns);
186+
}
187+
assert(false);
188+
return false;
152189
}
153190

154-
int64_t
155-
vcz_variant_encoder_write_format_specifiers(
156-
const vcz_variant_encoder_t *self, char *dest, size_t buflen, int64_t offset)
191+
static bool
192+
vcz_format_field_is_missing(const vcz_field_t *self, size_t variant, size_t num_samples)
157193
{
158-
const int format_len = 7;
159-
size_t j;
194+
size_t row_size = self->num_columns * self->item_size * num_samples;
195+
const void *data = self->data + variant * row_size;
160196

161-
strcpy(dest + offset, "FORMAT=");
162-
offset += format_len;
163-
if (self->gt.data != NULL) {
164-
strcpy(dest + offset, "GT");
165-
offset += 2;
166-
}
167-
for (j = 0; j < self->num_format_fields; j++) {
168-
dest[offset] = ':';
169-
offset++;
170-
strcpy(dest + offset, self->format_fields[j].name);
171-
offset += strlen(self->format_fields[j].name);
197+
if (self->type == VCZ_TYPE_INT) {
198+
if (self->item_size == 4) {
199+
return int32_all_missing(data, self->num_columns * num_samples);
200+
}
201+
} else if (self->type == VCZ_TYPE_STRING) {
202+
return string_all_missing(data, self->item_size, self->num_columns * num_samples);
172203
}
173-
dest[offset] = '\t';
174-
offset++;
175-
dest[offset] = '\0';
176-
return offset;
204+
assert(false);
205+
return false;
206+
}
207+
208+
void
209+
vcz_field_print_state(const vcz_field_t *self, FILE *out)
210+
{
211+
fprintf(out, "\t%s\ttype:%d\titem_size=%d\tnum_columns=%d\tdata=%p\n", self->name,
212+
self->type, (int) self->item_size, (int) self->num_columns, self->data);
177213
}
178214

179215
int64_t
@@ -219,98 +255,194 @@ vcz_variant_encoder_write_sample_gt(const vcz_variant_encoder_t *self, size_t va
219255
return offset;
220256
}
221257

258+
/* int64_t */
259+
/* vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self, */
260+
/* size_t variant, size_t sample, char *dest, size_t buflen, int64_t offset) */
261+
/* { */
262+
/* vcz_field_t field; */
263+
/* size_t j, row_size; */
264+
/* const void *data; */
265+
266+
/* if (self->gt.data != NULL) { */
267+
/* offset = vcz_variant_encoder_write_sample_gt( */
268+
/* self, variant, sample, dest, buflen, offset); */
269+
/* if (offset < 0) { */
270+
/* goto out; */
271+
/* } */
272+
/* } */
273+
274+
/* for (j = 0; j < self->num_format_fields; j++) { */
275+
/* field = self->format_fields[j]; */
276+
/* dest[offset - 1] = ':'; */
277+
/* row_size = self->num_samples * field.num_columns * field.item_size; */
278+
/* data = field.data + variant * row_size */
279+
/* + sample * field.num_columns * field.item_size; */
280+
/* offset = vcz_field_write_entry(&field, data, dest, buflen, offset); */
281+
/* if (offset < 0) { */
282+
/* goto out; */
283+
/* } */
284+
/* } */
285+
/* out: */
286+
/* return offset; */
287+
/* } */
288+
222289
int64_t
223-
vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self,
224-
size_t variant, size_t sample, char *dest, size_t buflen, int64_t offset)
290+
vcz_variant_encoder_write_info_fields(const vcz_variant_encoder_t *self, size_t variant,
291+
char *dest, size_t buflen, int64_t offset)
225292
{
226293
vcz_field_t field;
227-
size_t j, row_size;
228-
const void *data;
229-
230-
if (self->gt.data != NULL) {
231-
offset = vcz_variant_encoder_write_sample_gt(
232-
self, variant, sample, dest, buflen, offset);
233-
if (offset < 0) {
294+
size_t j;
295+
bool *missing = NULL;
296+
bool all_missing = true;
297+
bool first_field;
298+
299+
if (self->num_info_fields > 0) {
300+
missing = malloc(self->num_info_fields * sizeof(*missing));
301+
if (missing == NULL) {
302+
offset = VCZ_ERR_NO_MEMORY;
234303
goto out;
235304
}
305+
for (j = 0; j < self->num_info_fields; j++) {
306+
missing[j] = vcz_info_field_is_missing(&self->info_fields[j], variant);
307+
if (!missing[j]) {
308+
all_missing = false;
309+
}
310+
}
236311
}
237312

238-
for (j = 0; j < self->num_format_fields; j++) {
239-
field = self->format_fields[j];
240-
dest[offset - 1] = ':';
241-
row_size = self->num_samples * field.num_columns * field.item_size;
242-
data = field.data + variant * row_size
243-
+ sample * field.num_columns * field.item_size;
244-
offset = vcz_field_write_entry(&field, data, dest, buflen, offset);
245-
if (offset < 0) {
246-
goto out;
313+
if (all_missing) {
314+
dest[offset] = '.';
315+
offset++;
316+
dest[offset] = '\t';
317+
offset++;
318+
} else {
319+
first_field = true;
320+
for (j = 0; j < self->num_info_fields; j++) {
321+
if (!missing[j]) {
322+
if (!first_field) {
323+
dest[offset - 1] = ';';
324+
}
325+
first_field = false;
326+
field = self->info_fields[j];
327+
memcpy(dest + offset, field.name, field.name_length);
328+
offset += field.name_length;
329+
dest[offset] = '=';
330+
offset++;
331+
offset = vcz_field_write(&field, variant, dest, buflen, offset);
332+
if (offset < 0) {
333+
goto out;
334+
}
335+
}
247336
}
248337
}
249338
out:
339+
if (missing != NULL) {
340+
free(missing);
341+
}
250342
return offset;
251343
}
252344

253-
int64_t
254-
vcz_variant_encoder_write_info_fields(const vcz_variant_encoder_t *self, size_t variant,
255-
char *dest, size_t buflen, int64_t offset)
345+
346+
static int64_t
347+
vcz_variant_encoder_write_format_fields(
348+
const vcz_variant_encoder_t *self, size_t variant, char *buf, size_t buflen, int64_t offset)
256349
{
350+
size_t j, sample, row_size;
257351
vcz_field_t field;
258-
size_t j;
352+
bool *missing = NULL;
353+
bool all_missing = true;
354+
bool has_gt = (self->gt.data != NULL);
355+
bool gt_missing = true;
356+
const size_t num_samples = self->num_samples;
357+
const void *data;
259358

260-
if (self->num_info_fields == 0) {
261-
dest[offset] = '.';
262-
offset++;
263-
dest[offset] = '\t';
264-
offset++;
359+
if (has_gt) {
360+
gt_missing = vcz_format_field_is_missing(&self->gt, variant, num_samples);
265361
}
266-
for (j = 0; j < self->num_info_fields; j++) {
267-
if (j > 0) {
268-
dest[offset - 1] = ';';
269-
}
270-
field = self->info_fields[j];
271-
memcpy(dest + offset, field.name, field.name_length);
272-
offset += field.name_length;
273-
dest[offset] = '=';
274-
offset++;
275-
offset = vcz_field_write(&field, variant, dest, buflen, offset);
276-
if (offset < 0) {
362+
363+
if (self->num_format_fields > 0) {
364+
missing = malloc(self->num_format_fields * sizeof(*missing));
365+
if (missing == NULL) {
366+
offset = VCZ_ERR_NO_MEMORY;
277367
goto out;
278368
}
369+
for (j = 0; j < self->num_format_fields; j++) {
370+
missing[j] = vcz_format_field_is_missing(&self->format_fields[j], variant, num_samples);
371+
if (!missing[j]) {
372+
all_missing = false;
373+
}
374+
}
375+
}
376+
all_missing = all_missing && gt_missing;
377+
378+
if (! all_missing) {
379+
380+
if (!gt_missing) {
381+
strcpy(buf + offset, "GT:");
382+
offset += 3;
383+
}
384+
for (j = 0; j < self->num_format_fields; j++) {
385+
if (!missing[j]) {
386+
strcpy(buf + offset, self->format_fields[j].name);
387+
offset += self->format_fields[j].name_length;
388+
buf[offset] = ':';
389+
offset++;
390+
}
391+
}
392+
buf[offset - 1] = '\t';
393+
394+
for (sample = 0; sample < num_samples; sample++) {
395+
if (!gt_missing) {
396+
offset = vcz_variant_encoder_write_sample_gt(
397+
self, variant, sample, buf, buflen, offset);
398+
if (offset < 0) {
399+
goto out;
400+
}
401+
buf[offset - 1] = ':';
402+
}
403+
for (j = 0; j < self->num_format_fields; j++) {
404+
if (!missing[j]) {
405+
field = self->format_fields[j];
406+
row_size = num_samples * field.num_columns * field.item_size;
407+
data = field.data + variant * row_size
408+
+ sample * field.num_columns * field.item_size;
409+
offset = vcz_field_write_entry(&field, data, buf, buflen, offset);
410+
if (offset < 0) {
411+
goto out;
412+
}
413+
buf[offset - 1] = ':';
414+
}
415+
}
416+
buf[offset - 1] = '\t';
417+
}
279418
}
280419
out:
420+
if (missing != NULL) {
421+
free(missing);
422+
}
281423
return offset;
282424
}
283425

284426
int64_t
285427
vcz_variant_encoder_write_row(
286-
const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen)
428+
const vcz_variant_encoder_t *self, size_t variant, char *buf, size_t buflen)
287429
{
288430
int64_t offset = 0;
289431
size_t j;
290432

291433
for (j = 0; j < VCZ_NUM_FIXED_FIELDS; j++) {
292-
offset = vcz_field_write(&self->fixed_fields[j], row, buf, buflen, offset);
434+
offset = vcz_field_write(&self->fixed_fields[j], variant, buf, buflen, offset);
293435
if (offset < 0) {
294436
goto out;
295437
}
296438
}
297-
offset = vcz_variant_encoder_write_info_fields(self, row, buf, buflen, offset);
439+
offset = vcz_variant_encoder_write_info_fields(self, variant, buf, buflen, offset);
298440
if (offset < 0) {
299441
goto out;
300442
}
301-
if (self->num_samples > 0) {
302-
offset = vcz_variant_encoder_write_format_specifiers(self, buf, buflen, offset);
303-
if (offset < 0) {
304-
goto out;
305-
}
306-
for (j = 0; j < self->num_samples; j++) {
307-
/* printf("Run sample %d\n", (int) j); */
308-
offset = vcz_variant_encoder_write_format_fields(
309-
self, row, j, buf, buflen, offset);
310-
if (offset < 0) {
311-
goto out;
312-
}
313-
}
443+
offset = vcz_variant_encoder_write_format_fields(self, variant, buf, buflen, offset);
444+
if (offset < 0) {
445+
goto out;
314446
}
315447
offset--;
316448
buf[offset] = '\0';

lib/vcf_encoder.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
/*
 * Sentinel values. The encoder's all-missing checks treat both the MISSING
 * and the FILL value of a type as "no data". Negative replacement lists are
 * parenthesized so the macros expand safely inside any expression.
 */
#define VCZ_INT_MISSING (-1)
#define VCZ_INT_FILL (-2)
#define VCZ_STRING_MISSING '.'
#define VCZ_STRING_FILL '\0'

#define VCZ_NUM_FIXED_FIELDS 7
911

0 commit comments

Comments
 (0)