Skip to content

Commit 23e3e33

Browse files
Improve testing
1 parent b441995 commit 23e3e33

File tree

4 files changed

+97
-49
lines changed

4 files changed

+97
-49
lines changed

lib/tests.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,43 @@ test_encode_plink_single_genotype_vary_a12(void)
957957
}
958958
}
959959

960+
static void
961+
test_encode_plink_all_zeros_instance(size_t num_variants, size_t num_samples)
962+
{
963+
int8_t *genotypes = calloc(num_variants * num_samples, 2);
964+
int8_t *a12 = calloc(num_variants, 2);
965+
char *buf = malloc(num_variants * num_samples / 4);
966+
size_t j;
967+
968+
CU_ASSERT_FATAL(num_samples % 4 == 0);
969+
CU_ASSERT_FATAL(genotypes != NULL);
970+
CU_ASSERT_FATAL(a12 != NULL);
971+
CU_ASSERT_FATAL(buf != NULL);
972+
973+
for (j = 0; j < num_variants; j++) {
974+
a12[2 * j] = 1;
975+
}
976+
/* printf("variants=%d samples=%d\n", (int) num_variants, (int) num_samples); */
977+
vcz_encode_plink(num_variants, num_samples, genotypes, a12, buf);
978+
for (j = 0; j < num_variants * num_samples / 4; j++) {
979+
/* printf("%d %d\n", (int) j, buf[j]); */
980+
CU_ASSERT_EQUAL_FATAL(buf[j], -1);
981+
}
982+
983+
free(genotypes);
984+
free(a12);
985+
free(buf);
986+
}
987+
988+
static void
989+
test_encode_plink_all_zeros(void)
990+
{
991+
test_encode_plink_all_zeros_instance(1, 4);
992+
test_encode_plink_all_zeros_instance(10, 4);
993+
test_encode_plink_all_zeros_instance(1, 400);
994+
test_encode_plink_all_zeros_instance(100, 400);
995+
}
996+
960997
/*=================================================
961998
Test suite management
962999
=================================================
@@ -1073,6 +1110,7 @@ main(int argc, char **argv)
10731110
{ "test_encode_plink_single_genotype_vary_a12",
10741111
test_encode_plink_single_genotype_vary_a12 },
10751112
{ "test_encode_plink_example", test_encode_plink_example },
1113+
{ "test_encode_plink_all_zeros", test_encode_plink_all_zeros },
10761114
{ NULL, NULL },
10771115
};
10781116
return test_main(tests, argc, argv);

lib/vcf_encoder.c

Lines changed: 2 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -998,48 +998,6 @@ vcz_variant_encoder_free(vcz_variant_encoder_t *self)
998998
}
999999
}
10001000

1001-
/* def encode_genotypes(g, allele_1, allele_2): */
1002-
/* # Missing genotype: 01 in PLINK format */
1003-
/* # Homozygous allele 1: 00 in PLINK format */
1004-
/* # Homozygous allele 2: 11 in PLINK format */
1005-
/* # Heterozygous: 10 in PLINK format */
1006-
/* HOM_A1 = 0b00 */
1007-
/* HOM_A2 = 0b11 */
1008-
/* HET = 0b10 */
1009-
/* MISSING = 0b01 */
1010-
1011-
/* num_samples = g.shape[0] */
1012-
/* assert g.shape[1] == 2 */
1013-
/* bytes_per_variant = (num_samples + 3) // 4 */
1014-
/* buff = bytearray(bytes_per_variant) */
1015-
/* for j in range(num_samples): */
1016-
/* byte_idx = j // 4 */
1017-
/* bit_pos = (j % 4) * 2 */
1018-
/* code = MISSING */
1019-
/* a, b = g[j] */
1020-
/* if b == -2: */
1021-
/* # Treated as a haploid call by plink */
1022-
/* if a == allele_1: */
1023-
/* code = HOM_A1 */
1024-
/* elif a == allele_2: */
1025-
/* code = HOM_A2 */
1026-
/* else: */
1027-
/* if a == allele_1: */
1028-
/* if b == allele_1: */
1029-
/* code = HOM_A1 */
1030-
/* elif b == allele_2: */
1031-
/* code = HET */
1032-
/* elif a == allele_2: */
1033-
/* if b == allele_2: */
1034-
/* code = HOM_A2 */
1035-
/* elif b == allele_1: */
1036-
/* code = HET */
1037-
/* if allele_1 == -1 and (code == HOM_A1 or code == HET): */
1038-
/* code = MISSING */
1039-
/* # print("\t", a, b, code) */
1040-
/* mask = ~(0b11 << bit_pos) */
1041-
/* buff[byte_idx] = (buff[byte_idx] & mask) | (code << bit_pos) */
1042-
/* return buff */
10431001
int
10441002
vcz_encode_plink(size_t num_variants, size_t num_samples, const int8_t *genotypes,
10451003
const int8_t *a12_allele, char *buf)
@@ -1087,7 +1045,8 @@ vcz_encode_plink(size_t num_variants, size_t num_samples, const int8_t *genotype
10871045
}
10881046
}
10891047

1090-
/* printf("a=%d b=%d a1=%d a2=%d code = %d\n", a, b, allele_1, allele_2, code); */
1048+
/* printf("a=%d b=%d a1=%d a2=%d code = %d\n", a, b, allele_1, allele_2,
1049+
* code); */
10911050
byte_offset = variant_offset + k / 4;
10921051
bit_pos = (k % 4) * 2;
10931052
mask = ~(0x3 << bit_pos);

tests/test_plink.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from vcztools import plink
55

66

7-
def _encode_genotypes(g, allele_1, allele_2):
7+
def _encode_genotypes_row(g, allele_1, allele_2):
88
# Missing genotype: 01 in PLINK format
99
# Homozygous allele 1: 00 in PLINK format
1010
# Homozygous allele 2: 11 in PLINK format
@@ -57,7 +57,7 @@ def encode_genotypes(G, a12_allele=None):
5757
assert G.shape[2] == 2
5858
buff = bytearray()
5959
for j in range(len(G)):
60-
buff.extend(_encode_genotypes(G[j], *a12_allele[j]))
60+
buff.extend(_encode_genotypes_row(G[j], *a12_allele[j]))
6161
return bytes(buff)
6262

6363

@@ -107,11 +107,46 @@ def test_examples_01_alleles(self, genotypes):
107107
(1, 101),
108108
(1, 101),
109109
(10, 1),
110+
(100, 1),
111+
(10, 2),
112+
(10, 3),
113+
(10, 4),
114+
(10, 5),
115+
(10, 6),
116+
(10, 7),
117+
(10, 8),
118+
(10, 9),
110119
],
111120
)
112-
def test_shapes_01_alleles(self, num_variants, num_samples):
113-
g = np.zeros((num_variants, num_samples, 2), dtype=np.int8)
121+
@pytest.mark.parametrize("value", [-1, 0, 1, 2])
122+
def test_shapes_01_alleles(self, value, num_variants, num_samples):
123+
g = np.zeros((num_variants, num_samples, 2), dtype=np.int8) + value
114124
b1 = encode_genotypes(g)
115125
b2 = plink.encode_genotypes(g)
116126
# assert len(b1) == len(b2)
117127
assert b1 == b2
128+
129+
@pytest.mark.parametrize(
130+
["num_variants", "num_samples"],
131+
[
132+
(1, 4),
133+
(1, 4),
134+
(1, 8),
135+
(1, 16),
136+
(1, 32),
137+
(1, 100),
138+
(33, 4),
139+
(33, 4),
140+
(33, 8),
141+
(33, 16),
142+
(33, 32),
143+
(33, 100),
144+
],
145+
)
146+
def test_all_zeros_div_4(self, num_variants, num_samples):
147+
assert num_samples % 4 == 0
148+
g = np.zeros((num_variants, num_samples, 2), dtype=np.int8)
149+
b1 = encode_genotypes(g)
150+
b2 = plink.encode_genotypes(g)
151+
assert b1 == b2
152+
assert b1 == bytearray(0xFF for _ in range(num_variants * num_samples // 4))

vcztools/plink.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
Convert VCZ to plink 1 binary format.
33
"""
44

5+
import time
6+
57
import numpy as np
68
import pandas as pd
79
import zarr
@@ -117,11 +119,25 @@ def _write_genotypes(self):
117119
with open(self.bed_path, "wb") as bed_file:
118120
bed_file.write(bytes([0x6C, 0x1B, 0x01]))
119121
for v_chunk in range(call_genotype.cdata_shape[0]):
122+
before = time.perf_counter()
120123
G = call_genotype.blocks[v_chunk]
124+
duration = time.perf_counter() - before
125+
print(
126+
f"Decoded genotypes {G.nbytes / 1024**2:.2f}MiB in {duration:.2g}s"
127+
)
128+
129+
before = time.perf_counter()
121130
a12 = self._compute_alleles(G, variant_allele.blocks[v_chunk])
122-
print("Got a12")
131+
duration = time.perf_counter() - before
132+
133+
print(f"a12 alleles in {duration:.2g}s")
134+
before = time.perf_counter()
123135
buff = encode_genotypes(G, a12)
124-
print("Genotypes")
136+
duration = time.perf_counter() - before
137+
print(
138+
f"encoded in {duration:.2g}s @ {(G.nbytes / 1024**2) / duration:.2f}MiB/s"
139+
)
140+
125141
bed_file.write(buff)
126142
a12_allele.blocks[v_chunk] = a12
127143
return a12_allele[:]

0 commit comments

Comments
 (0)