sgkit-dev · jeromekelleher · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/lib/tests.c b/lib/tests.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <stdint.h>
+#include <float.h>
 #include <vcf_encoder.h>
 
 FILE *_devnull;
@@ -25,9 +26,47 @@ test_int_field_1d(void)
 
     for (j = 0; j < num_rows; j++) {
         ret = vcz_field_write(&field, j, buf, 1000, 0);
-        /* printf("%s: %s\n", buf, expected[j]); */
+        /* printf("ret = %d\n", (int)ret); */
+        /* printf("'%.*s': %s\n", (int) ret, buf, expected[j]); */
         CU_ASSERT_EQUAL_FATAL(ret, strlen(expected[j]));
-        CU_ASSERT_STRING_EQUAL(buf, expected[j]);
+        CU_ASSERT_NSTRING_EQUAL(buf, expected[j], ret);
+    }
+}
+
+/* Undersized buffers must make vcz_field_write return VCZ_ERR_BUFFER_OVERFLOW. */
+static void
+test_int_field_1d_overflow(void)
+{
+    const int32_t data[] = { 1, 2, 12345789, -100, INT32_MIN, INT32_MAX, -1 };
+    const size_t num_rows = sizeof(data) / sizeof(*data);
+    vcz_field_t field = { .name = "test",
+        .type = VCZ_TYPE_INT,
+        .item_size = 4,
+        .num_columns = 1,
+        .data = (const char *) data };
+    int64_t ret;
+    size_t j, buflen;
+    char *buf;
+
+    for (j = 0; j < num_rows - 1; j++) {
+        for (buflen = 0; buflen <= VCZ_INT32_BUF_SIZE; buflen++) {
+            /* malloc(0) may legally return NULL, so never request zero bytes */
+            buf = malloc(buflen > 0 ? buflen : 1);
+            CU_ASSERT_FATAL(buf != NULL);
+            ret = vcz_field_write(&field, j, buf, buflen, 0);
+            free(buf);
+            CU_ASSERT_FATAL(ret == VCZ_ERR_BUFFER_OVERFLOW);
+        }
+    }
+    j = num_rows - 1;
+    CU_ASSERT_FATAL(data[j] == -1);
+    /* Missing data is treated differently. Just need 2 bytes for ".\t" */
+    for (buflen = 0; buflen < 2; buflen++) {
+        buf = malloc(buflen > 0 ? buflen : 1);
+        CU_ASSERT_FATAL(buf != NULL);
+        ret = vcz_field_write(&field, j, buf, buflen, 0);
+        free(buf);
+        CU_ASSERT_FATAL(ret == VCZ_ERR_BUFFER_OVERFLOW);
     }
 }
 
@@ -50,7 +89,75 @@ test_int_field_2d(void)
         ret = vcz_field_write(&field, j, buf, 1000, 0);
         CU_ASSERT_EQUAL_FATAL(ret, strlen(expected[j]));
         /* printf("%s: %s\n", buf, expected[j]); */
-        CU_ASSERT_STRING_EQUAL(buf, expected[j]);
+        CU_ASSERT_NSTRING_EQUAL_FATAL(buf, expected[j], ret);
+    }
+}
+
+/* Check the text vcz_field_write emits for a 1D float32 field, incl. a missing value. */
+static void
+test_float_field_1d(void)
+{
+    float data[] = { 1.0f, 2.1f, INT32_MIN, 12345789.0f, -1, -100.123f, 0 };
+
+    const size_t num_rows = sizeof(data) / sizeof(*data);
+    vcz_field_t field = { .name = "test",
+        .type = VCZ_TYPE_FLOAT,
+        .item_size = 4,
+        .num_columns = 1,
+        .data = (const char *) data };
+    char buf[1000];
+    const char *expected[]
+        = { "1\t", "2.1\t", "-2147483648\t", "12345789\t", "-1\t", "-100.123\t", ".\t" };
+    int64_t ret;
+    size_t j;
+    const int32_t missing = VCZ_FLOAT32_MISSING_AS_INT32;
+
+    /* Store the missing-value bit pattern in the last slot; memcpy avoids aliasing UB */
+    memcpy(&data[num_rows - 1], &missing, sizeof(missing));
+
+    for (j = 0; j < num_rows; j++) {
+        ret = vcz_field_write(&field, j, buf, 1000, 0);
+        CU_ASSERT_EQUAL_FATAL(ret, strlen(expected[j]));
+        CU_ASSERT_NSTRING_EQUAL(buf, expected[j], ret);
+    }
+}
+
+/* Undersized buffers must make vcz_field_write return VCZ_ERR_BUFFER_OVERFLOW. */
+static void
+test_float_field_1d_overflow(void)
+{
+    float data[] = { 1.0f, 2.1f, 12345789.0f, (float) M_PI, -1, -100.123f, 0 };
+    const size_t num_rows = sizeof(data) / sizeof(*data);
+    vcz_field_t field = { .name = "test",
+        .type = VCZ_TYPE_FLOAT,
+        .item_size = 4,
+        .num_columns = 1,
+        .data = (const char *) data };
+    int64_t ret;
+    size_t j, buflen;
+    char *buf;
+    const int32_t missing = VCZ_FLOAT32_MISSING_AS_INT32;
+
+    /* Store the missing-value bit pattern in the last slot; memcpy avoids aliasing UB */
+    memcpy(&data[num_rows - 1], &missing, sizeof(missing));
+
+    for (j = 0; j < num_rows - 1; j++) {
+        for (buflen = 0; buflen <= VCZ_FLOAT32_BUF_SIZE; buflen++) {
+            buf = malloc(buflen > 0 ? buflen : 1); /* malloc(0) may return NULL */
+            CU_ASSERT_FATAL(buf != NULL);
+            ret = vcz_field_write(&field, j, buf, buflen, 0);
+            free(buf);
+            CU_ASSERT_FATAL(ret == VCZ_ERR_BUFFER_OVERFLOW);
+        }
+    }
+    j = num_rows - 1;
+    /* Missing data is treated differently. Just need 2 bytes for ".\t" */
+    for (buflen = 0; buflen < 2; buflen++) {
+        buf = malloc(buflen > 0 ? buflen : 1); /* malloc(0) may return NULL */
+        CU_ASSERT_FATAL(buf != NULL);
+        ret = vcz_field_write(&field, j, buf, buflen, 0);
+        free(buf);
+        CU_ASSERT_FATAL(ret == VCZ_ERR_BUFFER_OVERFLOW);
     }
 }
 
@@ -337,7 +444,14 @@ test_ftoa(void)
         {2311380, "2311380"},
         {16777216, "16777216"}, /* Maximum integer value of float */
         {-16777216, "-16777216"},
+        {INT32_MIN, "-2147483648"},
+        {(float) INT32_MAX, "2147483648"},
+        {(float) DBL_MAX, "inf",},
+        {(float) DBL_MIN, "0",},
+        {FLT_MIN, "0",},
         /* TODO test extreme value here, that push against the limits of f32 */
+        // FAILS https://github.com/jeromekelleher/vcztools/issues/21
+        /* {FLT_MAX, "340282346638528859811704183484516925440",}, */
     };
     // clang-format on
     int len;
@@ -346,7 +460,7 @@ test_ftoa(void)
 
     for (j = 0; j < sizeof(cases) / sizeof(*cases); j++) {
         len = vcz_ftoa(buf, cases[j].val);
-        /* printf("j = %d %f->%s=='%s'\n", j, cases[j].val, cases[j].expected, buf); */
+        /* printf("j = %d %f->%s=='%s'\n", (int) j, cases[j].val, cases[j].expected, buf); */
         CU_ASSERT_EQUAL_FATAL(len, strlen(cases[j].expected));
         CU_ASSERT_STRING_EQUAL_FATAL(buf, cases[j].expected);
     }
@@ -441,7 +555,10 @@ main(int argc, char **argv)
         { "test_string_field_1d", test_string_field_1d },
         { "test_string_field_2d", test_string_field_2d },
         { "test_int_field_1d", test_int_field_1d },
+        { "test_int_field_1d_overflow", test_int_field_1d_overflow },
         { "test_int_field_2d", test_int_field_2d },
+        { "test_float_field_1d", test_float_field_1d },
+        { "test_float_field_1d_overflow", test_float_field_1d_overflow },
         { "test_variant_encoder_minimal", test_variant_encoder_minimal },
         { "test_variant_fields_all_missing", test_variant_encoder_fields_all_missing },
         { "test_itoa_small", test_itoa_small },

diff --git a/lib/vcf_encoder.c b/lib/vcf_encoder.c
@@ -8,9 +8,8 @@
 #include <math.h>
 
 int
-vcz_itoa(char *buf, int32_t v)
+vcz_itoa(char *restrict buf, int64_t value)
 {
-    int64_t value = v;
     int p = 0;
     int j, k;
 
@@ -19,6 +18,9 @@ vcz_itoa(char *buf, int32_t v)
         p++;
         value = -value;
     }
+    /* We only support int32_t values. The +1 here is for supporting the
+     * float converter below */
+    assert(value <= (1LL + INT32_MAX));
     /*  special case small values */
     if (value < 10) {
         buf[p] = (char) value + '0';
@@ -42,9 +44,8 @@ vcz_itoa(char *buf, int32_t v)
         } else if (value < 1000000000) {
             k = 8;
         } else if (value < 10000000000) {
+            // Largest possible INT32 value
             k = 9;
-        } else {
-            assert(false);
         }
 
         // iterate backwards in buf
@@ -62,7 +63,7 @@ vcz_itoa(char *buf, int32_t v)
 }
 
 int
-vcz_ftoa(char *buf, float value)
+vcz_ftoa(char *restrict buf, float value)
 {
     int p = 0;
     int64_t i, d1, d2, d3;
@@ -83,7 +84,9 @@ vcz_ftoa(char *buf, float value)
 
     /* integer part */
     i = (int64_t) round(((double) value) * 1000);
-    p += vcz_itoa(buf + p, (int32_t)(i / 1000));
+    /* printf("i = %ld\n", i); */
+    /* printf("i/ 1000 = %ld\n", i / 1000); */
+    p += vcz_itoa(buf + p, i / 1000);
 
     /* fractional part */
     d3 = i % 10;
@@ -107,6 +110,41 @@ vcz_ftoa(char *buf, float value)
     return p;
 }
 
+static inline int64_t
+append_char(char *restrict dest, char c, int64_t offset, int64_t buflen)
+{
+    if (offset >= buflen) { /* full, or offset already past the end: never write */
+        return VCZ_ERR_BUFFER_OVERFLOW;
+    }
+    dest[offset] = c;
+    return offset + 1; /* position at which the next byte should be written */
+}
+
+static inline int64_t
+append_int(char *restrict dest, int32_t value, int64_t offset, int64_t buflen)
+{
+    if (value != VCZ_INT_MISSING) {
+        if (offset + VCZ_INT32_BUF_SIZE >= buflen) { /* demand worst-case room */
+            return VCZ_ERR_BUFFER_OVERFLOW;
+        }
+        return offset + vcz_itoa(dest + offset, value);
+    }
+    return append_char(dest, '.', offset, buflen); /* missing value is a lone '.' */
+}
+
+static inline int64_t
+append_float(char *restrict dest, int32_t int32_value, float value, int64_t offset,
+    int64_t buflen)
+{
+    if (int32_value != VCZ_FLOAT32_MISSING_AS_INT32) {
+        if (offset + VCZ_FLOAT32_BUF_SIZE >= buflen) { /* demand worst-case room */
+            return VCZ_ERR_BUFFER_OVERFLOW;
+        }
+        return offset + vcz_ftoa(dest + offset, value);
+    }
+    return append_char(dest, '.', offset, buflen); /* missing value is a lone '.' */
+}
+
 static bool
 bool_all_missing(const int8_t *restrict data, size_t n)
 {
@@ -192,42 +230,39 @@ bool_field_write_entry(const vcz_field_t *VCZ_UNUSED(self), const void *VCZ_UNUS
 }
 
 static int64_t
-int32_field_write_entry(const vcz_field_t *self, const void *data, char *dest,
-    size_t VCZ_UNUSED(buflen), int64_t offset)
+int32_field_write_entry(const vcz_field_t *self, const void *restrict data, char *dest,
+    int64_t buflen, int64_t offset)
 {
-    const int32_t *source = (const int32_t *) data;
-    int32_t value;
+    const int32_t *restrict source = (const int32_t *) data;
     size_t column;
 
     for (column = 0; column < self->num_columns; column++) {
-        value = source[column];
-        if (value != VCZ_INT_FILL) {
-            if (column > 0) {
-                dest[offset] = ',';
-                offset++;
-            }
-            if (value == VCZ_INT_MISSING) {
-                dest[offset] = '.';
-                offset++;
-            } else {
-                offset += vcz_itoa(dest + offset, value);
+        if (source[column] == VCZ_INT_FILL) {
+            break; /* the first fill value ends the row; later columns are not written */
+        }
+        if (column > 0) {
+            offset = append_char(dest, ',', offset, buflen); /* separator between values */
+            if (offset < 0) { /* a negative offset is an error code from the helper */
+                goto out;
             }
         }
+        offset = append_int(dest, source[column], offset, buflen);
+        if (offset < 0) {
+            goto out;
+        }
     }
-    dest[offset] = '\t';
-    offset++;
-    dest[offset] = '\0';
+    offset = append_char(dest, '\t', offset, buflen); /* tab ends the field; no NUL added */
+out:
     return offset;
 }
 
 static int64_t
-float32_field_write_entry(const vcz_field_t *self, const void *data, char *dest,
-    size_t VCZ_UNUSED(buflen), int64_t offset)
+float32_field_write_entry(const vcz_field_t *self, const void *restrict data,
+    char *restrict dest, int64_t buflen, int64_t offset)
 {
-    const float *source = (const float *) data;
-    const int32_t *int32_source = (const int32_t *) data;
+    const float *restrict source = (const float *restrict) data;
+    const int32_t *restrict int32_source = (const int32_t *restrict) data;
     int32_t int32_value;
-    float value;
     size_t column;
 
     for (column = 0; column < self->num_columns; column++) {
@@ -236,20 +271,18 @@ float32_field_write_entry(const vcz_field_t *self, const void *data, char *dest,
             break;
         }
         if (column > 0) {
-            dest[offset] = ',';
-            offset++;
+            offset = append_char(dest, ',', offset, buflen);
+            if (offset < 0) {
+                goto out;
+            }
         }
-        if (int32_value == VCZ_FLOAT32_MISSING_AS_INT32) {
-            dest[offset] = '.';
-            offset++;
-        } else {
-            value = source[column];
-            offset += vcz_ftoa(dest + offset, value);
+        offset = append_float(dest, int32_value, source[column], offset, buflen);
+        if (offset < 0) {
+            goto out;
         }
     }
-    dest[offset] = '\t';
-    offset++;
-    dest[offset] = '\0';
+    offset = append_char(dest, '\t', offset, buflen);
+out:
     return offset;
 }
 
@@ -259,11 +292,11 @@ vcz_field_write_entry(
 {
     if (self->type == VCZ_TYPE_INT) {
         if (self->item_size == 4) {
-            return int32_field_write_entry(self, data, dest, buflen, offset);
+            return int32_field_write_entry(self, data, dest, (int64_t) buflen, offset);
         }
     } else if (self->type == VCZ_TYPE_FLOAT) {
         assert(self->item_size == 4);
-        return float32_field_write_entry(self, data, dest, buflen, offset);
+        return float32_field_write_entry(self, data, dest, (int64_t) buflen, offset);
     } else if (self->type == VCZ_TYPE_BOOL) {
         assert(self->item_size == 1);
         assert(self->num_columns == 1);

diff --git a/lib/vcf_encoder.h b/lib/vcf_encoder.h
@@ -31,8 +31,11 @@
 
 // arbitrary - we can increase if needs be
 #define VCZ_MAX_FIELD_NAME_LEN 256
+#define VCZ_INT32_BUF_SIZE 11 // strlen("-2147483648"), the widest int32 text
+#define VCZ_FLOAT32_BUF_SIZE 15 // An int32 (11) + "." (1) + 3 decimal places
 
 #define VCZ_ERR_NO_MEMORY (-100)
+#define VCZ_ERR_BUFFER_OVERFLOW (-101)
 
 /* Built-in-limitations */
 #define VCZ_ERR_FIELD_NAME_TOO_LONG (-201)
@@ -93,5 +96,5 @@ int vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self, const char *
 int64_t vcz_variant_encoder_write_row(
     const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen);
 
-int vcz_itoa(char *buf, int32_t v);
+int vcz_itoa(char *buf, int64_t v);
 int vcz_ftoa(char *buf, float v);