Merge pull request #131 from diffix/paul-safe-values
Added check for safe string values
yoid2000 authored Jan 19, 2024
2 parents 0b2204a + 652bdc2 commit 3a463e8
Showing 5 changed files with 120 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -23,3 +23,6 @@ __pycache__/

# Ignore logs
*.log

# Ignore vscode workspace virtual environment
/venv/
6 changes: 4 additions & 2 deletions docs/algorithm.md
@@ -106,13 +106,15 @@ and its snapped ranges elsewhere.
- For each harvested bucket:
  - For 1..bucket.count:
    - For each range in the bucket:
      - Generate a random float value inside the range.
      - Generate a random float value inside the range. In the case of a **string** column, the upper edge of the range must not exceed the length of the sorted array of distinct values.
      - Cast the generated value back to the corresponding column type.
      - In the case of **string** columns:
        - If the range is a singularity:
          - Return the exact value.
        - Else:
          - Return the common prefix of the range, plus `*`, plus a random integer from inside the range.
          - If the random value is a singularity in the corresponding 1-dimensional tree and passes LCF, return the exact value.
          - Else:
            - Return the common prefix of the range, plus `*`, plus a random integer from inside the range (a sketch of this branch follows the list).
    - Return a synthetic row from the generated values.
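
A minimal sketch of the string branch above, with illustrative names (`string_from_range`, `value_map`, and `safe_values` are not the library's API); ranges over string columns index into the sorted array of distinct values:

```python
from os.path import commonprefix
from random import Random


def string_from_range(rng: Random, lo: float, hi: float,
                      value_map: list[str], safe_values: set[int]) -> str:
    # Clamp the upper edge so it cannot run past the sorted distinct values;
    # the drawn index is then inclusive on both ends.
    low = int(lo)
    high = min(int(hi) - 1, len(value_map) - 1)
    if low == high:  # singularity: the range encodes exactly one value
        return value_map[low]
    index = rng.randint(low, high)
    if index in safe_values:  # a 1-dim singularity that passed LCF
        return value_map[index]
    # Otherwise release only the common prefix, a "*", and the drawn integer.
    return commonprefix([value_map[low], value_map[high]]) + "*" + str(index)


# Example: a draw of index 1 ("apricot") is safe; 0 or 2 yields "a*<index>".
print(string_from_range(Random(0), 0.0, 3.0,
                        ["apple", "apricot", "avocado", "banana"], {1}))
```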

### Clustering
39 changes: 32 additions & 7 deletions syndiffix/microdata.py
@@ -18,6 +18,7 @@
from .bucket import Buckets
from .common import ColumnType, Value
from .interval import Interval, Intervals
from .tree import Branch, Leaf, Node

MICRODATA_SYN_VALUE: Literal[0] = 0
MICRODATA_FLOAT_VALUE: Literal[1] = 1
@@ -42,6 +43,9 @@ def to_float(self, value: Value) -> float:
    def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        pass

    def analyze_tree(self, root: Node) -> None:
        pass


class BooleanConvertor(DataConvertor):
    def column_type(self) -> ColumnType:
@@ -104,6 +108,7 @@ def __init__(self, values: Iterable[Value]) -> None:
            if not isinstance(value, str):
                raise TypeError(f"Not a `str` object in a string dtype column: {value}.")
        self.value_map = sorted(cast(Set[str], unique_values))
        self.safe_values: Set[int] = set()

    def column_type(self) -> ColumnType:
        return ColumnType.STRING
@@ -120,13 +125,33 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        return self._map_interval(interval, rng)

    def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        # Finds a common prefix of the strings encoded as interval boundaries and appends "*"
        # and a random number to ensure that the count of distinct values approximates that in the original data.
        min_value = self.value_map[int(interval.min)]
        max_value = self.value_map[min(int(interval.max), len(self.value_map) - 1)]
        value = int(_generate_float(interval, rng))

        return (commonprefix([min_value, max_value]) + "*" + str(value), float(value))
        # If a randomly selected value from the interval is not safe, finds a
        # common prefix of the strings encoded as interval boundaries and
        # appends "*" and a random number to ensure that the count of distinct
        # values approximates that in the original data.
        min_value = int(interval.min)
        # max_value is inclusive
        max_value = min(int(interval.max) - 1, len(self.value_map) - 1)
        value = rng.randint(min_value, max_value)
        if value in self.safe_values:
            return (self.value_map[value], float(value))
        else:
            return (
                commonprefix([self.value_map[min_value], self.value_map[max_value]]) + "*" + str(value),
                float(value),
            )

    def analyze_tree(self, root: Node) -> None:
        # Marks as safe every value whose 1-dim leaf is a singularity passing
        # the low-count filter (LCF); such values can be released verbatim.
        def analyze_tree_walk(node: Node) -> None:
            if isinstance(node, Leaf):
                low_threshold = node.context.anonymization_context.anonymization_params.low_count_params.low_threshold
                if node.is_singularity() and node.is_over_threshold(low_threshold):
                    self.safe_values.add(int(node.actual_intervals[0].min))
            elif isinstance(node, Branch):
                for child_node in node.children.values():
                    analyze_tree_walk(child_node)

        analyze_tree_walk(root)


def _generate_float(interval: Interval, rng: Random) -> float:
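For intuition, a hypothetical walk-through of the new mapping (the column and the safe set below are invented for illustration; compare `_map_interval` above):

```python
from os.path import commonprefix

# Sorted distinct values of an invented string column.
value_map = ["apple", "apricot", "avocado", "banana"]
# Suppose analyze_tree marked only index 1 ("apricot") as safe, i.e. its
# 1-dim leaf is a singularity that passes the low-count filter.
safe_values = {1}

# Interval [0.0, 3.0): the inclusive index range is 0..2.
# Drawing index 1 returns the exact value:
assert 1 in safe_values and value_map[1] == "apricot"

# Drawing index 2 is not safe, so only the common prefix of the interval
# boundaries is released, plus "*" and the drawn integer:
assert commonprefix([value_map[0], value_map[2]]) + "*" + "2" == "a*2"
```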
2 changes: 2 additions & 0 deletions syndiffix/synthesizer.py
@@ -98,6 +98,8 @@ def __init__(
        )

        self.clusters, self.entropy_1dim = clustering.build_clusters(self.forest)
        # Give each convertor a pass over its column's 1-dim tree (collects
        # safe string values for StringConvertor).
        for col_id, converter in enumerate(self.column_convertors):
            converter.analyze_tree(self.forest.get_tree((ColumnId(col_id),)))

    def sample(self) -> pd.DataFrame:
        def materialize_tree(forest: Forest, columns: list[ColumnId]) -> tuple[list[MicrodataRow], Combination]:
79 changes: 79 additions & 0 deletions tests/test_microdata.py
@@ -1,16 +1,52 @@
import random
import string
from io import StringIO
from random import Random

import numpy as np
import pandas as pd
import pytest

from syndiffix import Synthesizer
from syndiffix.bucket import Bucket
from syndiffix.interval import Interval
from syndiffix.microdata import *

from .conftest import *

_rng = Random(0)


def _make_safe_values_df() -> pd.DataFrame:
    # Each column in this dataframe has 10 instances each of 30 distinct strings.
    # This ensures that each distinct string is safe (passes LCF). However, since
    # there are 30^3 possible 3-dim combinations, there won't be any singularity
    # 3-dim buckets.
    columns = ["a", "b", "c"]
    values = []
    for i, column in enumerate(columns):
        values.append([column + str(x) for x in range(1, 31)])
        values[i] = values[i] * 10
        np.random.shuffle(values[i])
    return pd.DataFrame(
        {
            columns[0]: values[0],
            columns[1]: values[1],
            columns[2]: values[2],
        }
    )


def _tweak_safe_values_df(df: pd.DataFrame, values_to_tweak: list[int] = [29]) -> None:
    # This takes one or more distinct values in each column and changes every
    # instance to a random value, thus ensuring that some 1-dim values will
    # fail LCF, producing non-singularity leaves.
    def ran_str10() -> str:
        return "".join(random.choice(string.ascii_letters) for _ in range(10))

    for column in df.columns:
        for value_to_tweak in values_to_tweak:
            df[column] = df[column].apply(lambda x: ran_str10() if str(x).endswith(str(value_to_tweak)) else x)


def _get_convertors(df: pd.DataFrame) -> list[DataConvertor]:
    return [get_convertor(df, column) for column in df.columns]

@@ -164,3 +200,46 @@ def test_empty_bucket_list() -> None:

def test_empty_interval_list() -> None:
    assert generate_microdata([Bucket((), 2)], [], [], _rng) == [[], []]


def test_safe_values_set_all() -> None:
    data = _make_safe_values_df()
    convertors = _get_convertors(data)
    results = apply_convertors(convertors, data)
    assert results.shape == data.shape
    forest = create_forest(results)
    for col_id, converter in enumerate(convertors):
        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
    for col_id, column in enumerate(data.columns):
        assert data[column].nunique() == len(cast(StringConvertor, convertors[col_id]).safe_values)


def test_safe_values_set_most() -> None:
    data = _make_safe_values_df()
    nuniques = [data[col].nunique() for col in data.columns]
    _tweak_safe_values_df(data)
    convertors = _get_convertors(data)
    results = apply_convertors(convertors, data)
    assert results.shape == data.shape
    forest = create_forest(results)
    for col_id, converter in enumerate(convertors):
        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
    for col_id, column in enumerate(data.columns):
        assert nuniques[col_id] == len(cast(StringConvertor, convertors[col_id]).safe_values) + 1


def test_safe_values_e2e_all() -> None:
    data = _make_safe_values_df()
    syn_data = Synthesizer(data).sample()
    for column in syn_data:
        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() == 0


def test_safe_values_e2e_some() -> None:
    data = _make_safe_values_df()
    # By tweaking multiple distinct values, we ensure that there will be buckets
    # with no safe values, thus forcing "*" values.
    _tweak_safe_values_df(data, [20, 21, 22, 23, 24, 25, 26])
    syn_data = Synthesizer(data).sample()
    for column in syn_data:
        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() != 0
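
The new tests can be run selectively; a hedged example using pytest's Python entry point (assuming the repository's standard pytest layout, equivalent to `pytest -k safe_values` from the repo root):

```python
import pytest

# Run only the safe-values tests added in this commit.
pytest.main(["tests/test_microdata.py", "-k", "safe_values"])
```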
