From 25112d1d2bb9430ae95dc897e618a1b9568abf1e Mon Sep 17 00:00:00 2001 From: yoid2000 Date: Thu, 18 Jan 2024 17:16:53 +0100 Subject: [PATCH 1/4] Added check for safe string values --- .gitignore | 3 ++ syndiffix/microdata.py | 37 ++++++++++++++++---- syndiffix/synthesizer.py | 2 ++ tests/test_microdata.py | 74 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 109 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 1848b27..a1353f0 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ __pycache__/ # Ignore logs *.log + +# Ignore vscode workspace virtual environment +/venv/ \ No newline at end of file diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py index 8ca38d4..6e9d936 100644 --- a/syndiffix/microdata.py +++ b/syndiffix/microdata.py @@ -18,6 +18,7 @@ from .bucket import Buckets from .common import ColumnType, Value from .interval import Interval, Intervals +from .tree import Branch, Leaf, Node MICRODATA_SYN_VALUE: Literal[0] = 0 MICRODATA_FLOAT_VALUE: Literal[1] = 1 @@ -42,6 +43,9 @@ def to_float(self, value: Value) -> float: def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: pass + def map_tree(self, root: Node) -> None: + pass + class BooleanConvertor(DataConvertor): def column_type(self) -> ColumnType: @@ -104,6 +108,7 @@ def __init__(self, values: Iterable[Value]) -> None: if not isinstance(value, str): raise TypeError(f"Not a `str` object in a string dtype column: {value}.") self.value_map = sorted(cast(Set[str], unique_values)) + self.safe_values: Set[int] = set() def column_type(self) -> ColumnType: return ColumnType.STRING @@ -120,13 +125,31 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: return self._map_interval(interval, rng) def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue: - # Finds a common prefix of the strings encoded as interval boundaries and appends "*" - # and a random number to ensure that the count of distinct values approximates that in the original data. - min_value = self.value_map[int(interval.min)] - max_value = self.value_map[min(int(interval.max), len(self.value_map) - 1)] - value = int(_generate_float(interval, rng)) - - return (commonprefix([min_value, max_value]) + "*" + str(value), float(value)) + # If a randomly selected value from the interval is not safe, finds a common prefix of the strings encoded as interval boundaries and appends "*" and a random number to ensure that the count of distinct values approximates that in the original data. 
+        min_value = int(interval.min)
+        # max_value is inclusive
+        max_value = min(int(interval.max) - 1, len(self.value_map) - 1)
+        value = rng.randint(min_value, max_value)
+        if value in self.safe_values:
+            return (self.value_map[value], float(value))
+        else:
+            print(f"No safe value for {float(value)}")
+            return (
+                commonprefix([self.value_map[min_value], self.value_map[max_value]]) + "*" + str(value),
+                float(value),
+            )
+
+    def map_tree(self, root: Node) -> None:
+        def map_tree_walk(node: Node) -> None:
+            if isinstance(node, Leaf):
+                low_threshold = node.context.anonymization_context.anonymization_params.low_count_params.low_threshold
+                if node.is_singularity() and node.is_over_threshold(low_threshold):
+                    self.safe_values.add(int(node.actual_intervals[0].min))
+            elif isinstance(node, Branch):
+                for child_node in node.children.values():
+                    map_tree_walk(child_node)
+
+        map_tree_walk(root)
 
 
 def _generate_float(interval: Interval, rng: Random) -> float:
diff --git a/syndiffix/synthesizer.py b/syndiffix/synthesizer.py
index 8dbb939..7da46d8 100644
--- a/syndiffix/synthesizer.py
+++ b/syndiffix/synthesizer.py
@@ -98,6 +98,8 @@ def __init__(
         )
 
         self.clusters, self.entropy_1dim = clustering.build_clusters(self.forest)
+        for col_id, converter in enumerate(self.column_convertors):
+            converter.map_tree(self.forest.get_tree((ColumnId(col_id),)))
 
     def sample(self) -> pd.DataFrame:
         def materialize_tree(forest: Forest, columns: list[ColumnId]) -> tuple[list[MicrodataRow], Combination]:
diff --git a/tests/test_microdata.py b/tests/test_microdata.py
index 7a5ecf1..3515229 100644
--- a/tests/test_microdata.py
+++ b/tests/test_microdata.py
@@ -1,16 +1,48 @@
+import string
 from io import StringIO
 from random import Random
 
+import numpy as np
 import pandas as pd
 import pytest
 
+from syndiffix import Synthesizer
 from syndiffix.bucket import Bucket
 from syndiffix.interval import Interval
 from syndiffix.microdata import *
 
+from .conftest import *
+
 _rng = Random(0)
 
 
+def _make_safe_values_df() -> pd.DataFrame:
+    # Each column in this dataframe has 10 instances each of 30 distinct strings. This ensures that each distinct string is safe (passes LCF).
However, since there are 30^3 possible 3dim combinations, there won't be any singularity 3dim buckets.
+    columns = ["a", "b", "c"]
+    values = []
+    for i, column in enumerate(columns):
+        values.append([column + str(x) for x in range(1, 31)])
+        values[i] = values[i] * 10
+        np.random.shuffle(values[i])
+    return pd.DataFrame(
+        {
+            columns[0]: values[0],
+            columns[1]: values[1],
+            columns[2]: values[2],
+        }
+    )
+
+
+def _tweak_safe_values_df(df: pd.DataFrame, values_to_tweak: list[int] = [29]) -> None:
+    # This takes one or more distinct values in each column and changes every instance to a random value, thus ensuring that some 1dim values will fail LCF, producing non-singularity leaves.
+    def ran_str10() -> str:
+        return "".join(random.choice(string.ascii_letters) for i in range(10))
+
+    for column in df.columns:
+        for value_to_tweak in values_to_tweak:
+            df[column] = df[column].apply(lambda x: ran_str10() if str(x).endswith(str(value_to_tweak)) else x)
+
+
 def _get_convertors(df: pd.DataFrame) -> list[DataConvertor]:
     return [get_convertor(df, column) for column in df.columns]
 
@@ -164,3 +196,45 @@ def test_empty_bucket_list() -> None:
 
 def test_empty_interval_list() -> None:
     assert generate_microdata([Bucket((), 2)], [], [], _rng) == [[], []]
+
+
+def test_safe_values_set_all() -> None:
+    data = _make_safe_values_df()
+    convertors = _get_convertors(data)
+    results = apply_convertors(convertors, data)
+    assert results.shape == data.shape
+    forest = create_forest(results)
+    for col_id, converter in enumerate(convertors):
+        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+    for col_id, column in enumerate(data.columns):
+        assert data[column].nunique() == len(cast(StringConvertor, convertors[col_id]).safe_values)
+
+
+def test_safe_values_set_most() -> None:
+    data = _make_safe_values_df()
+    nuniques = [data[col].nunique() for col in data.columns]
+    _tweak_safe_values_df(data)
+    convertors = _get_convertors(data)
+    results = apply_convertors(convertors, data)
+    assert results.shape == data.shape
+    forest = create_forest(results)
+    for col_id, converter in enumerate(convertors):
+        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+    for col_id, column in enumerate(data.columns):
+        assert nuniques[col_id] == len(cast(StringConvertor, convertors[col_id]).safe_values) + 1
+
+
+def test_safe_values_e2e_all() -> None:
+    data = _make_safe_values_df()
+    syn_data = Synthesizer(data).sample()
+    for column in syn_data:
+        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() == 0
+
+
+def test_safe_values_e2e_some() -> None:
+    data = _make_safe_values_df()
+    # By tweaking multiple distinct values, we ensure that there will be buckets with no safe values, thus forcing "*" values.
+    _tweak_safe_values_df(data, [20, 21, 22, 23, 24, 25, 26])
+    syn_data = Synthesizer(data).sample()
+    for column in syn_data:
+        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() != 0

From 0737ae75e8ca60955607bc4dd2e4a5d2a7f77401 Mon Sep 17 00:00:00 2001
From: yoid2000
Date: Fri, 19 Jan 2024 09:44:33 +0100
Subject: [PATCH 2/4] Change map_tree to analyze_tree

---
 syndiffix/microdata.py   | 11 +++++------
 syndiffix/synthesizer.py |  2 +-
 tests/test_microdata.py  | 15 ++++++++++-----
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py
index 6e9d936..df34b73 100644
--- a/syndiffix/microdata.py
+++ b/syndiffix/microdata.py
@@ -43,7 +43,7 @@ def to_float(self, value: Value) -> float:
     def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        pass
 
-    def map_tree(self, root: Node) -> None:
+    def analyze_tree(self, root: Node) -> None:
         pass
 
 
@@ -133,23 +133,22 @@ def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
         if value in self.safe_values:
             return (self.value_map[value], float(value))
         else:
-            print(f"No safe value for {float(value)}")
             return (
                 commonprefix([self.value_map[min_value], self.value_map[max_value]]) + "*" + str(value),
                 float(value),
             )
 
-    def map_tree(self, root: Node) -> None:
-        def map_tree_walk(node: Node) -> None:
+    def analyze_tree(self, root: Node) -> None:
+        def analyze_tree_walk(node: Node) -> None:
             if isinstance(node, Leaf):
                 low_threshold = node.context.anonymization_context.anonymization_params.low_count_params.low_threshold
                 if node.is_singularity() and node.is_over_threshold(low_threshold):
                     self.safe_values.add(int(node.actual_intervals[0].min))
             elif isinstance(node, Branch):
                 for child_node in node.children.values():
-                    map_tree_walk(child_node)
+                    analyze_tree_walk(child_node)
 
-        map_tree_walk(root)
+        analyze_tree_walk(root)
 
 
 def _generate_float(interval: Interval, rng: Random) -> float:
diff --git a/syndiffix/synthesizer.py b/syndiffix/synthesizer.py
index 7da46d8..b0781f6 100644
--- a/syndiffix/synthesizer.py
+++ b/syndiffix/synthesizer.py
@@ -99,7 +99,7 @@ def __init__(
 
         self.clusters, self.entropy_1dim = clustering.build_clusters(self.forest)
         for col_id, converter in enumerate(self.column_convertors):
-            converter.map_tree(self.forest.get_tree((ColumnId(col_id),)))
+            converter.analyze_tree(self.forest.get_tree((ColumnId(col_id),)))
 
     def sample(self) -> pd.DataFrame:
         def materialize_tree(forest: Forest, columns: list[ColumnId]) -> tuple[list[MicrodataRow], Combination]:
diff --git a/tests/test_microdata.py b/tests/test_microdata.py
index 3515229..48f3c6d 100644
--- a/tests/test_microdata.py
+++ b/tests/test_microdata.py
@@ -17,7 +17,9 @@
 
 
 def _make_safe_values_df() -> pd.DataFrame:
-    # Each column in this dataframe has 10 instances each of 30 distinct strings. This ensures that each distinct string is safe (passes LCF). However, since there are 30^3 possible 3dim combinations, there won't be any singularity 3dim buckets.
+    # Each column in this dataframe has 10 instances each of 30 distinct strings.
+    # This ensures that each distinct string is safe (passes LCF).
However, since there
+    # are 30^3 possible 3dim combinations, there won't be any singularity 3dim buckets.
     columns = ["a", "b", "c"]
     values = []
     for i, column in enumerate(columns):
@@ -34,7 +36,9 @@ def _make_safe_values_df() -> pd.DataFrame:
 
 
 def _tweak_safe_values_df(df: pd.DataFrame, values_to_tweak: list[int] = [29]) -> None:
-    # This takes one or more distinct values in each column and changes every instance to a random value, thus ensuring that some 1dim values will fail LCF, producing non-singularity leaves.
+    # This takes one or more distinct values in each column and changes every
+    # instance to a random value, thus ensuring that some 1dim values will
+    # fail LCF, producing non-singularity leaves.
     def ran_str10() -> str:
         return "".join(random.choice(string.ascii_letters) for i in range(10))
 
@@ -205,7 +209,7 @@ def test_safe_values_set_all() -> None:
     assert results.shape == data.shape
     forest = create_forest(results)
     for col_id, converter in enumerate(convertors):
-        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
     for col_id, column in enumerate(data.columns):
         assert data[column].nunique() == len(cast(StringConvertor, convertors[col_id]).safe_values)
 
@@ -219,7 +223,7 @@ def test_safe_values_set_most() -> None:
     assert results.shape == data.shape
     forest = create_forest(results)
     for col_id, converter in enumerate(convertors):
-        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
     for col_id, column in enumerate(data.columns):
         assert nuniques[col_id] == len(cast(StringConvertor, convertors[col_id]).safe_values) + 1
 
@@ -233,7 +237,8 @@ def test_safe_values_e2e_all() -> None:
 
 def test_safe_values_e2e_some() -> None:
     data = _make_safe_values_df()
-    # By tweaking multiple distinct values, we ensure that there will be buckets with no safe values, thus forcing "*" values.
+    # By tweaking multiple distinct values, we ensure that there will be buckets
+    # with no safe values, thus forcing "*" values.
     _tweak_safe_values_df(data, [20, 21, 22, 23, 24, 25, 26])
     syn_data = Synthesizer(data).sample()
     for column in syn_data:

From 507d464817f119eefad631a6dde372418d38415c Mon Sep 17 00:00:00 2001
From: yoid2000
Date: Fri, 19 Jan 2024 09:47:57 +0100
Subject: [PATCH 3/4] Fixed over-long comment

---
 syndiffix/microdata.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py
index df34b73..a44715f 100644
--- a/syndiffix/microdata.py
+++ b/syndiffix/microdata.py
@@ -125,7 +125,12 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
         return self._map_interval(interval, rng)
 
     def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
-        # If a randomly selected value from the interval is not safe, finds a common prefix of the strings encoded as interval boundaries and appends "*" and a random number to ensure that the count of distinct values approximates that in the original data.
+        # If a randomly selected value from the interval is not safe, finds a
+        # common prefix of the strings encoded as interval boundaries and
+        # appends "*" and a random number to ensure that the count of distinct
+        # values approximates that in the original data.
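+        # (A value is "safe" when analyze_tree has found its 1-dim tree leaf to
+        # be a singularity that passes the low-count filter.)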
         min_value = int(interval.min)
         # max_value is inclusive
         max_value = min(int(interval.max) - 1, len(self.value_map) - 1)

From 652bdc22a870f1ddcc999420c6d8832d5dee7d3f Mon Sep 17 00:00:00 2001
From: yoid2000
Date: Fri, 19 Jan 2024 10:16:19 +0100
Subject: [PATCH 4/4] Update algorithm.md

---
 docs/algorithm.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/algorithm.md b/docs/algorithm.md
index 7065604..a79a1e7 100644
--- a/docs/algorithm.md
+++ b/docs/algorithm.md
@@ -106,13 +106,15 @@ and its snapped ranges elsewhere.
 - For each harvested bucket:
   - For 1..bucket.count:
     - For each range in the bucket:
-      - Generate a random float value inside the range.
+      - Generate a random float value inside the range. In the case of a **string** column, the upper edge of the range is clamped so that it does not index past the end of the sorted array of distinct values.
       - Cast the generated value back to the corresponding column type.
       - In the case of **string** columns:
         - If the range is a singularity:
           - Return the exact value.
         - Else:
-          - Return the common prefix of the range, plus `*`, plus a random integer from inside the range.
+          - If the random value is a singularity in the corresponding 1-dimensional tree and passes LCF, return the exact value.
+          - Else:
+            - Return the common prefix of the range, plus `*`, plus a random integer from inside the range.
 - Return a synthetic row from the generated values.
 
 ### Clustering
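
For reference, the string-mapping behavior that this series converges on can be
sketched in isolation as follows. This is a minimal illustration distilled from
the `_map_interval` and `analyze_tree` changes above, not part of the patches;
the standalone helper `map_string_interval` and its flattened arguments are
hypothetical stand-ins for the real `StringConvertor` state and `Interval` type.

    from os.path import commonprefix
    from random import Random
    from typing import Set, Tuple

    def map_string_interval(
        value_map: list,        # sorted distinct strings of the column
        safe_values: Set[int],  # indices whose 1-dim tree leaves are singularities
                                # that pass the low-count filter (see analyze_tree)
        interval_min: float,    # lower edge of the harvested range (inclusive index)
        interval_max: float,    # upper edge of the harvested range (exclusive index)
        rng: Random,
    ) -> Tuple[str, float]:
        min_value = int(interval_min)
        # max_value is inclusive and clamped to the end of the value map
        max_value = min(int(interval_max) - 1, len(value_map) - 1)
        value = rng.randint(min_value, max_value)
        if value in safe_values:
            # The chosen value is a safe singularity: return it verbatim
            return (value_map[value], float(value))
        # Otherwise mask it: common prefix of the range edges, plus "*", plus the index
        return (
            commonprefix([value_map[min_value], value_map[max_value]]) + "*" + str(value),
            float(value),
        )

    # Depending on the draw, this prints a safe value such as ("apricot", 1.0),
    # or the masked ("a*2", 2.0) when the unsafe index 2 is chosen.
    print(map_string_interval(["apple", "apricot", "avocado"], {0, 1}, 0.0, 3.0, Random(0)))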