From 25112d1d2bb9430ae95dc897e618a1b9568abf1e Mon Sep 17 00:00:00 2001 From: yoid2000 Date: Thu, 18 Jan 2024 17:16:53 +0100 Subject: [PATCH 1/4] Added check for safe string values --- .gitignore | 3 ++ syndiffix/microdata.py | 37 ++++++++++++++++---- syndiffix/synthesizer.py | 2 ++ tests/test_microdata.py | 74 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 109 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 1848b27..a1353f0 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ __pycache__/ # Ignore logs *.log + +# Ignore vscode workspace virtual environment +/venv/ \ No newline at end of file diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py index 8ca38d4..6e9d936 100644 --- a/syndiffix/microdata.py +++ b/syndiffix/microdata.py @@ -18,6 +18,7 @@ from .bucket import Buckets from .common import ColumnType, Value from .interval import Interval, Intervals +from .tree import Branch, Leaf, Node MICRODATA_SYN_VALUE: Literal[0] = 0 MICRODATA_FLOAT_VALUE: Literal[1] = 1 @@ -42,6 +43,9 @@ def to_float(self, value: Value) -> float: def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: pass + def map_tree(self, root: Node) -> None: + pass + class BooleanConvertor(DataConvertor): def column_type(self) -> ColumnType: @@ -104,6 +108,7 @@ def __init__(self, values: Iterable[Value]) -> None: if not isinstance(value, str): raise TypeError(f"Not a `str` object in a string dtype column: {value}.") self.value_map = sorted(cast(Set[str], unique_values)) + self.safe_values: Set[int] = set() def column_type(self) -> ColumnType: return ColumnType.STRING @@ -120,13 +125,31 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: return self._map_interval(interval, rng) def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue: - # Finds a common prefix of the strings encoded as interval boundaries and appends "*" - # and a random number to ensure that the count of distinct values approximates that in the original data. - min_value = self.value_map[int(interval.min)] - max_value = self.value_map[min(int(interval.max), len(self.value_map) - 1)] - value = int(_generate_float(interval, rng)) - - return (commonprefix([min_value, max_value]) + "*" + str(value), float(value)) + # If a randomly selected value from the interval is not safe, finds a common prefix of the strings encoded as interval boundaries and appends "*" and a random number to ensure that the count of distinct values approximates that in the original data. 
+        min_value = int(interval.min)
+        # max_value is inclusive
+        max_value = min(int(interval.max) - 1, len(self.value_map) - 1)
+        value = rng.randint(min_value, max_value)
+        if value in self.safe_values:
+            return (self.value_map[value], float(value))
+        else:
+            print(f"No safe value for {float(value)}")
+            return (
+                commonprefix([self.value_map[min_value], self.value_map[max_value]]) + "*" + str(value),
+                float(value),
+            )
+
+    def map_tree(self, root: Node) -> None:
+        def map_tree_walk(node: Node) -> None:
+            if isinstance(node, Leaf):
+                low_threshold = node.context.anonymization_context.anonymization_params.low_count_params.low_threshold
+                if node.is_singularity() and node.is_over_threshold(low_threshold):
+                    self.safe_values.add(int(node.actual_intervals[0].min))
+            elif isinstance(node, Branch):
+                for child_node in node.children.values():
+                    map_tree_walk(child_node)
+
+        map_tree_walk(root)
 
 
 def _generate_float(interval: Interval, rng: Random) -> float:
diff --git a/syndiffix/synthesizer.py b/syndiffix/synthesizer.py
index 8dbb939..7da46d8 100644
--- a/syndiffix/synthesizer.py
+++ b/syndiffix/synthesizer.py
@@ -98,6 +98,8 @@ def __init__(
         )
 
         self.clusters, self.entropy_1dim = clustering.build_clusters(self.forest)
+        for col_id, converter in enumerate(self.column_convertors):
+            converter.map_tree(self.forest.get_tree((ColumnId(col_id),)))
 
     def sample(self) -> pd.DataFrame:
         def materialize_tree(forest: Forest, columns: list[ColumnId]) -> tuple[list[MicrodataRow], Combination]:
diff --git a/tests/test_microdata.py b/tests/test_microdata.py
index 7a5ecf1..3515229 100644
--- a/tests/test_microdata.py
+++ b/tests/test_microdata.py
@@ -1,16 +1,48 @@
+import string
 from io import StringIO
 from random import Random
 
+import numpy as np
 import pandas as pd
 import pytest
 
+from syndiffix import Synthesizer
 from syndiffix.bucket import Bucket
 from syndiffix.interval import Interval
 from syndiffix.microdata import *
 
+from .conftest import *
+
 _rng = Random(0)
 
 
+def _make_safe_values_df() -> pd.DataFrame:
+    # Each column in this dataframe has 10 instances each of 30 distinct strings. This ensures that each distinct string is safe (passes LCF).
However, since there are 30^3 possible 3dim combinations, there won't be any singularity 3dim buckets.
+    columns = ["a", "b", "c"]
+    values = []
+    for i, column in enumerate(columns):
+        values.append([column + str(x) for x in range(1, 31)])
+        values[i] = values[i] * 10
+        np.random.shuffle(values[i])
+    return pd.DataFrame(
+        {
+            columns[0]: values[0],
+            columns[1]: values[1],
+            columns[2]: values[2],
+        }
+    )
+
+
+def _tweak_safe_values_df(df: pd.DataFrame, values_to_tweak: list[int] = [29]) -> None:
+    # This takes one or more distinct values in each column and changes every instance to a random value, thus ensuring that some 1dim values will fail LCF, producing non-singularity leaves.
+    def ran_str10() -> str:
+        return "".join(random.choice(string.ascii_letters) for i in range(10))
+
+    for column in df.columns:
+        for value_to_tweak in values_to_tweak:
+            df[column] = df[column].apply(lambda x: ran_str10() if str(x).endswith(str(value_to_tweak)) else x)
+
+
 def _get_convertors(df: pd.DataFrame) -> list[DataConvertor]:
     return [get_convertor(df, column) for column in df.columns]
 
@@ -164,3 +196,45 @@ def test_empty_bucket_list() -> None:
 
 def test_empty_interval_list() -> None:
     assert generate_microdata([Bucket((), 2)], [], [], _rng) == [[], []]
+
+
+def test_safe_values_set_all() -> None:
+    data = _make_safe_values_df()
+    convertors = _get_convertors(data)
+    results = apply_convertors(convertors, data)
+    assert results.shape == data.shape
+    forest = create_forest(results)
+    for col_id, converter in enumerate(convertors):
+        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+    for col_id, column in enumerate(data.columns):
+        assert data[column].nunique() == len(cast(StringConvertor, convertors[col_id]).safe_values)
+
+
+def test_safe_values_set_most() -> None:
+    data = _make_safe_values_df()
+    nuniques = [data[col].nunique() for col in data.columns]
+    _tweak_safe_values_df(data)
+    convertors = _get_convertors(data)
+    results = apply_convertors(convertors, data)
+    assert results.shape == data.shape
+    forest = create_forest(results)
+    for col_id, converter in enumerate(convertors):
+        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+    for col_id, column in enumerate(data.columns):
+        assert nuniques[col_id] == len(cast(StringConvertor, convertors[col_id]).safe_values) + 1
+
+
+def test_safe_values_e2e_all() -> None:
+    data = _make_safe_values_df()
+    syn_data = Synthesizer(data).sample()
+    for column in syn_data:
+        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() == 0
+
+
+def test_safe_values_e2e_some() -> None:
+    data = _make_safe_values_df()
+    # By tweaking multiple distinct values, we ensure that there will be buckets with no safe values, thus forcing "*" values.
+    _tweak_safe_values_df(data, [20, 21, 22, 23, 24, 25, 26])
+    syn_data = Synthesizer(data).sample()
+    for column in syn_data:
+        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() != 0

From 0737ae75e8ca60955607bc4dd2e4a5d2a7f77401 Mon Sep 17 00:00:00 2001
From: yoid2000
Date: Fri, 19 Jan 2024 09:44:33 +0100
Subject: [PATCH 2/4] Change map_tree to analyze_tree

---
 syndiffix/microdata.py   | 11 +++++------
 syndiffix/synthesizer.py |  2 +-
 tests/test_microdata.py  | 15 ++++++++++-----
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py
index 6e9d936..df34b73 100644
--- a/syndiffix/microdata.py
+++ b/syndiffix/microdata.py
@@ -43,7 +43,7 @@ def to_float(self, value: Value) -> float:
     def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        pass
 
-    def map_tree(self, root: Node) -> None:
+    def analyze_tree(self, root: Node) -> None:
         pass
 
 
@@ -133,23 +133,22 @@ def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
         if value in self.safe_values:
             return (self.value_map[value], float(value))
         else:
-            print(f"No safe value for {float(value)}")
             return (
                 commonprefix([self.value_map[min_value], self.value_map[max_value]]) + "*" + str(value),
                 float(value),
             )
 
-    def map_tree(self, root: Node) -> None:
-        def map_tree_walk(node: Node) -> None:
+    def analyze_tree(self, root: Node) -> None:
+        def analyze_tree_walk(node: Node) -> None:
             if isinstance(node, Leaf):
                 low_threshold = node.context.anonymization_context.anonymization_params.low_count_params.low_threshold
                 if node.is_singularity() and node.is_over_threshold(low_threshold):
                     self.safe_values.add(int(node.actual_intervals[0].min))
             elif isinstance(node, Branch):
                 for child_node in node.children.values():
-                    map_tree_walk(child_node)
+                    analyze_tree_walk(child_node)
 
-        map_tree_walk(root)
+        analyze_tree_walk(root)
 
 
 def _generate_float(interval: Interval, rng: Random) -> float:
diff --git a/syndiffix/synthesizer.py b/syndiffix/synthesizer.py
index 7da46d8..b0781f6 100644
--- a/syndiffix/synthesizer.py
+++ b/syndiffix/synthesizer.py
@@ -99,7 +99,7 @@ def __init__(
 
         self.clusters, self.entropy_1dim = clustering.build_clusters(self.forest)
         for col_id, converter in enumerate(self.column_convertors):
-            converter.map_tree(self.forest.get_tree((ColumnId(col_id),)))
+            converter.analyze_tree(self.forest.get_tree((ColumnId(col_id),)))
 
     def sample(self) -> pd.DataFrame:
         def materialize_tree(forest: Forest, columns: list[ColumnId]) -> tuple[list[MicrodataRow], Combination]:
diff --git a/tests/test_microdata.py b/tests/test_microdata.py
index 3515229..48f3c6d 100644
--- a/tests/test_microdata.py
+++ b/tests/test_microdata.py
@@ -17,7 +17,9 @@
 
 
 def _make_safe_values_df() -> pd.DataFrame:
-    # Each column in this dataframe has 10 instances each of 30 distinct strings. This ensures that each distinct string is safe (passes LCF). However, since there are 30^3 possible 3dim combinations, there won't be any singularity 3dim buckets.
+    # Each column in this dataframe has 10 instances each of 30 distinct strings.
+    # This ensures that each distinct string is safe (passes LCF).
However, since there
+    # are 30^3 possible 3dim combinations, there won't be any singularity 3dim buckets.
     columns = ["a", "b", "c"]
     values = []
     for i, column in enumerate(columns):
@@ -34,7 +36,9 @@ def _make_safe_values_df() -> pd.DataFrame:
 
 
 def _tweak_safe_values_df(df: pd.DataFrame, values_to_tweak: list[int] = [29]) -> None:
-    # This takes one or more distinct values in each column and changes every instance to a random value, thus ensuring that some 1dim values will fail LCF, producing non-singularity leaves.
+    # This takes one or more distinct values in each column and changes every
+    # instance to a random value, thus ensuring that some 1dim values will
+    # fail LCF, producing non-singularity leaves.
     def ran_str10() -> str:
         return "".join(random.choice(string.ascii_letters) for i in range(10))
 
@@ -205,7 +209,7 @@ def test_safe_values_set_all() -> None:
     assert results.shape == data.shape
     forest = create_forest(results)
     for col_id, converter in enumerate(convertors):
-        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
     for col_id, column in enumerate(data.columns):
         assert data[column].nunique() == len(cast(StringConvertor, convertors[col_id]).safe_values)
 
@@ -219,7 +223,7 @@ def test_safe_values_set_most() -> None:
     assert results.shape == data.shape
     forest = create_forest(results)
     for col_id, converter in enumerate(convertors):
-        converter.map_tree(forest.get_tree((ColumnId(col_id),)))
+        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
     for col_id, column in enumerate(data.columns):
         assert nuniques[col_id] == len(cast(StringConvertor, convertors[col_id]).safe_values) + 1
 
@@ -233,7 +237,8 @@ def test_safe_values_e2e_all() -> None:
 
 def test_safe_values_e2e_some() -> None:
     data = _make_safe_values_df()
-    # By tweaking multiple distinct values, we ensure that there will be buckets with no safe values, thus forcing "*" values.
+    # By tweaking multiple distinct values, we ensure that there will be buckets
+    # with no safe values, thus forcing "*" values.
     _tweak_safe_values_df(data, [20, 21, 22, 23, 24, 25, 26])
     syn_data = Synthesizer(data).sample()
     for column in syn_data:

From 507d464817f119eefad631a6dde372418d38415c Mon Sep 17 00:00:00 2001
From: yoid2000
Date: Fri, 19 Jan 2024 09:47:57 +0100
Subject: [PATCH 3/4] Fixed over-long comment

---
 syndiffix/microdata.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py
index df34b73..a44715f 100644
--- a/syndiffix/microdata.py
+++ b/syndiffix/microdata.py
@@ -125,7 +125,12 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
         return self._map_interval(interval, rng)
 
     def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
-        # If a randomly selected value from the interval is not safe, finds a common prefix of the strings encoded as interval boundaries and appends "*" and a random number to ensure that the count of distinct values approximates that in the original data.
+        # If a randomly selected value from the interval is not safe, finds a
+        # common prefix of the strings encoded as interval boundaries and
+        # appends "*" and a random number to ensure that the count of distinct
+        # values approximates that in the original data.
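+        # (A value is "safe" when analyze_tree has found its 1-dim tree leaf to
+        # be a singularity that passes the low-count filter.)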
         min_value = int(interval.min)
         # max_value is inclusive
         max_value = min(int(interval.max) - 1, len(self.value_map) - 1)

From 652bdc22a870f1ddcc999420c6d8832d5dee7d3f Mon Sep 17 00:00:00 2001
From: yoid2000
Date: Fri, 19 Jan 2024 10:16:19 +0100
Subject: [PATCH 4/4] Update algorithm.md

---
 docs/algorithm.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/algorithm.md b/docs/algorithm.md
index 7065604..a79a1e7 100644
--- a/docs/algorithm.md
+++ b/docs/algorithm.md
@@ -106,13 +106,15 @@ and its snapped ranges elsewhere.
 - For each harvested bucket:
   - For 1..bucket.count:
     - For each range in the bucket:
-      - Generate a random float value inside the range.
+      - Generate a random float value inside the range. In the case of a **string** column, the upper edge of the range is clamped so that it does not index past the end of the sorted array of distinct values.
       - Cast the generated value back to the corresponding column type.
       - In the case of **string** columns:
         - If the range is a singularity:
           - Return the exact value.
         - Else:
-          - Return the common prefix of the range, plus `*`, plus a random integer from inside the range.
+          - If the random value is a singularity in the corresponding 1-dimensional tree and passes LCF, return the exact value.
+          - Else:
+            - Return the common prefix of the range, plus `*`, plus a random integer from inside the range.
 - Return a synthetic row from the generated values.
 
 ### Clustering
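
For reference, the string-mapping behavior that this series converges on can be
sketched in isolation as follows. This is a minimal illustration distilled from
the `_map_interval` and `analyze_tree` changes above, not part of the patches;
the standalone helper `map_string_interval` and its flattened arguments are
hypothetical stand-ins for the real `StringConvertor` state and `Interval` type.

    from os.path import commonprefix
    from random import Random
    from typing import Set, Tuple

    def map_string_interval(
        value_map: list,        # sorted distinct strings of the column
        safe_values: Set[int],  # indices whose 1-dim tree leaves are singularities
                                # that pass the low-count filter (see analyze_tree)
        interval_min: float,    # lower edge of the harvested range (inclusive index)
        interval_max: float,    # upper edge of the harvested range (exclusive index)
        rng: Random,
    ) -> Tuple[str, float]:
        min_value = int(interval_min)
        # max_value is inclusive and clamped to the end of the value map
        max_value = min(int(interval_max) - 1, len(value_map) - 1)
        value = rng.randint(min_value, max_value)
        if value in safe_values:
            # The chosen value is a safe singularity: return it verbatim
            return (value_map[value], float(value))
        # Otherwise mask it: common prefix of the range edges, plus "*", plus the index
        return (
            commonprefix([value_map[min_value], value_map[max_value]]) + "*" + str(value),
            float(value),
        )

    # Depending on the draw, this prints a safe value such as ("apricot", 1.0),
    # or the masked ("a*2", 2.0) when the unsafe index 2 is chosen.
    print(map_string_interval(["apple", "apricot", "avocado"], {0, 1}, 0.0, 3.0, Random(0)))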