Merge pull request #131 from diffix/paul-safe-values
Added check for safe string values
yoid2000 authored Jan 19, 2024
2 parents 0b2204a + 652bdc2 commit 3a463e8
Showing 5 changed files with 120 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -23,3 +23,6 @@ __pycache__/

# Ignore logs
*.log

# Ignore vscode workspace virtual environment
/venv/
6 changes: 4 additions & 2 deletions docs/algorithm.md
@@ -106,13 +106,15 @@ and its snapped ranges elsewhere.
- For each harvested bucket:
  - For 1..bucket.count:
    - For each range in the bucket:
      - Generate a random float value inside the range.
      - Generate a random float value inside the range. In the case of a **string** column, the upper edge of the range must not exceed the length of the sorted array of distinct values.
      - Cast the generated value back to the corresponding column type.
      - In the case of **string** columns:
        - If the range is a singularity:
          - Return the exact value.
        - Else:
          - Return the common prefix of the range, plus `*`, plus a random integer from inside the range.
          - If the random value is a singularity in the corresponding 1-dimensional tree and passes LCF, return the exact value.
          - Else:
            - Return the common prefix of the range, plus `*`, plus a random integer from inside the range (a sketch of this branch follows the list).
    - Return a synthetic row from the generated values.
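
A minimal sketch of the string branch above, with illustrative names (`string_from_range`, `value_map`, and `safe_values` are not the library's API); ranges over string columns index into the sorted array of distinct values:

```python
from os.path import commonprefix
from random import Random


def string_from_range(rng: Random, lo: float, hi: float,
                      value_map: list[str], safe_values: set[int]) -> str:
    # Clamp the upper edge so it cannot run past the sorted distinct values;
    # the drawn index is then inclusive on both ends.
    low = int(lo)
    high = min(int(hi) - 1, len(value_map) - 1)
    if low == high:  # singularity: the range encodes exactly one value
        return value_map[low]
    index = rng.randint(low, high)
    if index in safe_values:  # a 1-dim singularity that passed LCF
        return value_map[index]
    # Otherwise release only the common prefix, a "*", and the drawn integer.
    return commonprefix([value_map[low], value_map[high]]) + "*" + str(index)


# Example: a draw of index 1 ("apricot") is safe; 0 or 2 yields "a*<index>".
print(string_from_range(Random(0), 0.0, 3.0,
                        ["apple", "apricot", "avocado", "banana"], {1}))
```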

### Clustering
39 changes: 32 additions & 7 deletions syndiffix/microdata.py
@@ -18,6 +18,7 @@
from .bucket import Buckets
from .common import ColumnType, Value
from .interval import Interval, Intervals
from .tree import Branch, Leaf, Node

MICRODATA_SYN_VALUE: Literal[0] = 0
MICRODATA_FLOAT_VALUE: Literal[1] = 1
@@ -42,6 +43,9 @@ def to_float(self, value: Value) -> float:
    def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        pass

    def analyze_tree(self, root: Node) -> None:
        pass


class BooleanConvertor(DataConvertor):
    def column_type(self) -> ColumnType:
@@ -104,6 +108,7 @@ def __init__(self, values: Iterable[Value]) -> None:
            if not isinstance(value, str):
                raise TypeError(f"Not a `str` object in a string dtype column: {value}.")
        self.value_map = sorted(cast(Set[str], unique_values))
        self.safe_values: Set[int] = set()

    def column_type(self) -> ColumnType:
        return ColumnType.STRING
@@ -120,13 +125,33 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        return self._map_interval(interval, rng)

    def _map_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
        # Finds a common prefix of the strings encoded as interval boundaries and appends "*"
        # and a random number to ensure that the count of distinct values approximates that in the original data.
        min_value = self.value_map[int(interval.min)]
        max_value = self.value_map[min(int(interval.max), len(self.value_map) - 1)]
        value = int(_generate_float(interval, rng))

        return (commonprefix([min_value, max_value]) + "*" + str(value), float(value))
        # If a randomly selected value from the interval is not safe, finds a
        # common prefix of the strings encoded as interval boundaries and
        # appends "*" and a random number to ensure that the count of distinct
        # values approximates that in the original data.
        min_value = int(interval.min)
        # max_value is inclusive
        max_value = min(int(interval.max) - 1, len(self.value_map) - 1)
        value = rng.randint(min_value, max_value)
        if value in self.safe_values:
            return (self.value_map[value], float(value))
        else:
            return (
                commonprefix([self.value_map[min_value], self.value_map[max_value]]) + "*" + str(value),
                float(value),
            )

    def analyze_tree(self, root: Node) -> None:
        # Marks as safe every value whose 1-dim leaf is a singularity passing
        # the low-count filter (LCF); such values can be released verbatim.
        def analyze_tree_walk(node: Node) -> None:
            if isinstance(node, Leaf):
                low_threshold = node.context.anonymization_context.anonymization_params.low_count_params.low_threshold
                if node.is_singularity() and node.is_over_threshold(low_threshold):
                    self.safe_values.add(int(node.actual_intervals[0].min))
            elif isinstance(node, Branch):
                for child_node in node.children.values():
                    analyze_tree_walk(child_node)

        analyze_tree_walk(root)


def _generate_float(interval: Interval, rng: Random) -> float:
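For intuition, a hypothetical walk-through of the new mapping (the column and the safe set below are invented for illustration; compare `_map_interval` above):

```python
from os.path import commonprefix

# Sorted distinct values of an invented string column.
value_map = ["apple", "apricot", "avocado", "banana"]
# Suppose analyze_tree marked only index 1 ("apricot") as safe, i.e. its
# 1-dim leaf is a singularity that passes the low-count filter.
safe_values = {1}

# Interval [0.0, 3.0): the inclusive index range is 0..2.
# Drawing index 1 returns the exact value:
assert 1 in safe_values and value_map[1] == "apricot"

# Drawing index 2 is not safe, so only the common prefix of the interval
# boundaries is released, plus "*" and the drawn integer:
assert commonprefix([value_map[0], value_map[2]]) + "*" + "2" == "a*2"
```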
2 changes: 2 additions & 0 deletions syndiffix/synthesizer.py
@@ -98,6 +98,8 @@ def __init__(
        )

        self.clusters, self.entropy_1dim = clustering.build_clusters(self.forest)
        # Give each convertor a pass over its column's 1-dim tree (collects
        # safe string values for StringConvertor).
        for col_id, converter in enumerate(self.column_convertors):
            converter.analyze_tree(self.forest.get_tree((ColumnId(col_id),)))

    def sample(self) -> pd.DataFrame:
        def materialize_tree(forest: Forest, columns: list[ColumnId]) -> tuple[list[MicrodataRow], Combination]:
79 changes: 79 additions & 0 deletions tests/test_microdata.py
@@ -1,16 +1,52 @@
import random
import string
from io import StringIO
from random import Random

import numpy as np
import pandas as pd
import pytest

from syndiffix import Synthesizer
from syndiffix.bucket import Bucket
from syndiffix.interval import Interval
from syndiffix.microdata import *

from .conftest import *

_rng = Random(0)


def _make_safe_values_df() -> pd.DataFrame:
    # Each column in this dataframe has 10 instances each of 30 distinct strings.
    # This ensures that each distinct string is safe (passes LCF). However, since
    # there are 30^3 possible 3-dim combinations, there won't be any singularity
    # 3-dim buckets.
    columns = ["a", "b", "c"]
    values = []
    for i, column in enumerate(columns):
        values.append([column + str(x) for x in range(1, 31)])
        values[i] = values[i] * 10
        np.random.shuffle(values[i])
    return pd.DataFrame(
        {
            columns[0]: values[0],
            columns[1]: values[1],
            columns[2]: values[2],
        }
    )


def _tweak_safe_values_df(df: pd.DataFrame, values_to_tweak: list[int] = [29]) -> None:
    # This takes one or more distinct values in each column and changes every
    # instance to a random value, thus ensuring that some 1-dim values will
    # fail LCF, producing non-singularity leaves.
    def ran_str10() -> str:
        return "".join(random.choice(string.ascii_letters) for _ in range(10))

    for column in df.columns:
        for value_to_tweak in values_to_tweak:
            df[column] = df[column].apply(lambda x: ran_str10() if str(x).endswith(str(value_to_tweak)) else x)


def _get_convertors(df: pd.DataFrame) -> list[DataConvertor]:
    return [get_convertor(df, column) for column in df.columns]

@@ -164,3 +200,46 @@ def test_empty_bucket_list() -> None:

def test_empty_interval_list() -> None:
    assert generate_microdata([Bucket((), 2)], [], [], _rng) == [[], []]


def test_safe_values_set_all() -> None:
    data = _make_safe_values_df()
    convertors = _get_convertors(data)
    results = apply_convertors(convertors, data)
    assert results.shape == data.shape
    forest = create_forest(results)
    for col_id, converter in enumerate(convertors):
        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
    for col_id, column in enumerate(data.columns):
        assert data[column].nunique() == len(cast(StringConvertor, convertors[col_id]).safe_values)


def test_safe_values_set_most() -> None:
    data = _make_safe_values_df()
    nuniques = [data[col].nunique() for col in data.columns]
    _tweak_safe_values_df(data)
    convertors = _get_convertors(data)
    results = apply_convertors(convertors, data)
    assert results.shape == data.shape
    forest = create_forest(results)
    for col_id, converter in enumerate(convertors):
        converter.analyze_tree(forest.get_tree((ColumnId(col_id),)))
    for col_id, column in enumerate(data.columns):
        assert nuniques[col_id] == len(cast(StringConvertor, convertors[col_id]).safe_values) + 1


def test_safe_values_e2e_all() -> None:
    data = _make_safe_values_df()
    syn_data = Synthesizer(data).sample()
    for column in syn_data:
        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() == 0


def test_safe_values_e2e_some() -> None:
    data = _make_safe_values_df()
    # By tweaking multiple distinct values, we ensure that there will be buckets
    # with no safe values, thus forcing "*" values.
    _tweak_safe_values_df(data, [20, 21, 22, 23, 24, 25, 26])
    syn_data = Synthesizer(data).sample()
    for column in syn_data:
        assert syn_data[column].apply(lambda x: "*" in str(x)).sum() != 0
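
The new tests can be run selectively; a hedged example using pytest's Python entry point (assuming the repository's standard pytest layout, equivalent to `pytest -k safe_values` from the repo root):

```python
import pytest

# Run only the safe-values tests added in this commit.
pytest.main(["tests/test_microdata.py", "-k", "safe_values"])
```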
