Skip to content

Commit

Permalink
Update Hashing logic in the Anonymiser (#190)
Browse files Browse the repository at this point in the history
* Update first name and last name logic

* Add get_faker_rng fn

* Add user_id to transform

* add fake email and company name logic

update lock file

Update company name and email fn

add test for first name with id hash

* pass user_id to row parser file

get id from table instead of transformer arg

rename arg to id

* Add hashing transformers

Get id from transformer arg

* Get id from transformer arg

Revert fake_full_name fn

Update row_parser

Update hashed_first_name test

* Combine fake and hash fns

* Pass id None in Fake transformer type

* Rename TransformerType from hashed to deterministic

* Update fake_company_name_with_unique_arg test

* Use same transformer fns with deterministic flag

* rename id-column to id_column

* Move deterministic validation on higher level

---------

Co-authored-by: Meedaxa Ahmed <meedaxa.ahmed@gmail.com>
  • Loading branch information
aishwaryavora and meeday authored Feb 27, 2025
1 parent dab0581 commit 4ee539e
Show file tree
Hide file tree
Showing 9 changed files with 1,017 additions and 477 deletions.
769 changes: 386 additions & 383 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ mimalloc = "0.1.43"
log = "0.4.22"
zstd = "0.13.2"
colored = "2.1.0"
sha2 = "0.10"

[dev-dependencies]
pretty_assertions = "1.4.0"
32 changes: 29 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ Transforming table data requires a list of all table columns with a transformer
- FakeCompanyName * - Random Company Name from [faker](https://github.com/cksac/fake-rs)
- FakeEmail * - Random email address from [faker](https://github.com/cksac/fake-rs)
- FakeEmailOrPhone * - Either a random phone number OR a random email depending on whether the existing data starts with a `+` and doesn't contain an `@` symbol or not!
- FakeFirstName - Random first name from [faker](https://github.com/cksac/fake-rs)
- FakeFirstName - Random first name from [faker](https://github.com/cksac/fake-rs). Supports deterministic generation by setting `deterministic: true` and providing an `id_column` argument
- FakeFullAddress - Random address made up of segments from [faker](https://github.com/cksac/fake-rs)
- FakeFullName - Random first plus last name from [faker](https://github.com/cksac/fake-rs)
- FakeFullName - Random first plus last name from [faker](https://github.com/cksac/fake-rs). Supports deterministic generation by setting `deterministic: true` and providing an `id_column` argument
- FakeIPv4 - Random IPV4 address from [faker](https://github.com/cksac/fake-rs)
- FakeLastName - Random last name from [faker](https://github.com/cksac/fake-rs)
- FakeLastName - Random last name from [faker](https://github.com/cksac/fake-rs). Supports deterministic generation by setting `deterministic: true` and providing an `id_column` argument
- FakeNationalIdentityNumber - Random National Insurance number from list of dummy numbers
- FakePhoneNumber - Random phone number (looks at existing numbers country code, supports GB + US)
- FakePostCode - Truncates postcode to the first 3 chars e.g. NW5
Expand Down Expand Up @@ -121,3 +121,29 @@ Transformers with a * support the arg `unique` which will append an incrementing
}
},
```

Transformers whose description in the list above mentions deterministic generation (for example FakeFirstName, FakeFullName and FakeLastName) support it by setting `deterministic: true` and providing an `id_column` argument. This ensures the same input and ID always generate the same fake data.

Example of deterministic name generation:
```json
{
"data_category": "Pii",
"description": "user's first name",
"name": "first_name",
"transformer": {
"name": "FakeFirstName",
"args": {
"deterministic": "true",
"id_column": "user_account_id"
}
}
}
```

When using deterministic mode:
- The same input value and ID will always generate the same fake name
- The `id_column` must reference a valid ID column in the same table (e.g., "user_id", "user_account_id" or "registrant_id")
- If `deterministic` is true but the specified ID column is missing or invalid, the transformer will raise an error
- Different IDs will generate different names, even for the same input value

This is useful when you need consistent fake names across multiple database dumps or when maintaining referential integrity between tables.
94 changes: 53 additions & 41 deletions src/fixers/fixer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ use crate::parsers::strategy_errors::StrategyFileError;
use crate::parsers::strategy_file;

pub fn can_fix(error: &StrategyFileError) -> bool {
match error {
StrategyFileError::ValidationError(validation_error) => {
match *error {
StrategyFileError::ValidationError(ref validation_error) => {
!validation_error.duplicate_columns.is_empty()
}
StrategyFileError::DbMismatchError(db_mismatch_error) => {
StrategyFileError::DbMismatchError(ref db_mismatch_error) => {
!db_mismatch_error.missing_from_strategy_file.is_empty()
|| !db_mismatch_error.missing_from_db.is_empty()
}
Expand Down Expand Up @@ -35,13 +35,13 @@ pub fn fix(strategy_file: &str, error: StrategyFileError) {
let current_file_contents = strategy_file::read(strategy_file).unwrap_or_else(|_| Vec::new());
match error {
StrategyFileError::ValidationError(validation_error) => {
let new_file_contents = validation::fix(current_file_contents, validation_error);
let new_file_contents = validation::fix(current_file_contents, *validation_error);

strategy_file::write(strategy_file, new_file_contents)
.expect("Unable to write to file :(");
}
StrategyFileError::DbMismatchError(db_mismatch_error) => {
let new_file_contents = db_mismatch::fix(current_file_contents, db_mismatch_error);
let new_file_contents = db_mismatch::fix(current_file_contents, *db_mismatch_error);

strategy_file::write(strategy_file, new_file_contents)
.expect("Unable to write to file :(");
Expand All @@ -61,57 +61,66 @@ mod tests {

#[test]
fn cannot_fix_db_mismatch_error_if_no_missing_columns() {
assert!(!can_fix(&StrategyFileError::DbMismatchError(DbErrors {
missing_from_db: Vec::new(),
missing_from_strategy_file: Vec::new(),
})));
assert!(!can_fix(&StrategyFileError::DbMismatchError(Box::new(
DbErrors {
missing_from_db: Vec::new(),
missing_from_strategy_file: Vec::new(),
}
))));
}
#[test]
fn can_fix_db_mismatch_error_if_missing_from_db_and_strategy() {
assert!(can_fix(&StrategyFileError::DbMismatchError(DbErrors {
missing_from_db: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
missing_from_strategy_file: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
})));
assert!(can_fix(&StrategyFileError::DbMismatchError(Box::new(
DbErrors {
missing_from_db: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
missing_from_strategy_file: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
}
))));
}

#[test]
fn can_fix_db_mismatch_error_if_missing_from_db_only() {
assert!(can_fix(&StrategyFileError::DbMismatchError(DbErrors {
missing_from_db: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
missing_from_strategy_file: Vec::new(),
})));
assert!(can_fix(&StrategyFileError::DbMismatchError(Box::new(
DbErrors {
missing_from_db: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
missing_from_strategy_file: Vec::new(),
}
))));
}

#[test]
fn can_fix_db_mismatch_error_if_missing_from_strategy_file_only() {
assert!(can_fix(&StrategyFileError::DbMismatchError(DbErrors {
missing_from_db: Vec::new(),
missing_from_strategy_file: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
})));
assert!(can_fix(&StrategyFileError::DbMismatchError(Box::new(
DbErrors {
missing_from_db: Vec::new(),
missing_from_strategy_file: vec![SimpleColumn {
column_name: "column".to_string(),
table_name: "table".to_string()
}],
}
))));
}
#[test]
fn cannot_fix_validation_error_if_no_errors() {
assert!(!can_fix(&StrategyFileError::ValidationError(
assert!(!can_fix(&StrategyFileError::ValidationError(Box::new(
ValidationErrors {
unknown_data_categories: Vec::new(),
error_transformer_types: Vec::new(),
unanonymised_pii: Vec::new(),
duplicate_columns: Vec::new(),
duplicate_tables: Vec::new(),
deterministic_without_id: Vec::new(),
}
)));
))));
}

#[test]
Expand All @@ -120,15 +129,16 @@ mod tests {
column_name: "column".to_string(),
table_name: "table".to_string(),
}];
assert!(!can_fix(&StrategyFileError::ValidationError(
assert!(!can_fix(&StrategyFileError::ValidationError(Box::new(
ValidationErrors {
unknown_data_categories: error.clone(),
error_transformer_types: error.clone(),
unanonymised_pii: error,
duplicate_columns: Vec::new(),
duplicate_tables: Vec::new(),
deterministic_without_id: Vec::new(),
}
)));
))));
}

#[test]
Expand All @@ -137,28 +147,30 @@ mod tests {
table_name: "table_name".to_string(),
column_name: "column".to_string(),
}];
assert!(can_fix(&StrategyFileError::ValidationError(
assert!(can_fix(&StrategyFileError::ValidationError(Box::new(
ValidationErrors {
unknown_data_categories: Vec::new(),
error_transformer_types: Vec::new(),
unanonymised_pii: Vec::new(),
duplicate_columns: error,
duplicate_tables: Vec::new(),
deterministic_without_id: Vec::new(),
}
)));
))));
}

#[test]
fn cannot_currently_fix_duplicate_tables() {
let error = vec!["table_name".to_string()];
assert!(!can_fix(&StrategyFileError::ValidationError(
assert!(!can_fix(&StrategyFileError::ValidationError(Box::new(
ValidationErrors {
unknown_data_categories: Vec::new(),
error_transformer_types: Vec::new(),
unanonymised_pii: Vec::new(),
duplicate_columns: Vec::new(),
duplicate_tables: error,
deterministic_without_id: Vec::new(),
}
)));
))));
}
}
14 changes: 9 additions & 5 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ fn main() -> Result<(), std::io::Error> {
Err(err) => {
println!("{}", err);
println!("Ok! lets try and fix some of this!");
fixer::fix(&strategy_file, err);
fixer::fix(&strategy_file, *err);
println!("All done, you probably want to run \"check-strategies\" again to make sure");
}
},
Expand All @@ -116,7 +116,7 @@ fn main() -> Result<(), std::io::Error> {
Ok(()) => println!("All up to date"),
Err(err) => {
if fixer::can_fix(&err) {
fixer::fix(&strategy_file, err);
fixer::fix(&strategy_file, *err);
println!("All done, you'll need to set a data_type and transformer for those fields");
}
std::process::exit(1);
Expand Down Expand Up @@ -151,16 +151,20 @@ fn read_strategy_file(strategy_file: &str, db_url: &str) -> Result<Vec<StrategyI
fn strategy_differences(
strategies: Vec<StrategyInFile>,
db_url: String,
) -> Result<(), StrategyFileError> {
) -> Result<(), Box<StrategyFileError>> {
let transformer = TransformerOverrides::none();
let parsed_strategies = Strategies::from_strategies_in_file(strategies, &transformer)?;
let parsed_strategies = Strategies::from_strategies_in_file(strategies, &transformer)
.map_err(|e| Box::new(StrategyFileError::ValidationError(Box::new(*e))))?;

let builder = TlsConnector::builder();
let connector =
MakeTlsConnector::new(builder.build().expect("should be able to create builder!"));

let mut client = postgres::Client::connect(&db_url, connector).expect("expected to connect!");
let db_columns = db_schema::parse(&mut client);
parsed_strategies.validate_against_db(db_columns)?;
parsed_strategies
.validate_against_db(db_columns)
.map_err(|e| Box::new(StrategyFileError::DbMismatchError(Box::new(e))))?;
Ok(())
}

Expand Down
12 changes: 10 additions & 2 deletions src/parsers/row_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ fn transform_row_with_columns(
columns: &[ColumnInfo],
types: &Types,
) -> String {
let column_values = data_row::split(line);
let column_values: Vec<String> = data_row::split(line).map(|s| s.to_string()).collect();

let mut transformed = column_values.enumerate().map(|(i, value)| {
let mut transformed = column_values.iter().enumerate().map(|(i, value)| {
let current_column = &columns[i];
let column_type = types
//TODO this lookup, we do a double hashmap lookup for every column... already know the
Expand All @@ -137,12 +137,20 @@ fn transform_row_with_columns(
)
});

// Create a vector of (column_name, value) pairs
let column_name_values: Vec<(String, String)> = columns
.iter()
.zip(column_values.iter())
.map(|(col, val)| (col.name.clone(), val.clone()))
.collect();

transformer::transform(
rng,
value,
column_type,
&current_column.transformer,
table_name,
&column_name_values,
)
});

Expand Down
Loading

0 comments on commit 4ee539e

Please sign in to comment.