diff --git a/.travis.yml b/.travis.yml index c2798e76..9ac53b1c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ install: - travis_retry pip install -e . script: - - if [ "${INCLUDE_NB_TEST}" == "1" ]; then pytest --cov=clkhash --nbval-lax; else pytest --cov=clkhash; fi + - pytest --cov=clkhash - codecov @@ -48,28 +48,33 @@ jobs: - python: '3.6' env: - INCLUDE_CLI=1 - - INCLUDE_NB_TEST=1 - python: '2.7' env: - INCLUDE_CLI=1 - # OSX + Python is officially supported by Travis CI as of April 2011 # https://docs.travis-ci.com/user/reference/osx/ - os: osx osx_image: xcode8.3 python: "3.6-dev" + - stage: Integration + name: Test Notebooks + python: 3.7 + before_install: + - travis_retry pip install -U -r docs/doc-requirements.txt + script: + - pytest --nbval docs -x --sanitize-with docs/tutorial_sanitize.cfg + - stage: Integration python: '3.8-dev' env: - TEST_ENTITY_SERVICE=https://testing.es.data61.xyz - INCLUDE_CLI=1 - stage: Integration - python: '3.6' + python: '3.7' env: - TEST_ENTITY_SERVICE=https://testing.es.data61.xyz - INCLUDE_CLI=1 - - INCLUDE_NB_TEST=1 - stage: Integration python: '2.7' env: diff --git a/CHANGELOG.md b/CHANGELOG.md index b5604fa2..730808d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.13.0 +- Fix example and test linkage schemas using v2. - Fix mismatch between double hash and blake hash key requirement. - Update to use newer anonlink-entity-service api. - Updates to dependencies. diff --git a/clkhash/__init__.py b/clkhash/__init__.py index 13c74beb..f6b1e9fa 100644 --- a/clkhash/__init__.py +++ b/clkhash/__init__.py @@ -1,10 +1,11 @@ import pkg_resources from . import bloomfilter, field_formats, key_derivation, schema, randomnames, describe +from .schema import Schema try: __version__ = pkg_resources.get_distribution('clkhash').version except pkg_resources.DistributionNotFound: __version__ = "development" -__author__ = 'N1 Analytics' +__author__ = "Data61" diff --git a/clkhash/bloomfilter.py b/clkhash/bloomfilter.py index eb5b3e4a..d3f55382 100644 --- a/clkhash/bloomfilter.py +++ b/clkhash/bloomfilter.py @@ -319,9 +319,10 @@ def crypto_bloom_filter(record, # type: Sequence[Text] if fhp: ngrams = list(tokenize(field.format_value(entry))) hash_function = hashing_function_from_properties(fhp) - bloomfilter |= hash_function(ngrams, key, - fhp.ks(len(ngrams)), - hash_l, fhp.encoding) + if ngrams: + bloomfilter |= hash_function(ngrams, key, + fhp.ks(len(ngrams)), + hash_l, fhp.encoding) c1 = bloomfilter.count() bloomfilter = fold_xor(bloomfilter, schema.xor_folds) diff --git a/clkhash/cli.py b/clkhash/cli.py index c8c04b58..6cd8b547 100644 --- a/clkhash/cli.py +++ b/clkhash/cli.py @@ -15,6 +15,7 @@ run_get_status, project_create, run_create, server_get_status, ServiceError, format_run_status, watch_run_status) +from clkhash.schema import SchemaError DEFAULT_SERVICE_URL = 'https://es.data61.xyz' @@ -68,8 +69,11 @@ def hash(pii_csv, keys, schema, clk_json, quiet, no_header, check_header, valida Use "-" for CLK_JSON to write JSON to stdout. 
""" - - schema_object = clkhash.schema.from_json_file(schema_file=schema) + try: + schema_object = clkhash.schema.from_json_file(schema_file=schema) + except SchemaError as e: + log(str(e)) + raise SystemExit(-1) header = True if not check_header: header = 'ignore' @@ -92,7 +96,7 @@ def hash(pii_csv, keys, schema, clk_json, quiet, no_header, check_header, valida log("CLK data written to {}".format(clk_json.name)) -@cli.command('status', short_help='Get status of entity service') +@cli.command('status', short_help='get status of entity service') @click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol") @click.option('-o', '--output', type=click.File('w'), default='-') @click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative") @@ -141,7 +145,7 @@ def status(server, output, verbose): @click.option('--name', type=str, help="Name to give this project") @click.option('--parties', default=2, type=int, help="Number of parties in the project") -@click.option('-o','--output', type=click.File('w'), default='-') +@click.option('-o', '--output', type=click.File('w'), default='-') @click.option('-v', '--verbose', is_flag=True, help="Script is more talkative") def create_project(type, schema, server, name, parties, output, verbose): """Create a new project on an entity matching server. @@ -171,7 +175,7 @@ def create_project(type, schema, server, name, parties, output, verbose): except ServiceError as e: log("Unexpected response - {}".format(e.status_code)) log(e.text) - raise SystemExit + raise SystemExit(-1) else: log("Project created") @@ -318,6 +322,28 @@ def generate_default_schema(output): shutil.copyfile(original_path, output) +@cli.command('validate-schema', short_help="validate linkage schema") +@click.argument('schema', type=click.File('r', lazy=True)) +def validate_schema(schema): + """Validate a linkage schema + + Given a file containing a linkage schema, verify the schema is valid otherwise + print detailed errors. + """ + + try: + clkhash.schema.from_json_file( + schema_file=schema, + validate=True + ) + + log("schema is valid", color='green') + + except SchemaError as e: + log(str(e)) + raise SystemExit(-1) + + if __name__ == "__main__": freeze_support() cli() diff --git a/clkhash/data/randomnames-schema.json b/clkhash/data/randomnames-schema.json index 2a506d15..0986073a 100644 --- a/clkhash/data/randomnames-schema.json +++ b/clkhash/data/randomnames-schema.json @@ -1,4 +1,3 @@ - { "version": 1, "clkConfig": { diff --git a/clkhash/field_formats.py b/clkhash/field_formats.py index d34c9cc9..b5ff26f5 100644 --- a/clkhash/field_formats.py +++ b/clkhash/field_formats.py @@ -25,11 +25,13 @@ class InvalidEntryError(ValueError): class InvalidSchemaError(ValueError): - """ The schema is not valid. + """Raised if the schema of a field specification is invalid. - This exception is raised if, for example, a regular expression - included in the schema is not syntactically correct. + For example, a regular expression included in the schema is not + syntactically correct. """ + json_field_spec = None # type: Optional[dict] + field_spec_index = None # type: Optional[int] class MissingValueSpec(object): @@ -161,19 +163,17 @@ def fhp_from_json_dict( """ Make a :class:`FieldHashingProperties` object from a dictionary. - :param dict json_dict: - The dictionary must have have an 'ngram' key - and one of k or num_bits. It may have - 'positional' key; if missing a default is used. 
- The encoding is - always set to the default value. - :return: A :class:`FieldHashingProperties` instance. + :param dict json_dict: + Conforming to the `hashingConfig` definition + in the `v2` linkage schema. + :return: A :class:`FieldHashingProperties` instance. """ + hashing_strategy = json_dict['strategy'] h = json_dict.get('hash', {'type': 'blakeHash'}) - num_bits = json_dict.get('numBits') - k = json_dict.get('k') - if not num_bits and not k: - num_bits = 200 # default for v2 schema + + num_bits = hashing_strategy.get('numBits') + k = hashing_strategy.get('k') + return FieldHashingProperties( ngram=json_dict['ngram'], positional=json_dict.get( @@ -263,7 +263,6 @@ def validate(self, str_in): e_new.field_spec = self raise_from(e_new, err) - def is_missing_value(self, str_in): # type: (Text) -> bool """ tests if 'str_in' is the sentinel value for this field @@ -441,6 +440,7 @@ def from_json_dict(cls, except (SyntaxError, re.error) as e: msg = "Invalid regular expression '{}.'".format(pattern) e_new = InvalidSchemaError(msg) + e_new.json_field_spec = json_dict raise_from(e_new, e) result.regex_based = True @@ -843,9 +843,10 @@ def spec_from_json_dict( json_dict # type: Dict[str, Any] ): # type: (...) -> FieldSpec - """ Turns a dictionary into the appropriate object. + """ Turns a dictionary into the appropriate FieldSpec object. :param dict json_dict: A dictionary with properties. + :raises InvalidSchemaError: :returns: An initialised instance of the appropriate FieldSpec subclass. """ diff --git a/clkhash/schema.py b/clkhash/schema.py index 30ac40af..d8fd4d8f 100644 --- a/clkhash/schema.py +++ b/clkhash/schema.py @@ -15,7 +15,7 @@ from future.builtins import map from clkhash.backports import raise_from -from clkhash.field_formats import FieldSpec, spec_from_json_dict +from clkhash.field_formats import FieldSpec, spec_from_json_dict, InvalidSchemaError from clkhash.key_derivation import DEFAULT_KEY_SIZE as DEFAULT_KDF_KEY_SIZE MASTER_SCHEMA_FILE_NAMES = {1: 'v1.json', @@ -26,6 +26,23 @@ class SchemaError(Exception): """ The user-defined schema is invalid. """ + def __init__(self, + msg, # type: str + errors=None # type: Optional[Sequence[InvalidSchemaError]] + ): + # type: (...) -> None + self.msg = msg + self.errors = [] if errors is None else errors + super(SchemaError, self).__init__(msg) + + def __str__(self): + detail = "" + for i, e in enumerate(self.errors, start=1): + detail += "Error {} in feature at index {} - {}\n".format(i, e.field_spec_index, str(e)) + detail += "Invalid spec:\n{}\n---\n".format(e.json_field_spec) + + return self.msg + '\n\n' + detail + class MasterSchemaError(Exception): """ Master schema missing? Corrupted? Otherwise surprising? This is @@ -50,7 +67,7 @@ def __init__(self, # type: (...) -> None """ Create a Schema. :param fields: the features or field definitions - :param l: The length of the resulting hash in bits. This is the + :param l: The length of the resulting encoding in bits. This is the length after XOR folding. :param xor_folds: The number of XOR folds to perform on the hash. :param kdf_type: The key derivation function to use. 
Currently, @@ -87,16 +104,17 @@ def convert_v1_to_v2( :param dict: v1 schema dict :return: v2 schema dict """ + dict = deepcopy(dict) version = dict['version'] if version != 1: raise ValueError('Version {} not 1'.format(version)) clk_config = dict['clkConfig'] - k = clk_config['k'] + k = clk_config.pop('k') clk_hash = clk_config['hash'] def convert_feature(f): - if 'ignored' in f: + if f.get('ignored', False): return f hashing = f['hashing'] @@ -113,7 +131,8 @@ def convert_feature(f): if 'weight' in hashing: del hashing['weight'] - hashing['k'] = int(round(weight * k)) + hashing['strategy'] = {} + hashing['strategy']['k'] = int(round(weight * k)) hashing['hash'] = clk_hash return x @@ -140,6 +159,8 @@ def from_json_dict(dct, validate=True): key with all the globals. :param validate: (default True) Raise an exception if the schema does not conform to the master schema. + :raises SchemaError: An exception containing details about why + the schema is not valid. :return: the Schema """ if validate: @@ -173,8 +194,23 @@ def from_json_dict(dct, validate=True): else None) kdf_key_size = kdf.get('keySize', DEFAULT_KDF_KEY_SIZE) - fields = list(map(spec_from_json_dict, dct['features'])) - return Schema(fields, l, xor_folds, + # Try to parse each feature config and store any errors encountered + # for reporting. + feature_errors = [] + feature_configs = [] + + for i, feature_config in enumerate(dct['features']): + try: + feature_configs.append(spec_from_json_dict(feature_config)) + except InvalidSchemaError as e: + e.field_spec_index = i + e.json_field_spec = feature_config + feature_errors.append(e) + + if len(feature_errors): + raise SchemaError("Schema was invalid", feature_errors) + + return Schema(feature_configs, l, xor_folds, kdf_type, kdf_hash, kdf_info, kdf_salt, kdf_key_size) @@ -199,15 +235,16 @@ def from_json_file(schema_file, validate=True): def _get_master_schema(version): - # type: (Hashable) -> bytes - """ Loads the master schema of given version as bytes. + # type: (Hashable) -> dict + """ Loads the master schema of given version :param version: The version of the master schema whose path we wish to retrieve. :raises SchemaError: When the schema version is unknown. This usually means that either (a) clkhash is out of date, or (b) the schema version listed is incorrect. - :return: Bytes of the schema. + :raises MasterSchemaError: When the master schema is invalid. + :return: Dict object of the (json) master schema. """ try: file_name = MASTER_SCHEMA_FILE_NAMES[version] @@ -218,6 +255,10 @@ def _get_master_schema(version): try: schema_bytes = pkgutil.get_data('clkhash', 'schemas/{}'.format(file_name)) + if schema_bytes is None: + msg = ('The master schema could not be loaded. The schema cannot be ' + 'validated. Please file a bug report.') + raise MasterSchemaError(msg) except IOError as e: # In Python 3 we can be more specific with # FileNotFoundError, but that doesn't exist in # Python 2. @@ -225,12 +266,16 @@ def _get_master_schema(version): 'validated. Please file a bug report.') raise_from(MasterSchemaError(msg), e) - if schema_bytes is None: - msg = ('The master schema could not be loaded. The schema cannot be ' - 'validated. Please file a bug report.') - raise MasterSchemaError(msg) - - return schema_bytes + try: + master_schema = json.loads(schema_bytes.decode('utf-8')) + return master_schema + except ValueError as e: + # In Python 3 we can be more specific with + # json.decoder.JSONDecodeError, but that + # doesn't exist in Python 2. 
+ msg = ('The master schema is not a valid JSON file. The schema cannot ' + 'be validated. Please file a bug report.') + raise_from(MasterSchemaError(msg), e) def validate_schema_dict(schema): @@ -254,20 +299,12 @@ def validate_schema_dict(schema): else: raise SchemaError('A format version is expected in the schema.') - master_schema_bytes = _get_master_schema(version) - try: - master_schema = json.loads(master_schema_bytes.decode('utf-8')) - except ValueError as e: # In Python 3 we can be more specific with - # json.decoder.JSONDecodeError, but that - # doesn't exist in Python 2. - msg = ('The master schema is not a valid JSON file. The schema cannot ' - 'be validated. Please file a bug report.') - raise_from(MasterSchemaError(msg), e) + master_schema = _get_master_schema(version) try: jsonschema.validate(schema, master_schema) except jsonschema.exceptions.ValidationError as e: - raise_from(SchemaError('The schema is not valid.'), e) + raise_from(SchemaError('The schema is not valid.\n\n' + str(e)), e) except jsonschema.exceptions.SchemaError as e: msg = ('The master schema is not valid. The schema cannot be ' 'validated. Please file a bug report.') diff --git a/clkhash/schemas/v1.json b/clkhash/schemas/v1.json index 1a2f2b63..ec2d9fde 100644 --- a/clkhash/schemas/v1.json +++ b/clkhash/schemas/v1.json @@ -1,5 +1,5 @@ { - "id": "https://schema.n1analytics.com/entityservice/hashing-schema.json", + "id": "https://data61.github.io/schemas/clkhash/linkage-schema-v1.json", "$schema": "http://json-schema.org/schema#", "title": "hashing-schema", "type": "object", diff --git a/clkhash/schemas/v2.json b/clkhash/schemas/v2.json index 11eded0a..146e47f8 100644 --- a/clkhash/schemas/v2.json +++ b/clkhash/schemas/v2.json @@ -1,7 +1,7 @@ { - "id": "https://schema.n1analytics.com/entityservice/hashing-schema.json", - "$schema": "http://json-schema.org/schema#", - "title": "hashing-schema", + "$id": "https://data61.github.io/schemas/clkhash/linkage-schema-v2.json", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "linkage-schema", "type": "object", "required": ["version", "clkConfig", "features"], "properties": { @@ -29,17 +29,17 @@ "default": 1024, "description": "the length of a clk in number of bits" }, - "xorFolds": { - "type": "integer", - "minimum": 0, - "default": 0, - "description": "number of XOR folds. Note that the parameter 'l' describes the length of the clk AFTER the XOR folding!" - }, "kdf": { "type": "object", "oneOf": [ {"$ref": "#/definitions/hkdf"} ] + }, + "xorFolds": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "number of XOR folds. Note that the parameter 'l' describes the length of the clk AFTER the XOR folding!" 
} } }, @@ -47,7 +47,7 @@ "type": "object", "required": ["type"], "properties": { - "type": {"enum": ["HKDF"]}, + "type": {"type": "string", "const": "HKDF"}, "hash": {"enum": ["SHA256", "SHA512"], "default": "SHA256"}, "salt": {"type": "string", "description": "base64 encoded bytes"}, "info": {"type": "string", "description": "base64 encoded bytes"}, @@ -56,10 +56,12 @@ }, "ignoreFeature": { "type": "object", - "required": ["identifier","ignored"], + "required": ["identifier", "ignored"], "properties": { "identifier": {"type": "string", "description":"semantic meaning of identifier"}, - "ignored": {"enum": [true]}, + "ignored": { + "const": true + }, "description": {"type": "string"} } }, @@ -69,7 +71,9 @@ "properties": { "identifier": {"type": "string", "description":"semantic meaning of identifier"}, "description": {"type": "string"}, - "format": {"oneOf": [ + "format": { + "required": ["type"], + "oneOf": [ {"$ref": "#/definitions/textFormat"}, {"$ref": "#/definitions/textPatternFormat"}, {"$ref": "#/definitions/numberFormat"}, @@ -136,7 +140,8 @@ }, "hashingConfig": { "type": "object", - "required": ["ngram"], + "additionalProperties": false, + "required": ["ngram", "strategy"], "properties": { "hash": {"type": "object", "default": {"type": "blakeHash"}, "oneOf": [ {"$ref": "#/definitions/doubleHash"}, @@ -144,9 +149,26 @@ ], "description": "this construction is for future proofing. We might want to parameterize the hash functions one day..." }, - "strategy": {"type": "object", "default": {"numBits": 200}, "oneOf": [ - {"numBits": {"type": "number", "minimum": 1, "default": 200, "description": "dynamic k = numBits / number of n-grams"}}, - {"k": {"type": "number", "minimum": 1, "default": 20, "description": "max number of bits per n-gram"}} + "strategy": {"type": "object", "oneOf": + [ + { + "required": ["numBits"], + "numBits": { + "type": "number", + "minimum": 1, + "default": 200, + "description": "dynamic k = numBits / number of n-grams" + } + }, + { + "required": ["k"], + "k": { + "type": "number", + "minimum": 1, + "default": 20, + "description": "max number of bits per n-gram" + } + } ]}, "ngram": {"type": "integer", "minimum": 1, "default": 2}, "positional": {"type": "boolean", "default": false}, diff --git a/docs/_static/febrl_schema_v2_final.json b/docs/_static/febrl_schema_v2_final.json new file mode 100644 index 00000000..2c6d196b --- /dev/null +++ b/docs/_static/febrl_schema_v2_final.json @@ -0,0 +1,69 @@ +{ + "version": 2, + "clkConfig": { + "l": 1024, + "kdf": { + "type": "HKDF", + "hash": "SHA256", + "info": "c2NoZW1hX2V4YW1wbGU=", + "salt": "SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==", + "keySize": 64 + } + }, + "features": [ + { + "identifier": "rec_id", + "ignored": true + }, + { + "identifier": "given_name", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 64 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200}, "hash": {"type": "doubleHash"} } + }, + { + "identifier": "surname", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 64 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200}, "hash": {"type": "doubleHash"} } + }, + { + "identifier": "street_number", + "format": { "type": "integer" }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 100}, "missingValue": {"sentinel": ""} } + }, + { + "identifier": "address_1", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 100} } + }, + { + "identifier": 
"address_2", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 100} } + }, + { + "identifier": "suburb", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 100} } + }, + { + "identifier": "postcode", + "format": { "type": "integer", "minimum": 50, "maximum": 9999 }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 100} } + }, + { + "identifier": "state", + "format": { "type": "string", "encoding": "utf-8"}, + "hashing": {"ngram": 2, "positional": true, "strategy": {"numBits": 100}, "missingValue": {"sentinel": ""} + } + }, + { + "identifier": "date_of_birth", + "format": { "type": "integer" }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 200}, "missingValue": {"sentinel": ""} } + }, + { + "identifier": "soc_sec_id", + "ignored": true + } + ] +} diff --git a/docs/_static/febrl_schema_v2_overweight.json b/docs/_static/febrl_schema_v2_overweight.json new file mode 100644 index 00000000..e2fb1376 --- /dev/null +++ b/docs/_static/febrl_schema_v2_overweight.json @@ -0,0 +1,68 @@ +{ + "version": 2, + "clkConfig": { + "l": 1024, + "kdf": { + "type": "HKDF", + "hash": "SHA256", + "info": "c2NoZW1hX2V4YW1wbGU=", + "salt": "SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==", + "keySize": 64 + } + }, + "features": [ + { + "identifier": "rec_id", + "ignored": true + }, + { + "identifier": "given_name", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 64 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 300}, "hash": {"type": "doubleHash"} } + }, + { + "identifier": "surname", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 64 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 300}, "hash": {"type": "doubleHash"} } + }, + { + "identifier": "street_number", + "format": { "type": "integer" }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 300}, "missingValue": {"sentinel": ""} } + }, + { + "identifier": "address_1", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 300} } + }, + { + "identifier": "address_2", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 300} } + }, + { + "identifier": "suburb", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 300} } + }, + { + "identifier": "postcode", + "format": { "type": "integer", "minimum": 100, "maximum": 9999 }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 300} } + }, + { + "identifier": "state", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 3 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 300} } + }, + { + "identifier": "date_of_birth", + "format": { "type": "integer" }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 300}, "missingValue": {"sentinel": ""} } + }, + { + "identifier": "soc_sec_id", + "ignored": true + } + ] +} diff --git a/docs/_static/febrl_schema_v2_reduced.json b/docs/_static/febrl_schema_v2_reduced.json new file mode 100644 index 00000000..578d4217 --- /dev/null +++ b/docs/_static/febrl_schema_v2_reduced.json @@ -0,0 +1,68 @@ +{ + "version": 2, + "clkConfig": { + "l": 1024, + "kdf": { + "type": "HKDF", + "hash": "SHA256", + "info": "c2NoZW1hX2V4YW1wbGU=", + "salt": 
"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==", + "keySize": 64 + } + }, + "features": [ + { + "identifier": "rec_id", + "ignored": true + }, + { + "identifier": "given_name", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 64 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200}, "hash": {"type": "doubleHash"} } + }, + { + "identifier": "surname", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 64 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200}, "hash": {"type": "doubleHash"} } + }, + { + "identifier": "street_number", + "format": { "type": "integer" }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 200}, "missingValue": {"sentinel": ""} } + }, + { + "identifier": "address_1", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200} } + }, + { + "identifier": "address_2", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200} } + }, + { + "identifier": "suburb", + "format": { "type": "string", "encoding": "utf-8" }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200} } + }, + { + "identifier": "postcode", + "format": { "type": "integer", "minimum": 100, "maximum": 9999 }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 200} } + }, + { + "identifier": "state", + "format": { "type": "string", "encoding": "utf-8", "maxLength": 3 }, + "hashing": { "ngram": 2, "strategy": {"numBits": 200} } + }, + { + "identifier": "date_of_birth", + "format": { "type": "integer" }, + "hashing": { "ngram": 1, "positional": true, "strategy": {"numBits": 200}, "missingValue": {"sentinel": ""} } + }, + { + "identifier": "soc_sec_id", + "ignored": true + } + ] +} diff --git a/docs/doc-requirements.txt b/docs/doc-requirements.txt index 84f975fb..71144912 100644 --- a/docs/doc-requirements.txt +++ b/docs/doc-requirements.txt @@ -1,3 +1,4 @@ +anonlink>=0.12 sphinx>=1.7 typing>=3.6 nbsphinx>=0.3 diff --git a/docs/schema.rst b/docs/schema.rst index 40161540..22692618 100644 --- a/docs/schema.rst +++ b/docs/schema.rst @@ -1,32 +1,39 @@ .. _schema: -Hashing Schema +Linkage Schema ============== -As CLKs are usually used for privacy preserving linkage, it is important that participating organisations agree on how -raw personally identifiable information is hashed to create the CLKs. +As CLKs are usually used for privacy preserving linkage, it is important that +participating organisations agree on how raw personally identifiable information +is encoded to create the CLKs. The linkage schema allows putting more emphasis on +particular features and provides a basic level of data validation. -We call the configuration of how to create CLKs a *hashing schema*. The organisations agree on one hashing schema -as configuration to ensure that their respective CLKs have been created in the same way. +We call the configuration of how to create CLKs a *linkage schema*. The +organisations agree on a linkage schema to ensure that their respective CLKs have +been created in the same way. -This aims to be an open standard such that different client implementations could take the schema -and create identical CLKS given the same data. +This aims to be an open standard such that different client implementations could +take the schema and create identical CLKs given the same data (and secret keys). 
-The hashing-schema is a detailed description of exactly what is fed to the hashing operation, -along with any configuration for the hashing itself. +The linkage schema is a detailed description of exactly how to carry out the +encoding operation, along with any configuration for the low level hashing itself. -The format of the hashing schema is defined in a separate ``JSON Schema`` document -`schemas/v1.json `_. +The format of the linkage schema is defined in a separate +`JSON Schema `_ specification document - +`schemas/v2.json `_. + +Earlier versions of the linkage schema will continue to work; internally they +are converted to the latest version (currently ``v2``). Basic Structure --------------- -A hashing schema consists of three parts: +A linkage schema consists of three parts: -* :ref:`version `, contains the version number of the hashing schema -* :ref:`clkConfig `, CLK wide configuration, independent of features -* :ref:`features `, configuration that is specific to the individual features +* :ref:`version `, contains the version number of the linkage schema. +* :ref:`clkConfig `, CLK wide configuration, independent of features. +* :ref:`features `, an array of configuration specific to individual features. Example Schema @@ -35,58 +42,66 @@ Example Schema :: { - "version": 1, + "version": 2, "clkConfig": { "l": 1024, - "k": 20, - "hash": { - "type": "doubleHash" - }, "kdf": { - "type": "HKDF" + "type": "HKDF", + "hash": "SHA256", + "salt": "SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==", + "info": "", + "keySize": 64 } }, "features": [ { - "identifier": "index", + "identifier": "INDEX", "ignored": true }, { - "identifier": "full name", + "identifier": "NAME freetext", "format": { "type": "string", - "maxLength": 30, - "encoding": "utf-8" + "encoding": "utf-8", + "case": "mixed", + "minLength": 3 }, - "hashing": { "ngram": 2 } + "hashing": { + "ngram": 2, + "strategy": {"numBits": 100}, + "hash": {"type": "doubleHash"} + } }, { - "identifier": "gender", + "identifier": "DOB YYYY/MM/DD", "format": { - "type": "enum", - "values": ["M", "F", "O"] + "type": "date", + "description": "Numbers separated by slashes, in the year, month, day order", + "format": "%Y/%m/%d" }, - "hashing": { "ngram": 1 } + "hashing": { + "ngram": 1, + "positional": true, + "strategy": {"numBits": 200}, + "hash": {"type": "doubleHash"} + } }, { - "identifier": "postcode", + "identifier": "GENDER M or F", "format": { - "type": "integer", - "minimum": 1000, - "maximum": 9999 + "type": "enum", + "values": ["M", "F"] }, - "hashing":{ + "hashing": { "ngram": 1, - "positional": true, - "missingValue": { - "sentinel": "N/A", - "replaceWith": "" - } + "strategy": {"numBits": 400}, + "hash": {"type": "doubleHash"} } } ] } + A more advanced example can be found `here <_static/example_schema.json>`_. @@ -111,10 +126,8 @@ Describes the general construction of the CLK. name type optional description ======== ================== ======== =========== l integer no the length of the CLK in bits -k integer no max number of indices per n-gram -xorFolds integer yes number of XOR folds (as proposed in [Schnell2016]_). kdf :ref:`schema/KDF` no defines the key derivation function used to generate individual secrets for each feature derived from the master secret -hash :ref:`schema/Hash` no defines the hashing scheme to encode the n-grams +xorFolds integer yes number of XOR folds (as proposed in [Schnell2016]_).
======== ================== ======== =========== @@ -135,32 +148,6 @@ keySize integer yes size of the generated keys in bytes ======== ======= ======== =========== -.. _schema/Hash: - -Hash -^^^^ -Describes and cofigures the hash that is used to encode the n-grams. - -Choose one of: - -* *double hash*, as described in [Schnell2011]_. - -=================== ======= ======== =========== -name type optional description -=================== ======= ======== =========== -type string no must be set to "doubleHash" -prevent_singularity boolean yes see discussion in https://github.com/data61/clkhash/issues/33 -=================== ======= ======== =========== - -* *blake hash* - -=================== ======= ======== =========== -name type optional description -=================== ======= ======== =========== -type string no must be set to "blakeHash" -=================== ======= ======== =========== - - .. _schema/features: features @@ -188,11 +175,13 @@ description string yes free text, ignored by clkhash featureConfig ~~~~~~~~~~~~~ -A feature is configured in three parts: -* identifier, the name of the feature +Each feature is configured by: + +* identifier, the human readable name. E.g. ``"First Name"``. +* description, a human readable description of this feature. * format, describes the expected format of the values of this feature -* hashing, configures the hashing +* :ref:`hashing `, configures the hashing =========== ===================== ======== =========== name type optional description @@ -218,20 +207,61 @@ hashingConfig name type optional description ============ ====================== ======== =========== ngram integer no specifies the n in n-gram (the tokenization of the input values). +strategy :ref:`schema/strategy` no the strategy for assigning bits to the encoding. positional boolean yes adds the position to the n-grams. String "222" would be tokenized (as uni-grams) to "1 2", "2 2", "3 2" -weight float yes positive number, which adjusts the number of hash functions (k) used for encoding. Thus giving this feature more or less importance compared to others. missingValue :ref:`schema/missingV` yes allows to define how missing values are handled ============ ====================== ======== =========== +.. _schema/strategy: + +strategy +^^^^^^^^ + +An object where either ``numBits`` or ``k`` is defined. + +============ ====================== ======== =========== +name type optional description +============ ====================== ======== =========== +k integer yes max number of indices per n-gram +numBits integer yes max number of indices per feature +============ ====================== ======== =========== + + +.. _schema/Hash: + +Hash +^^^^ +Describes and configures the hash that is used to encode the n-grams. + +Choose one of: + +* *double hash*, as described in [Schnell2011]_. + +=================== ======= ======== =========== +name type optional description +=================== ======= ======== =========== +type string no must be set to "doubleHash" +prevent_singularity boolean yes see discussion in https://github.com/data61/clkhash/issues/33 +=================== ======= ======== =========== + +* *blake hash* (default) + +=================== ======= ======== =========== +name type optional description +=================== ======= ======== =========== +type string no must be set to "blakeHash" +=================== ======= ======== =========== + + .. _schema/missingV: missingValue ^^^^^^^^^^^^^^ + Data sets are not always complete -- they can contain missing values. 
-If specified, then clkhash will not check the format for these missing values, and will optionally replace them with the -'replaceWith' value. -This can be useful if the data +If specified, then clkhash will not check the format for these missing values, and will optionally replace the ``sentinel`` with the +``replaceWith`` value. =========== ===================== ======== =========== name type optional description diff --git a/docs/tutorial_api.ipynb b/docs/tutorial_api.ipynb index 46efc0a3..eb2cff34 100644 --- a/docs/tutorial_api.ipynb +++ b/docs/tutorial_api.ipynb @@ -2,7 +2,9 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "# Tutorial for Python API\n", "\n", @@ -10,22 +12,19 @@ "\n", "The Python package `recordlinkage` has a [tutorial](http://recordlinkage.readthedocs.io/en/latest/notebooks/link_two_dataframes.html) linking data sets in the clear, we will try duplicate that in a privacy preserving setting.\n", "\n", - "First install clkhash, recordlinkage and a few data science tools (pandas and numpy)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -U clkhash recordlinkage numpy pandas" + "First install clkhash, recordlinkage and a few data science tools (pandas and numpy):\n", + "\n", + " $ pip install -U clkhash anonlink recordlinkage numpy pandas" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [], "source": [ "import io\n", @@ -36,18 +35,38 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [], "source": [ "import clkhash\n", + "from clkhash import clk\n", "from clkhash.field_formats import *\n", + "from clkhash.schema import Schema" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ "import recordlinkage\n", "from recordlinkage.datasets import load_febrl4" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Data Exploration\n", "\n", @@ -56,8 +75,12 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 4, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "data": { @@ -201,7 +224,7 @@ "rec-3585-org 19860208 7207688 " ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -214,15 +237,21 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "For this linkage we will **not** use the social security id column." 
] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 5, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "data": { @@ -232,7 +261,7 @@ " dtype='object')" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -243,29 +272,23 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "execution_count": 6, + "metadata": { + "pycharm": { + "is_executing": false } - ], + }, + "outputs": [], "source": [ "a_csv = io.StringIO()\n", - "dfA.to_csv(a_csv)\n", - "a_csv.seek(0)" + "dfA.to_csv(a_csv)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Hashing Schema Definition\n", "\n", @@ -275,152 +298,161 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "execution_count": 7, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [], "source": [ - "schema = clkhash.randomnames.NameList.SCHEMA\n", - "\n", - "schema.fields = [\n", + "fields = [\n", " Ignore('rec_id'),\n", - " StringSpec('given_name', FieldHashingProperties(ngram=2, k=30)),\n", - " StringSpec('surname', FieldHashingProperties(ngram=2, k=30)),\n", - " IntegerSpec('street_number', FieldHashingProperties(ngram=1, positional=True, k=30, missing_value=MissingValueSpec(sentinel=''))),\n", - " StringSpec('address_1', FieldHashingProperties(ngram=2, k=30)),\n", - " StringSpec('address_2', FieldHashingProperties(ngram=2, k=30)),\n", - " StringSpec('suburb', FieldHashingProperties(ngram=2, k=30)),\n", - " IntegerSpec('postcode', FieldHashingProperties(ngram=1, positional=True, k=30)),\n", - " StringSpec('state', FieldHashingProperties(ngram=2, k=30)),\n", - " IntegerSpec('date_of_birth', FieldHashingProperties(ngram=1, positional=True, k=30, missing_value=MissingValueSpec(sentinel=''))),\n", + " StringSpec('given_name', FieldHashingProperties(ngram=2, num_bits=300)),\n", + " StringSpec('surname', FieldHashingProperties(ngram=2, num_bits=300)),\n", + " IntegerSpec('street_number', FieldHashingProperties(ngram=1, positional=True, num_bits=300, missing_value=MissingValueSpec(sentinel=''))),\n", + " StringSpec('address_1', FieldHashingProperties(ngram=2, num_bits=300)),\n", + " StringSpec('address_2', FieldHashingProperties(ngram=2, num_bits=300)),\n", + " StringSpec('suburb', FieldHashingProperties(ngram=2, num_bits=300)),\n", + " IntegerSpec('postcode', FieldHashingProperties(ngram=1, positional=True, num_bits=300)),\n", + " StringSpec('state', FieldHashingProperties(ngram=2, num_bits=300)),\n", + " IntegerSpec('date_of_birth', FieldHashingProperties(ngram=1, positional=True, num_bits=300, missing_value=MissingValueSpec(sentinel=''))),\n", " Ignore('soc_sec_id')\n", - " ]" + "]\n", + "\n", + "schema = Schema(fields, 1024)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Hash the data\n", "\n", - "We can now hash our PII data from the CSV file using our defined schema. We must provide two *secret keys* to this command - these keys have to be used by both parties hashing data. For this toy example we will use the keys _'key1'_ and _'key2'_, for real data, make sure that the keys contain enough entropy, as knowledge of these keys is sufficient to reconstruct the PII information from a CLK! 
Also, **do not share these keys with anyone, except the other participating party.**" + "We can now hash our PII data from the CSV file using our defined schema. We must provide a list of *secret keys* to this command - these keys have to be used by both parties hashing data. For this toy example we will use the keys _'key1'_ and _'key2'_; for real data, make sure that the keys contain enough entropy, as knowledge of these keys is sufficient to reconstruct the PII information from a CLK! \n", + "\n", + "Also, **do not share these keys with anyone, except the other participating party.**" ] }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, + "execution_count": 8, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "secret_keys = ('key1', 'key2')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "generating CLKs: 100%|██████████| 5.00k/5.00k [00:02<00:00, 1.86kclk/s, mean=882, std=33.3]\n" + "generating CLKs: 100%|██████████| 5.00k/5.00k [00:01<00:00, 786clk/s, mean=950, std=9.79]\n" ] } ], "source": [ - "from clkhash import clk\n", "a_csv.seek(0)\n", - "hashed_data_a = clk.generate_clk_from_csv(a_csv, ('key1',), schema, validate=False)" + "hashed_data_a = clk.generate_clk_from_csv(a_csv, secret_keys, schema)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Inspect the output\n", "\n", - "clkhash has hashed the PII, creating a Cryptographic Longterm Key for each entity. The output of `generate_clk_from_csv` shows that the mean popcount is quite high (882 out of 1024) which can effect accuracy.\n", + "clkhash has hashed the PII, creating a Cryptographic Longterm Key for each entity. The output of `generate_clk_from_csv` shows that the mean popcount is quite high (950 out of 1024) which can affect accuracy.\n", "\n", - "There are two ways to control the popcount:\n", - "- You can change the _'k'_ value in the hashConfig section of the schema. It controls the number of entries in the CLK for each n-gram\n", - "- or you can modify the individual _'weight'_ values for the different fields. It allows to tune the contribution of a column to the CLK. This can be used to de-emphasise columns which are less suitable for linkage (e.g. information that changes frequently)." + "We can control the popcount by adjusting the hashing strategy. There are currently two different strategies implemented in the library.\n", + "- _fixed k_: each n-gram of a feature's value is inserted into the CLK *k* times. Increasing *k* will give the corresponding feature more importance in comparisons; decreasing *k* will de-emphasise columns which are less suitable for linkage (e.g. information that changes frequently). The _fixed k_ strategy is set with the 'k=30' argument for each feature's FieldHashingProperties (for a total of numberOfTokens * k insertions).\n", + "- _fixed number of bits_: in this strategy we always insert a fixed number of bits into the CLK for a feature, irrespective of the number of n-grams. This strategy is set with the 'numBits=100' argument for each feature's FieldHashingProperties.\n" ] }, { "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we will change the value of *k* from 30 to 15."
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "generating CLKs: 100%|██████████| 5.00k/5.00k [00:01<00:00, 2.69kclk/s, mean=645, std=43.8]\n" - ] - } - ], - "source": [ - "for field in schema.fields[1:-1]:\n", - " field.hashing_properties.k = 15\n", - "a_csv.seek(0)\n", - "hashed_data_a = clk.generate_clk_from_csv(a_csv, ('key1',), schema, validate=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ - "And now we will modify the weights to de-emphasise the contribution of the address related columns." + "In this example, we will reduce the value of `num_bits` for address related columns." ] }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, + "execution_count": 10, + "metadata": { + "pycharm": {} + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "generating CLKs: 100%|██████████| 5.00k/5.00k [00:01<00:00, 3.07kclk/s, mean=598, std=39.6]\n" + "generating CLKs: 100%|██████████| 5.00k/5.00k [00:00<00:00, 1.41kclk/s, mean=705, std=15.5]\n" ] } ], "source": [ - "schema.fields = [\n", + "fields = [\n", " Ignore('rec_id'),\n", - " StringSpec('given_name', FieldHashingProperties(ngram=2, k=20)),\n", - " StringSpec('surname', FieldHashingProperties(ngram=2, k=20)),\n", - " IntegerSpec('street_number', FieldHashingProperties(ngram=1, positional=True, k=10, missing_value=MissingValueSpec(sentinel=''))),\n", - " StringSpec('address_1', FieldHashingProperties(ngram=2, k=10)),\n", - " StringSpec('address_2', FieldHashingProperties(ngram=2, k=10)),\n", - " StringSpec('suburb', FieldHashingProperties(ngram=2, k=10)),\n", - " IntegerSpec('postcode', FieldHashingProperties(ngram=1, positional=True, k=10)),\n", - " StringSpec('state', FieldHashingProperties(ngram=2, k=10)),\n", - " IntegerSpec('date_of_birth', FieldHashingProperties(ngram=1, positional=True, k=20, missing_value=MissingValueSpec(sentinel=''))),\n", + " StringSpec('given_name', FieldHashingProperties(ngram=2, num_bits=200)),\n", + " StringSpec('surname', FieldHashingProperties(ngram=2, num_bits=200)),\n", + " IntegerSpec('street_number', FieldHashingProperties(ngram=1, positional=True, num_bits=100, missing_value=MissingValueSpec(sentinel=''))),\n", + " StringSpec('address_1', FieldHashingProperties(ngram=2, num_bits=100)),\n", + " StringSpec('address_2', FieldHashingProperties(ngram=2, num_bits=100)),\n", + " StringSpec('suburb', FieldHashingProperties(ngram=2, num_bits=100)),\n", + " IntegerSpec('postcode', FieldHashingProperties(ngram=1, positional=True, num_bits=100)),\n", + " StringSpec('state', FieldHashingProperties(ngram=2, num_bits=100)),\n", + " IntegerSpec('date_of_birth', FieldHashingProperties(ngram=1, positional=True, num_bits=200, missing_value=MissingValueSpec(sentinel=''))),\n", " Ignore('soc_sec_id')\n", - " ]\n", + "]\n", + "\n", + "schema = Schema(fields, 1024)\n", "a_csv.seek(0)\n", - "hashed_data_a = clk.generate_clk_from_csv(a_csv, ('key1', ), schema)" + "hashed_data_a = clk.generate_clk_from_csv(a_csv, secret_keys, schema)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "Each CLK is serialized in a JSON friendly base64 format:" ] }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, + "execution_count": 11, + "metadata": { + "pycharm": {} + }, "outputs": [ { "data": { "text/plain": [ - 
"'ihux1nrjnvG8i54/Ta6j45oa0v73uW4ZS//b4O1juF7urblhUIvzwIVkxtm7sXcyX70OHwqzLFqeMPdU2U6P9Qls0eaY9Q7My35tD7/z98nVrt1GjM/3He2WW7HqWb3fo/207Xm8BEvq2924+/UYZH6ejO328gr8ka81f/1/3sk='" + "'wTmf3/rPF3Pj/85fORXpee/9+v3/1o9714/7d/bW+G7+9N3Cij///a1//nr/9/cZn/BT9+kWnl9203/eOtvM4G4s3e8lX+7X+f0kXez7XbOfevz7/r6wvN99Mncp367yPeZW3uMYv9Evf9/sPuOq3+p79t6/qn/v7O5e/Jurvr8='" ] }, - "execution_count": 32, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -431,7 +463,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Hash data set B\n", "\n", @@ -440,14 +474,16 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, + "execution_count": 12, + "metadata": { + "pycharm": {} + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "generating CLKs: 100%|██████████| 5.00k/5.00k [00:01<00:00, 2.74kclk/s, mean=589, std=45.2]\n" + "generating CLKs: 100%|██████████| 5.00k/5.00k [00:00<00:00, 1.45kclk/s, mean=703, std=19.1]\n" ] } ], @@ -455,13 +491,15 @@ "b_csv = io.StringIO()\n", "dfB.to_csv(b_csv)\n", "b_csv.seek(0)\n", - "hashed_data_b = clkhash.clk.generate_clk_from_csv(b_csv, ('key1',), schema, validate=False)" + "hashed_data_b = clkhash.clk.generate_clk_from_csv(b_csv, secret_keys, schema)" ] }, { "cell_type": "code", - "execution_count": 35, - "metadata": {}, + "execution_count": 13, + "metadata": { + "pycharm": {} + }, "outputs": [ { "data": { @@ -469,7 +507,7 @@ "5000" ] }, - "execution_count": 35, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -480,80 +518,27 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Find matches between the two sets of CLKs\n", "\n", "We have generated two sets of CLKs which represent entity information in a privacy-preserving way. The more similar two CLKs are, the more likely it is that they represent the same entity.\n", "\n", - "For this task we will use [anonlink](https://github.com/data61/anonlink), a Python (and optimised C++) implementation of anonymous linkage using CLKs." 
- ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting anonlink\n", - "Collecting networkx<=2,>=1.11 (from anonlink)\n", - "Requirement already satisfied, skipping upgrade: clkhash>=0.11 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from anonlink) (0.12.1)\n", - "Requirement already satisfied, skipping upgrade: bitarray>=0.8.1 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from anonlink) (0.8.3)\n", - "Requirement already satisfied, skipping upgrade: mypy-extensions>=0.3 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from anonlink) (0.4.1)\n", - "Requirement already satisfied, skipping upgrade: cffi>=1.7 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from anonlink) (1.12.2)\n", - "Requirement already satisfied, skipping upgrade: numpy>=1.14 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from anonlink) (1.16.2)\n", - "Requirement already satisfied, skipping upgrade: decorator>=4.1.0 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from networkx<=2,>=1.11->anonlink) (4.4.0)\n", - "Requirement already satisfied, skipping upgrade: requests>=2.20 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from clkhash>=0.11->anonlink) (2.21.0)\n", - "Requirement already satisfied, skipping upgrade: bashplotlib>=0.6.5 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from clkhash>=0.11->anonlink) (0.6.5)\n", - "Requirement already satisfied, skipping upgrade: jsonschema>=2.6 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from clkhash>=0.11->anonlink) (3.0.1)\n", - "Requirement already satisfied, skipping upgrade: click>=6.7 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from clkhash>=0.11->anonlink) (7.0)\n", - "Requirement already satisfied, skipping upgrade: future>=0.16 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from clkhash>=0.11->anonlink) (0.17.1)\n", - "Requirement already satisfied, skipping upgrade: tqdm>=4.24 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from clkhash>=0.11->anonlink) (4.31.1)\n", - "Requirement already satisfied, skipping upgrade: cryptography>=2.3 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from clkhash>=0.11->anonlink) (2.6.1)\n", - "Requirement already satisfied, skipping upgrade: pycparser in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from cffi>=1.7->anonlink) (2.19)\n", - "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from requests>=2.20->clkhash>=0.11->anonlink) (2019.3.9)\n", - "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from requests>=2.20->clkhash>=0.11->anonlink) (2.8)\n", - "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.21.1 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from requests>=2.20->clkhash>=0.11->anonlink) (1.24.1)\n", - "Requirement already satisfied, skipping upgrade: 
chardet<3.1.0,>=3.0.2 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from requests>=2.20->clkhash>=0.11->anonlink) (3.0.4)\n", - "Requirement already satisfied, skipping upgrade: six>=1.11.0 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from jsonschema>=2.6->clkhash>=0.11->anonlink) (1.12.0)\n", - "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from jsonschema>=2.6->clkhash>=0.11->anonlink) (0.14.11)\n", - "Requirement already satisfied, skipping upgrade: setuptools in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from jsonschema>=2.6->clkhash>=0.11->anonlink) (40.8.0)\n", - "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from jsonschema>=2.6->clkhash>=0.11->anonlink) (19.1.0)\n", - "Requirement already satisfied, skipping upgrade: asn1crypto>=0.21.0 in /Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages (from cryptography>=2.3->clkhash>=0.11->anonlink) (0.24.0)\n", - "Installing collected packages: networkx, anonlink\n", - "Successfully installed anonlink-0.11.2 networkx-2.0\n" - ] - } - ], - "source": [ - "!pip install -U anonlink" + "For this task we will use [anonlink](https://github.com/data61/anonlink), a Python (and optimised C++) implementation of anonymous linkage using CLKs. \n", + "\n", + "As the CLKs are in a string format we first deserialize to use the bitarray type:" ] }, { "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/hen271/.local/share/virtualenvs/jupyter_temp/lib/python3.6/site-packages/ipykernel_launcher.py:21: DeprecationWarning: anonlink.anonlink.entitymatch.calculate_mapping_greedy has been deprecated without replacement\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "found 4990 matches\n" - ] - } - ], + "execution_count": 14, + "metadata": { + "pycharm": {} + }, + "outputs": [], "source": [ - "from anonlink.entitymatch import calculate_mapping_greedy\n", "from bitarray import bitarray\n", "import base64\n", "\n", @@ -567,49 +552,78 @@ " res = []\n", " for i, f in enumerate(filters):\n", " ba = deserialize_bitarray(f)\n", - " res.append((ba, i, ba.count()))\n", + " res.append(ba)\n", " return res\n", "\n", "clks_a = deserialize_filters(hashed_data_a)\n", - "clks_b = deserialize_filters(hashed_data_b)\n", - "\n", - "mapping = calculate_mapping_greedy(clks_a, clks_b, threshold=0.9, k=5000)\n", - "print('found {} matches'.format(len(mapping)))" + "clks_b = deserialize_filters(hashed_data_b)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ - "Let's investigate some of those matches and the overall matching quality" + "Using `anonlink` we find the candidate pairs - which is all possible pairs above the given `threshold`. Then we solve for the most likely mapping." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "pycharm": {} + }, + "outputs": [], + "source": [ + "import anonlink\n", + "\n", + "def mapping_from_clks(clks_a, clks_b, threshold):\n", + " results_candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(\n", + " [clks_a, clks_b],\n", + " anonlink.similarities.dice_coefficient,\n", + " threshold\n", + " )\n", + " solution = anonlink.solving.greedy_solve(results_candidate_pairs)\n", + " print('Found {} matches'.format(len(solution)))\n", + " return {a:b for ((_, a),(_, b)) in solution}" ] }, { "cell_type": "code", - "execution_count": 44, - "metadata": {}, + "execution_count": 16, + "metadata": { + "pycharm": {} + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "idx_a, idx_b, rec_id_a, rec_id_b\n", - "--------------------------------\n", - "1, 1450, rec-1070-org, rec-1070-dup-0\n", - "2, 2751, rec-1016-org, rec-1016-dup-0\n", - "3, 4657, rec-4405-org, rec-4405-dup-0\n", - "4, 4120, rec-1288-org, rec-1288-dup-0\n", - "5, 3307, rec-3585-org, rec-3585-dup-0\n", - "6, 2306, rec-298-org, rec-298-dup-0\n", - "7, 3945, rec-1985-org, rec-1985-dup-0\n", - "8, 993, rec-2404-org, rec-2404-dup-0\n", - "9, 4613, rec-1473-org, rec-1473-dup-0\n", - "10, 3630, rec-453-org, rec-453-dup-0\n", - "--------------------------------\n", - "Precision: 0.9889779559118237, Recall: 0.987, Accuracy: 0.9762611275964391\n" + "Found 4019 matches\n" ] } ], + "source": [ + "mapping = mapping_from_clks(clks_a, clks_b, 0.9)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": {} + }, + "source": [ + "Let's investigate some of those matches and the overall matching quality" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "pycharm": {} + }, + "outputs": [], "source": [ "a_csv.seek(0)\n", "b_csv.seek(0)\n", @@ -617,53 +631,120 @@ "b_raw = b_csv.readlines()\n", "\n", "num_entities = len(b_raw) - 1\n", - "\n", - "print('idx_a, idx_b, rec_id_a, rec_id_b')\n", - "print('--------------------------------')\n", - "for a_i in range(10):\n", - " if a_i in mapping:\n", - " a_data = a_raw[a_i + 1].split(',')\n", - " b_data = b_raw[mapping[a_i] + 1].split(',')\n", - " print('{}, {}, {}, {}'.format(a_i+1, mapping[a_i]+1, a_data[0], b_data[0]))\n", - "\n", - "TP = 0; FP = 0; TN = 0; FN = 0\n", - "for a_i in range(num_entities):\n", - " if a_i in mapping:\n", - " if a_raw[a_i + 1].split(',')[0].split('-')[1] == b_raw[mapping[a_i] + 1].split(',')[0].split('-')[1]:\n", - " TP += 1\n", + " \n", + "def describe_accuracy(mapping, show_examples=False):\n", + " if show_examples:\n", + " print('idx_a, idx_b, rec_id_a, rec_id_b')\n", + " print('---------------------------------------------')\n", + " for a_i in range(10):\n", + " if a_i in mapping:\n", + " a_data = a_raw[a_i + 1].split(',')\n", + " b_data = b_raw[mapping[a_i] + 1].split(',')\n", + " print('{:3}, {:6}, {:>15}, {:>15}'.format(a_i+1, mapping[a_i]+1, a_data[0], b_data[0]))\n", + " print('---------------------------------------------')\n", + " \n", + " TP = 0; FP = 0; TN = 0; FN = 0\n", + " for a_i in range(num_entities):\n", + " if a_i in mapping:\n", + " if a_raw[a_i + 1].split(',')[0].split('-')[1] == b_raw[mapping[a_i] + 1].split(',')[0].split('-')[1]:\n", + " TP += 1\n", + " else:\n", + " FP += 1\n", + " # as we only report one mapping for each element in PII_a, \n", + " # then a wrong mapping is not only a false positive, but \n", + " # also a false negative, as we won't report the true mapping.\n", + " FN += 1 \n", " 
else:\n", - " FP += 1\n", - " FN += 1 # as we only report one mapping for each element in PII_a, then a wrong mapping is not only a false positive, but also a false negative, as we won't report the true mapping.\n", - " else:\n", - " FN += 1 # every element in PII_a has a partner in PII_b\n", + " FN += 1 # every element in PII_a has a partner in PII_b\n", "\n", - "print('--------------------------------')\n", - "print('Precision: {}, Recall: {}, Accuracy: {}'.format(TP/(TP+FP), TP/(TP+FN), (TP+TN)/(TP+TN+FP+FN)))" + " print()\n", + " print(\"We've got {} true positives, {} false positives, and {} false negatives.\".format(TP, FP, FN))\n", + " print('Precision: {:.3f}, Recall: {:.3f}, Accuracy: {:.3f}'.format(\n", + " TP/(TP+FP), \n", + " TP/(TP+FN), \n", + " (TP+TN)/(TP+TN+FP+FN)))\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "pycharm": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "idx_a, idx_b, rec_id_a, rec_id_b\n", + "---------------------------------------------\n", + " 2, 2751, rec-1016-org, rec-1016-dup-0\n", + " 3, 4657, rec-4405-org, rec-4405-dup-0\n", + " 4, 4120, rec-1288-org, rec-1288-dup-0\n", + " 5, 3307, rec-3585-org, rec-3585-dup-0\n", + " 6, 2306, rec-298-org, rec-298-dup-0\n", + " 7, 3945, rec-1985-org, rec-1985-dup-0\n", + " 8, 993, rec-2404-org, rec-2404-dup-0\n", + " 9, 4613, rec-1473-org, rec-1473-dup-0\n", + " 10, 3630, rec-453-org, rec-453-dup-0\n", + "---------------------------------------------\n", + "\n", + "We've got 4019 true positives, 0 false positives, and 981 false negatives.\n", + "Precision: 1.000, Recall: 0.804, Accuracy: 0.804\n" + ] + } + ], + "source": [ + "describe_accuracy(mapping, show_examples=True)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ - "Precision tells us about how many of the found matches are actual matches. The score of 1.0 means that we did perfectly in this respect, however, recall, the measure of how many of the actual matches were correctly identified, is quite low with only 72.5%.\n", - "\n", - "Let's go back to the mapping calculation (`calculate_mapping_greedy`) an reduce the value for `threshold` to `0.8`.\n", - "\n", - "Great, for this threshold value we get a precision of 100% and a recall of 95.4%. \n", - "\n", - "The explanation is that when the information about an entity differs slightly in the two datasets (e.g. spelling errors, abbrevations, missing values, ...) then the corresponding CLKs will differ in some number of bits as well. For the datasets in this tutorial the perturbations are such that only 72.5% of the derived CLK pairs overlap more than 90%. Whereas almost all matching pairs overlap more than 80%.\n", - "\n", - "If we keep reducing the threshold value, then we will start to observe mistakes in the found matches -- the precision decreases. But at the same time the recall value will keep increasing for a while, as a lower threshold allows for more of the actual matches to be found, e.g.: for threshold 0.72, we get precision: 0.997 and recall: 0.992. However, reducing the threshold further will eventually lead to a decrease in both precision and recall: for threshold 0.65 precision is 0.989 and recall is 0.987. Thus it is important to choose an appropriate threshold for the amount of perturbations present in the data.\n", + "Precision tells us about how many of the found matches are actual matches. 
The score of 1.0 means that we did perfectly in this respect, however, recall, the measure of how many of the actual matches were correctly identified, is quite low with only 80%.\n",
    "\n",
+    "Let's go back to the mapping calculation (`mapping_from_clks`) and reduce the value for `threshold` to `0.8`."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "execution_count": 19,
+   "metadata": {
+    "pycharm": {}
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 4974 matches\n"
+     ]
+    }
+   ],
+   "source": [
+    "mapping = mapping_from_clks(clks_a, clks_b, 0.8)\n",
+    "describe_accuracy(mapping)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {}
+   },
+   "source": [
+    "Great, for this threshold value we get a precision of 100% and a recall of 99.5%. \n",
+    "\n",
+    "The explanation is that when the information about an entity differs slightly in the two datasets (e.g. spelling errors, abbreviations, missing values, ...) then the corresponding CLKs will differ in some number of bits as well. It is important to choose an appropriate threshold for the amount of perturbations present in the data (a threshold of 0.74 and below generates a mapping which misses just one true match).\n",
+    "\n",
+    "This concludes the tutorial. Feel free to go back to the CLK generation and experiment on how different settings will affect the matching quality."
+   ]
  }
 ],
 "metadata": {
@@ -682,7 +763,16 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.6.3"
+   "version": "3.7.2"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "metadata": {
+     "collapsed": false
+    },
+    "source": []
+   }
  }
 },
 "nbformat": 4,
diff --git a/docs/tutorial_cli.ipynb b/docs/tutorial_cli.ipynb
index 9dc7059a..69c9c28e 100644
--- a/docs/tutorial_cli.ipynb
+++ b/docs/tutorial_cli.ipynb
@@ -2,33 +2,33 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {}
+   },
   "source": [
    "# Tutorial for CLI tool `clkhash`\n",
    "\n",
-    "For this tutorial we are going to process a data set for private linkage with clkhash using the command line tool `clkutil`. Note you can also use the [Python API](./tutorial_api.ipynb).\n",
+    "For this tutorial we are going to process a data set for private linkage with clkhash using the command line tool `clkutil` - equivalent to running `python -m clkhash`.\n",
+    "\n",
+    "Note you can also use the [Python API](./tutorial_api.ipynb).\n",
+    "\n",
    "The Python package `recordlinkage` has a [tutorial](http://recordlinkage.readthedocs.io/en/latest/notebooks/link_two_dataframes.html) linking data sets in the clear, we will try to duplicate that in a privacy preserving setting.\n",
    "\n",
-    "First install clkhash, recordlinkage and a few data science tools (pandas and numpy)."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -U clkhash recordlinkage numpy pandas" + "First install clkhash, recordlinkage and a few data science tools (pandas and numpy).\n", + "\n", + " $ pip install -U clkhash recordlinkage numpy pandas" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [], "source": [ - "import io\n", "import json\n", "import numpy as np\n", "import pandas as pd" @@ -37,7 +37,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [], "source": [ "import recordlinkage\n", @@ -46,7 +50,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Data Exploration\n", "\n", @@ -56,7 +62,11 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "data": { @@ -213,7 +223,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "Note that for computing this linkage we will **not** use the social security id column or the `rec_id` index." ] @@ -221,7 +233,11 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "data": { @@ -243,7 +259,11 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [], "source": [ "dfA.to_csv('PII_a.csv')" @@ -251,7 +271,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Hashing Schema Definition\n", "\n", @@ -262,95 +284,126 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Overwriting schema.json\n" + "{\n", + " \"version\": 2,\n", + " \"clkConfig\": {\n", + " \"l\": 1024,\n", + " \"kdf\": {\n", + " \"type\": \"HKDF\",\n", + " \"hash\": \"SHA256\",\n", + " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", + " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", + " \"keySize\": 64\n", + " }\n", + " },\n", + " \"features\": [\n", + " {\n", + " \"identifier\": \"rec_id\",\n", + " \"ignored\": true\n", + " },\n", + " {\n", + " \"identifier\": \"given_name\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 64 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 300}, \"hash\": {\"type\": \"doubleHash\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"surname\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 64 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 300}, \"hash\": {\"type\": \"doubleHash\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"street_number\",\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 300}, \"missingValue\": {\"sentinel\": \"\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"address_1\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 300} }\n", + " },\n", + " {\n", + " \"identifier\": 
\"address_2\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 300} }\n", + " },\n", + " {\n", + " \"identifier\": \"suburb\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 300} }\n", + " },\n", + " {\n", + " \"identifier\": \"postcode\",\n", + " \"format\": { \"type\": \"integer\", \"minimum\": 100, \"maximum\": 9999 },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 300} }\n", + " },\n", + " {\n", + " \"identifier\": \"state\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 3 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 300} }\n", + " },\n", + " {\n", + " \"identifier\": \"date_of_birth\",\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 300}, \"missingValue\": {\"sentinel\": \"\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"soc_sec_id\",\n", + " \"ignored\": true\n", + " }\n", + " ]\n", + "}\n" ] } ], "source": [ - "%%writefile schema.json\n", - "{\n", - " \"version\": 1,\n", - " \"clkConfig\": {\n", - " \"l\": 1024,\n", - " \"k\": 30,\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"kdf\": {\n", - " \"type\": \"HKDF\",\n", - " \"hash\": \"SHA256\",\n", - " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", - " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", - " \"keySize\": 64\n", - " }\n", - " },\n", - " \"features\": [\n", - " {\n", - " \"identifier\": \"rec_id\",\n", - " \"ignored\": true\n", - " },\n", - " {\n", - " \"identifier\": \"given_name\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", - " },\n", - " {\n", - " \"identifier\": \"surname\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", - " },\n", - " {\n", - " \"identifier\": \"street_number\",\n", - " \"format\": { \"type\": \"integer\" },\n", - " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 1, \"missingValue\": {\"sentinel\": \"\"} }\n", - " },\n", - " {\n", - " \"identifier\": \"address_1\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", - " },\n", - " {\n", - " \"identifier\": \"address_2\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", - " },\n", - " {\n", - " \"identifier\": \"suburb\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", - " },\n", - " {\n", - " \"identifier\": \"postcode\",\n", - " \"format\": { \"type\": \"integer\", \"minimum\": 100, \"maximum\": 9999 },\n", - " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 1 }\n", - " },\n", - " {\n", - " \"identifier\": \"state\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 3 },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", - " },\n", - " {\n", - " \"identifier\": \"date_of_birth\",\n", - " \"format\": { \"type\": \"integer\" },\n", - " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 1, \"missingValue\": {\"sentinel\": \"\"} }\n", - " 
},\n", - " {\n", - " \"identifier\": \"soc_sec_id\",\n", - " \"ignored\": true\n", - " }\n", - " ]\n", - "}" + "%cat _static/febrl_schema_v2_overweight.json" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, + "source": [ + "## Validate the schema\n", + "\n", + "The command line tool can check that the linkage schema is valid:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "pycharm": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32mschema is valid\u001b[0m\n" + ] + } + ], + "source": [ + "!clkutil validate-schema _static/febrl_schema_v2_overweight.json" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": {} + }, "source": [ "## Hash the data\n", "\n", @@ -359,116 +412,301 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, + "execution_count": 8, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "generating CLKs: 100%|█| 5.00k/5.00k [00:05<00:00, 927clk/s, mean=885, std=33.4]\n", + "generating CLKs: 100%|█| 5.00k/5.00k [00:01<00:00, 1.27kclk/s, mean=949, std=9.82]\n", "\u001b[31mCLK data written to clks_a.json\u001b[0m\n" ] } ], "source": [ - "!clkutil hash PII_a.csv key1 key2 schema.json clks_a.json" + "!clkutil hash PII_a.csv key1 key2 _static/febrl_schema_v2_overweight.json clks_a.json" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Inspect the output\n", "\n", - "clkhash has hashed the PII, creating a Cryptographic Longterm Key for each entity. The progress bar output shows that the mean popcount is quite high (885 out of 1024) which can effect accuracy.\n", + "clkhash has hashed the PII, creating a Cryptographic Longterm Key for each entity. The stats output shows that the mean popcount (number of bits set) is quite high (949 out of 1024) which can effect accuracy.\n", "\n", - "There are two ways to control the popcount:\n", - "- You can change the _'k'_ value in the `clkConfig` section of the linkage schema. This controls the number of entries in the CLK for each n-gram\n", - "- or you can modify the individual _'weight'_ values for the different fields. It allows to tune the contribution of a column to the CLK. This can be used to de-emphasise columns which are less suitable for linkage (e.g. information that changes frequently)." + "To reduce the popcount you can modify the individual _'numBits'_ values for the different fields. It allows to tune the contribution of a column to the CLK. This can be used to de-emphasise columns which are less suitable for linkage (e.g. information that changes frequently)." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ - "First, we will change the value of *k* from 30 to 15." + "First, we will reduce the value of *numBits* for each feature." 
] }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 9, + "metadata": { + "pycharm": {} + }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "generating CLKs: 100%|█| 5.00k/5.00k [00:04<00:00, 867clk/s, mean=648, std=44.1]\n", + "{\n", + " \"version\": 2,\n", + " \"clkConfig\": {\n", + " \"l\": 1024,\n", + " \"kdf\": {\n", + " \"type\": \"HKDF\",\n", + " \"hash\": \"SHA256\",\n", + " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", + " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", + " \"keySize\": 64\n", + " }\n", + " },\n", + " \"features\": [\n", + " {\n", + " \"identifier\": \"rec_id\",\n", + " \"ignored\": true\n", + " },\n", + " {\n", + " \"identifier\": \"given_name\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 64 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200}, \"hash\": {\"type\": \"doubleHash\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"surname\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 64 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200}, \"hash\": {\"type\": \"doubleHash\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"street_number\",\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 200}, \"missingValue\": {\"sentinel\": \"\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"address_1\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200} }\n", + " },\n", + " {\n", + " \"identifier\": \"address_2\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200} }\n", + " },\n", + " {\n", + " \"identifier\": \"suburb\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200} }\n", + " },\n", + " {\n", + " \"identifier\": \"postcode\",\n", + " \"format\": { \"type\": \"integer\", \"minimum\": 100, \"maximum\": 9999 },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 200} }\n", + " },\n", + " {\n", + " \"identifier\": \"state\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 3 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200} }\n", + " },\n", + " {\n", + " \"identifier\": \"date_of_birth\",\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 200}, \"missingValue\": {\"sentinel\": \"\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"soc_sec_id\",\n", + " \"ignored\": true\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "%cat _static/febrl_schema_v2_reduced.json" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 1.74kclk/s, mean=843, std=13.8]\n", "\u001b[31mCLK data written to clks_a.json\u001b[0m\n" ] } ], "source": [ - "schema = json.load(open('schema.json', 'rt'))\n", - "schema['clkConfig']['k'] = 15\n", - "json.dump(schema, open('schema.json', 'wt'))\n", - "\n", - "!clkutil hash 
PII_a.csv key1 key2 schema.json clks_a.json" + "!clkutil hash PII_a.csv key1 key2 _static/febrl_schema_v2_reduced.json clks_a.json" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ - "And now we will modify the weights to de-emphasise the contribution of the address related columns." + "And now we will modify the `numBits` values again, this time de-emphasising the contribution of the address related columns." ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, + "execution_count": 11, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "generating CLKs: 100%|█| 5.00k/5.00k [00:04<00:00, 924clk/s, mean=602, std=39.8]\n", + "{\n", + " \"version\": 2,\n", + " \"clkConfig\": {\n", + " \"l\": 1024,\n", + " \"kdf\": {\n", + " \"type\": \"HKDF\",\n", + " \"hash\": \"SHA256\",\n", + " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", + " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", + " \"keySize\": 64\n", + " }\n", + " },\n", + " \"features\": [\n", + " {\n", + " \"identifier\": \"rec_id\",\n", + " \"ignored\": true\n", + " },\n", + " {\n", + " \"identifier\": \"given_name\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 64 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200}, \"hash\": {\"type\": \"doubleHash\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"surname\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 64 },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 200}, \"hash\": {\"type\": \"doubleHash\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"street_number\",\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 100}, \"missingValue\": {\"sentinel\": \"\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"address_1\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 100} }\n", + " },\n", + " {\n", + " \"identifier\": \"address_2\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 100} }\n", + " },\n", + " {\n", + " \"identifier\": \"suburb\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"strategy\": {\"numBits\": 100} }\n", + " },\n", + " {\n", + " \"identifier\": \"postcode\",\n", + " \"format\": { \"type\": \"integer\", \"minimum\": 50, \"maximum\": 9999 },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 100} }\n", + " },\n", + " {\n", + " \"identifier\": \"state\",\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\"},\n", + " \"hashing\": {\"ngram\": 2, \"positional\": true, \"strategy\": {\"numBits\": 100}, \"missingValue\": {\"sentinel\": \"\"}\n", + " }\n", + " },\n", + " {\n", + " \"identifier\": \"date_of_birth\",\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"strategy\": {\"numBits\": 200}, \"missingValue\": {\"sentinel\": \"\"} }\n", + " },\n", + " {\n", + " \"identifier\": \"soc_sec_id\",\n", + " \"ignored\": true\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "%cat 
_static/febrl_schema_v2_final.json" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "pycharm": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 7.08kclk/s, mean=705, std=16] \n", "\u001b[31mCLK data written to clks_a.json\u001b[0m\n" ] } ], "source": [ - "schema = json.load(open('schema.json', 'rt'))\n", - "schema['clkConfig']['k'] = 20\n", - "address_features = ['street_number', 'address_1', 'address_2', 'suburb', 'postcode', 'state']\n", - "for feature in schema['features']:\n", - " if feature['identifier'] in address_features:\n", - " feature['hashing']['weight'] = 0.5\n", - "json.dump(schema, open('schema.json', 'wt'))\n", - "\n", - "!clkutil hash PII_a.csv key1 key2 schema.json clks_a.json" + "!clkutil hash PII_a.csv key1 key2 _static/febrl_schema_v2_final.json clks_a.json" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ + "Great, now approximately half the bits are set in each CLK. \n", + "\n", "Each CLK is serialized in a JSON friendly base64 format:" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": 13, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "data": { "text/plain": [ - "'BD8JWW7DzwP82PjV5/jbN40+bT3V4z7V+QBtHYcdF32WpPvDvHUdLXCX3tuV1/4rv+23v9R1fKmJcmoNi7OvoecRLMnHzqv9J5SfT15VXe7KPht9d49zRt73+l3Tfs+Web8kx32vSdo+SfnlHqKbn11V6w9zFm3kb07e67MX7tw='" + "'unsZ/W7D35s8q759bf77155ean+p8fq96fzf9u9bnXf3rX2gGfntPvR2/tOd314aOvuv/97z+lrY8st+fP8PYVd9/KjZN6rMx+T/O6r/v/Hdvt1f1at2+f+Xe53iX94f9988b3mhTsIQbf+7Xr3Sff71fuze9k3sX++db4d73v0='" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -483,7 +721,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Hash data set B\n", "\n", @@ -492,14 +732,18 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, + "execution_count": 14, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "generating CLKs: 100%|█| 5.00k/5.00k [00:04<00:00, 964clk/s, mean=592, std=45.5]\n", + "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 7.11kclk/s, mean=703, std=19.4]\n", "\u001b[31mCLK data written to clks_b.json\u001b[0m\n" ] } @@ -507,12 +751,14 @@ "source": [ "dfB.to_csv('PII_b.csv')\n", "\n", - "!clkutil hash PII_b.csv key1 key2 schema.json clks_b.json" + "!clkutil hash PII_b.csv key1 key2 _static/febrl_schema_v2_final.json clks_b.json" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Find matches between the two sets of CLKs\n", "\n", @@ -531,51 +777,62 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, + "execution_count": 15, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 223, \"rate\": 52027343, \"status\": \"ok\"}\n" + "{\"project_count\": 772, \"rate\": 2083409, \"status\": \"ok\"}\n" ] } ], "source": [ "SERVER = 'https://testing.es.data61.xyz'\n", + "\n", "!clkutil status --server={SERVER}" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "The analyst creates a new project on the entity service by providing the hashing schema and result type. 
The server returns a set of credentials which provide access to the further steps for project." ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, + "execution_count": 16, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Entity Matching Server: https://testing.es.data61.xyz\n", - "Checking server status\n", - "Server Status: ok\n" + "\u001b[31mProject created\u001b[0m\n" ] } ], "source": [ - "!clkutil create-project --server={SERVER} --schema schema.json --output credentials.json --type \"mapping\" --name \"tutorial\"" + "!clkutil create-project --server={SERVER} --schema _static/febrl_schema_v2_final.json --output credentials.json --type \"mapping\" --name \"tutorial\"" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "The returned credentials contain a \n", "- `project_id`, which identifies the project\n", @@ -585,19 +842,23 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, + "execution_count": 17, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", - " \"project_id\": \"5c9a47049161bcb3f32dd1fef4c71c1df9cc7658f5e2cd55\",\n", - " \"result_token\": \"2886b2faf85ad994339059f192a1b8f32206ec32d878b160\",\n", + " \"project_id\": \"90fbe7e6de6e085afffdd8403722d44ffc18756b9e204c8f\",\n", + " \"result_token\": \"c2b0b8e133dae678eaaca1cdb32f28493cb7a7ff5728ed79\",\n", " \"update_tokens\": [\n", - " \"7d08294eed16bbe8b3189d193358258b3b5045e67f44306f\",\n", - " \"04da88e3a5e90aa55049c5a2e8a7085a8bc691653d895447\"\n", + " \"a451f0b5a3cc4829701f236deaa036955bed9aceb34f7242\",\n", + " \"710b1e1f7485e184eb1ffac1d97d41e3fa2a845b2d4ea70d\"\n", " ]\n", "}\n" ] @@ -605,12 +866,14 @@ ], "source": [ "credentials = json.load(open('credentials.json', 'rt'))\n", - "!python -m json.tool credentials.json" + "print(json.dumps(credentials, indent=4))" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "### Uploading the CLKs to the entity service\n", "Each party individually uploads its respective CLKs to the entity service. They need to provide the `resource_id`, which identifies the correct mapping, and an `update_token`." @@ -618,28 +881,13 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Uploading CLK data from clks_a.json\n", - "To Entity Matching Server: https://testing.es.data61.xyz\n", - "Project ID: 5c9a47049161bcb3f32dd1fef4c71c1df9cc7658f5e2cd55\n", - "Checking server status\n", - "Status: ok\n", - "Uploading CLK data to the server\n", - "Uploading CLK data from clks_b.json\n", - "To Entity Matching Server: https://testing.es.data61.xyz\n", - "Project ID: 5c9a47049161bcb3f32dd1fef4c71c1df9cc7658f5e2cd55\n", - "Checking server status\n", - "Status: ok\n", - "Uploading CLK data to the server\n" - ] + "execution_count": 18, + "metadata": { + "pycharm": { + "is_executing": false } - ], + }, + "outputs": [], "source": [ "!clkutil upload \\\n", " --project=\"{credentials['project_id']}\" \\\n", @@ -658,23 +906,25 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "Now that the CLK data has been uploaded the analyst can create one or more *runs*. 
Here we will start by calculating a mapping with a threshold of `0.9`:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, + "execution_count": 19, + "metadata": { + "pycharm": {} + }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Entity Matching Server: https://testing.es.data61.xyz\n", - "Checking server status\n", - "Server Status: ok\n" + "\u001b[31mEntity Matching Server: https://testing.es.data61.xyz\u001b[0m\n" ] } ], @@ -685,24 +935,26 @@ " --threshold=0.9 \\\n", " --project=\"{credentials['project_id']}\" \\\n", " --apikey=\"{credentials['result_token']}\" \\\n", - " --name=\"tutorial_run\"" + " --name=\"CLI tutorial run A\"" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, + "execution_count": 20, + "metadata": { + "pycharm": {} + }, "outputs": [ { "data": { "text/plain": [ - "{'name': 'tutorial_run',\n", - " 'notes': 'Run created by clkhash command line tool',\n", - " 'run_id': 'b700b16393eb5eb704322497226078c36ad9e16724797239',\n", + "{'name': 'CLI tutorial run A',\n", + " 'notes': 'Run created by clkhash 0.13.0',\n", + " 'run_id': '1dd059e252d2f31c7e9aeacf2c5e69a59a0a5592add1dbd3',\n", " 'threshold': 0.9}" ] }, - "execution_count": 23, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -714,7 +966,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ "## Results\n", "\n", @@ -723,126 +977,376 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "execution_count": 21, + "metadata": { + "pycharm": {}, + "tags": [ + "nbval-ignore-output" + ] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The service linked 3636 entities.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Checking server status\n", - "Status: ok\n", - "Response code: 200\n", - "Received result\n" + "\u001b[31mState: running\n", + "Stage (2/3): compute similarity scores\u001b[0m\n", + "\u001b[31mState: completed\n", + "Stage (3/3): compute output\u001b[0m\n", + "\u001b[31mState: completed\n", + "Stage (3/3): compute output\u001b[0m\n", + "\u001b[31mDownloading result\u001b[0m\n", + "\u001b[31mReceived result\u001b[0m\n" ] } ], "source": [ - "!clkutil results \\\n", + "!clkutil results --watch \\\n", " --project=\"{credentials['project_id']}\" \\\n", " --apikey=\"{credentials['result_token']}\" \\\n", " --run=\"{run_info['run_id']}\" \\\n", " --server=\"{SERVER}\" \\\n", - " --output results.txt\n", - "\n", + " --output results.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "pycharm": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The service linked 4001 entities.\n" + ] + } + ], + "source": [ "with open('results.txt') as f:\n", " str_mapping = json.load(f)['mapping']\n", - " mapping = {int(k): int(v) for k,v in str_mapping.items()}\n", + "\n", + "mapping = {int(k): int(v) for k,v in str_mapping.items()}\n", "print('The service linked {} entities.'.format(len(mapping)))" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ - "Let's investigate some of those matches and the overall matching quality" + "Let's investigate some of those matches and the overall matching quality. In this case we have the ground truth so we can compute the precision, recall, and accuracy." 
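The evaluation that follows relies on the FEBRL ground truth encoded in the record identifiers themselves: a record 'rec-1016-org' in dataset A corresponds to 'rec-1016-dup-0' in dataset B, and the code keys on the middle component of the id. A sketch of that convention (illustration only; entity_number is a made-up helper name):

    def entity_number(rec_id):
        # 'rec-1016-org' -> '1016'; the evaluation counts a link as a true
        # positive when both linked records share this entity number.
        return rec_id.split('-')[1]

    assert entity_number('rec-1016-org') == entity_number('rec-1016-dup-0')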
] }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, + "execution_count": 23, + "metadata": { + "pycharm": {} + }, + "outputs": [], + "source": [ + "with open('PII_a.csv', 'rt') as f:\n", + " a_raw = f.readlines()\n", + "with open('PII_b.csv', 'rt') as f:\n", + " b_raw = f.readlines()\n", + "\n", + "num_entities = len(b_raw) - 1\n", + "\n", + "def describe_accuracy(mapping, show_examples=False):\n", + " if show_examples:\n", + " print('idx_a, idx_b, rec_id_a, rec_id_b')\n", + " print('---------------------------------------------')\n", + " for a_i in range(10):\n", + " if a_i in mapping:\n", + " a_data = a_raw[a_i + 1].split(',')\n", + " b_data = b_raw[mapping[a_i] + 1].split(',')\n", + " print('{:3}, {:6}, {:>15}, {:>15}'.format(a_i+1, mapping[a_i]+1, a_data[0], b_data[0]))\n", + " print('---------------------------------------------')\n", + " \n", + " TP = 0; FP = 0; TN = 0; FN = 0\n", + " for a_i in range(num_entities):\n", + " if a_i in mapping:\n", + " if a_raw[a_i + 1].split(',')[0].split('-')[1] == b_raw[mapping[a_i] + 1].split(',')[0].split('-')[1]:\n", + " TP += 1\n", + " else:\n", + " FP += 1\n", + " # as we only report one mapping for each element in PII_a, \n", + " # then a wrong mapping is not only a false positive, but \n", + " # also a false negative, as we won't report the true mapping.\n", + " FN += 1 \n", + " else:\n", + " FN += 1 # every element in PII_a has a partner in PII_b\n", + "\n", + "\n", + " print('Precision: {:.2f}, Recall: {:.2f}, Accuracy: {:.2f}'.format(\n", + " TP/(TP+FP), \n", + " TP/(TP+FN), \n", + " (TP+TN)/(TP+TN+FP+FN)))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "pycharm": {} + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "idx_a, idx_b, rec_id_a, rec_id_b\n", - "--------------------------------\n", - "2, 2751, rec-1016-org, rec-1016-dup-0\n", - "3, 4657, rec-4405-org, rec-4405-dup-0\n", - "4, 4120, rec-1288-org, rec-1288-dup-0\n", - "5, 3307, rec-3585-org, rec-3585-dup-0\n", - "7, 3945, rec-1985-org, rec-1985-dup-0\n", - "8, 993, rec-2404-org, rec-2404-dup-0\n", - "9, 4613, rec-1473-org, rec-1473-dup-0\n", - "10, 3630, rec-453-org, rec-453-dup-0\n", - "--------------------------------\n", - "Precision: 1.0, Recall: 0.7272, Accuracy: 0.7272\n" + "idx_a, idx_b, rec_id_a, rec_id_b\n", + "---------------------------------------------\n", + " 2, 2751, rec-1016-org, rec-1016-dup-0\n", + " 3, 4657, rec-4405-org, rec-4405-dup-0\n", + " 4, 4120, rec-1288-org, rec-1288-dup-0\n", + " 5, 3307, rec-3585-org, rec-3585-dup-0\n", + " 6, 2306, rec-298-org, rec-298-dup-0\n", + " 7, 3945, rec-1985-org, rec-1985-dup-0\n", + " 8, 993, rec-2404-org, rec-2404-dup-0\n", + " 9, 4613, rec-1473-org, rec-1473-dup-0\n", + " 10, 3630, rec-453-org, rec-453-dup-0\n", + "---------------------------------------------\n", + "Precision: 1.00, Recall: 0.80, Accuracy: 0.80\n" ] } ], "source": [ - "with open('PII_a.csv', 'rt') as f:\n", - " a_raw = f.readlines()\n", - "with open('PII_b.csv', 'rt') as f:\n", - " b_raw = f.readlines()\n", + "describe_accuracy(mapping, True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": {} + }, + "source": [ + "Precision tells us about how many of the found matches are actual matches. 
The score of 1.0 means that we did perfectly in this respect, however, **recall**, the measure of how many of the actual matches were correctly identified, is quite low with only 80%.\n", "\n", - "num_entities = len(b_raw) - 1\n", + "Let's go back and create another mapping with a `threshold` value of `0.8`." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "pycharm": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mEntity Matching Server: https://testing.es.data61.xyz\u001b[0m\n" + ] + } + ], + "source": [ + "!clkutil create --verbose \\\n", + " --server=\"{SERVER}\" \\\n", + " --output \"run_info.json\" \\\n", + " --threshold=0.8 \\\n", + " --project=\"{credentials['project_id']}\" \\\n", + " --apikey=\"{credentials['result_token']}\" \\\n", + " --name=\"CLI tutorial run B\"\n", "\n", - "print('idx_a, idx_b, rec_id_a, rec_id_b')\n", - "print('--------------------------------')\n", - "for a_i in range(10):\n", - " if a_i in mapping:\n", - " a_data = a_raw[a_i + 1].split(',')\n", - " b_data = b_raw[mapping[a_i] + 1].split(',')\n", - " print('{}, {}, {}, {}'.format(a_i+1, mapping[a_i]+1, a_data[0], b_data[0]))\n", - "\n", - "TP = 0; FP = 0; TN = 0; FN = 0\n", - "for a_i in range(num_entities):\n", - " if a_i in mapping:\n", - " if a_raw[a_i + 1].split(',')[0].split('-')[1] == b_raw[mapping[a_i] + 1].split(',')[0].split('-')[1]:\n", - " TP += 1\n", - " else:\n", - " FP += 1\n", - " FN += 1 # as we only report one mapping for each element in PII_a, then a wrong mapping is not only a false positive, but also a false negative, as we won't report the true mapping.\n", - " else:\n", - " FN += 1 # every element in PII_a has a partner in PII_b\n", + "run_info = json.load(open('run_info.json', 'rt'))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "pycharm": {}, + "tags": [ + "nbval-ignore-output" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mState: running\n", + "Stage (2/3): compute similarity scores\u001b[0m\n", + "\u001b[31mState: running\n", + "Stage (2/3): compute similarity scores\u001b[0m\n", + "\u001b[31mState: completed\n", + "Stage (3/3): compute output\u001b[0m\n", + "\u001b[31mDownloading result\u001b[0m\n", + "\u001b[31mReceived result\u001b[0m\n" + ] + } + ], + "source": [ + "!clkutil results --watch \\\n", + " --project=\"{credentials['project_id']}\" \\\n", + " --apikey=\"{credentials['result_token']}\" \\\n", + " --run=\"{run_info['run_id']}\" \\\n", + " --server=\"{SERVER}\" \\\n", + " --output results.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "pycharm": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The service linked 4975 entities.\n", + "Precision: 1.00, Recall: 0.99, Accuracy: 0.99\n" + ] + } + ], + "source": [ + "with open('results.txt') as f:\n", + " str_mapping = json.load(f)['mapping']\n", + "\n", + "mapping = {int(k): int(v) for k,v in str_mapping.items()}\n", "\n", - "print('--------------------------------')\n", - "print('Precision: {}, Recall: {}, Accuracy: {}'.format(TP/(TP+FP), TP/(TP+FN), (TP+TN)/(TP+TN+FP+FN)))" + "print('The service linked {} entities.'.format(len(mapping)))\n", + "describe_accuracy(mapping)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": {} + }, "source": [ - "Precision tells us about how many of the found matches are actual matches. 
The score of 1.0 means that we did perfectly in this respect, however, recall, the measure of how many of the actual matches were correctly identified, is quite low with only 73%.\n",
+    "Great, for this threshold value we get a precision of 100% and a recall of 99%. \n",
+    "\n",
+    "The explanation is that when the information about an entity differs slightly in the two datasets (e.g. spelling errors, abbreviations, missing values, ...) then the corresponding CLKs will differ in some number of bits as well. For the datasets in this tutorial the perturbations are such that only 80% of the derived CLK pairs overlap more than 90% (the first threshold), whereas 99% of all matching pairs overlap more than 80%.\n",
    "\n",
-    "Let's go back and create another mapping with a `threshold` value of `0.8`.\n",
+    "If we keep reducing the threshold value, then we will start to observe mistakes in the found matches -- the precision decreases (if an entry in dataset A has no match in dataset B, but we keep reducing the threshold, eventually a comparison with an entry in B will be above the threshold leading to a false match). But at the same time the recall value will keep increasing for a while, as a lower threshold allows for more of the actual matches to be found. However, as our example dataset only contains matches (every entry in A has a match in B), this phenomenon cannot be observed. With the threshold `0.75` we identify all matches correctly (at the cost of a longer execution time)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "pycharm": {}
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31mEntity Matching Server: https://testing.es.data61.xyz\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!clkutil create --verbose \\\n",
    "        --server=\"{SERVER}\" \\\n",
    "        --output \"run_info.json\" \\\n",
    "        --threshold=0.75 \\\n",
    "        --project=\"{credentials['project_id']}\" \\\n",
    "        --apikey=\"{credentials['result_token']}\" \\\n",
    "        --name=\"CLI tutorial run B\"\n",
    "\n",
    "run_info = json.load(open('run_info.json', 'rt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "pycharm": {},
    "tags": [
     "nbval-ignore-output"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31mState: running\n",
      "Stage (2/3): compute similarity scores\u001b[0m\n",
      "\u001b[31mState: running\n",
      "Stage (2/3): compute similarity scores\u001b[0m\n",
      "\u001b[31mState: running\n",
      "Stage (3/3): compute output\u001b[0m\n",
      "\u001b[31mState: completed\n",
      "Stage (3/3): compute output\u001b[0m\n",
      "\u001b[31mDownloading result\u001b[0m\n",
      "\u001b[31mReceived result\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!clkutil results --watch \\\n",
    "        --project=\"{credentials['project_id']}\" \\\n",
    "        --apikey=\"{credentials['result_token']}\" \\\n",
    "        --run=\"{run_info['run_id']}\" \\\n",
    "        --server=\"{SERVER}\" \\\n",
    "        --output results.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "pycharm": {}
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The service linked 4995 entities.\n",
      "Precision: 1.00, Recall: 1.00, Accuracy: 1.00\n"
     ]
    }
   ],
   "source": [
    "with open('results.txt') as f:\n",
    "    str_mapping = json.load(f)['mapping']\n",
    "\n",
+    "mapping = {int(k): int(v) for k,v in str_mapping.items()}\n",
    "\n",
-    "The explanation is that when the information about an entity differs slightly in the two datasets (e.g. spelling errors, abbrevations, missing values, ...) then the corresponding CLKs will differ in some number of bits as well. For the datasets in this tutorial the perturbations are such that only 72.7% of the derived CLK pairs overlap more than 90%. Whereas almost all matching pairs overlap more than 80%.\n",
+    "print('The service linked {} entities.'.format(len(mapping)))\n",
+    "describe_accuracy(mapping)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {}
   },
   "source": [
    "It is important to choose an appropriate threshold for the amount of perturbations present in the data.\n",
    "\n",
-    "If we keep reducing the threshold value, then we will start to observe mistakes in the found matches -- the precision decreases. But at the same time the recall value will keep increasing for a while, as a lower threshold allows for more of the actual matches to be found, e.g.: for threshold 0.72, we get precision: 0.997 and recall: 0.992. However, reducing the threshold further will eventually lead to a decrease in both precision and recall: for threshold 0.65 precision is 0.983 and recall is 0.980. Thus it is important to choose an appropriate threshold for the amount of perturbations present in the data.\n",
+    "Feel free to go back to the CLK generation and experiment on how different settings will affect the matching quality.\n",
    "\n",
-    "This concludes the tutorial. Feel free to go back to the CLK generation and experiment on how different setting will affect the matching quality."
+    "Finally, to remove the uploaded CLK data from the service, delete the project:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "execution_count": 31,
+   "metadata": {
+    "pycharm": {}
+   },
   "outputs": [],
-   "source": []
+   "source": [
+    "# TODO\n",
+    "\n",
+    "#!clkutil delete-project --project=\"{credentials['project_id']}\" \\\n",
+    "#                        --apikey=\"{credentials['result_token']}\" \\\n",
+    "#                        --run=\"{run_info['run_id']}\" \\\n",
+    "#                        --server=\"{SERVER}\""
+   ]
  }
 ],
 "metadata": {
@@ -861,7 +1365,16 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.7.3"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "metadata": {
+     "collapsed": false
+    },
+    "source": []
+   }
  }
 },
 "nbformat": 4,
diff --git a/docs/tutorial_sanitize.cfg b/docs/tutorial_sanitize.cfg
new file mode 100644
index 00000000..5e16616d
--- /dev/null
+++ b/docs/tutorial_sanitize.cfg
@@ -0,0 +1,23 @@
+[hashing-time-regex]
+regex: \d{1,2}:\d{1,2}<\d{1,2}:\d{1,2}
+replace: HASHING-TIME
+
+[hashing-rate-regex]
+regex: \d{1,3}.\d{1,2}k*clk
+replace: HASHING-RATE
+
+[service-status]
+regex: \"project_count\":\s\d+,\s\"rate\":\s\d+,
+replace: SERVICE-STATUS
+
+[newline]
+regex: \r\n
+replace: \n
+
+[token]
+regex: [\"']\w{48}[\"']
+replace: TOKEN
+
+[run-status]
+regex: State:.*\nStage.*
+replace: RUN-STATUS
diff --git a/requirements.txt b/requirements.txt
index 8c3aa001..98ea6eb6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@
 cryptography==2.6.1
 enum34==1.1.6; python_version < '3.4'
 future==0.17.1
 futures==3.2.0; python_version < '3.2'
-jsonschema==2.6.0
+jsonschema==2.6
 mypy_extensions==0.4.1
 pyblake2==1.1.2; python_version < '3.6'
 pytest==4.5.0
diff --git a/setup.py b/setup.py
index a34499db..1bde7ce9 100644
--- a/setup.py
+++ b/setup.py
@@ 
-10,7 +10,7 @@ "futures>=3.1; python_version < '3.2'", # Backport from Py3.2 "mypy_extensions>=0.3", "pyblake2>=1.1.1; python_version < '3.6'", - "jsonschema>=2.6", + "jsonschema==2.6", "requests>=2.20", "tqdm>=4.24", "typing>=3.6; python_version < '3.5'", # Backport from Py3.5 diff --git a/tests/__init__.py b/tests/__init__.py index 213c625b..c78c855c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -3,25 +3,27 @@ import tempfile import clkhash -SIMPLE_SCHEMA_PATH = os.path.join( +TESTDATA = os.path.join( os.path.dirname(__file__), - 'testdata', - 'simple-schema.json' -) -SAMPLE_DATA_SCHEMA_PATH = os.path.join( - os.path.dirname(__file__), - 'testdata', - 'dirty-data-schema.json' + 'testdata' ) +SIMPLE_SCHEMA_PATH = os.path.join(TESTDATA, 'simple-schema.json') + +SAMPLE_DATA_SCHEMA_PATH = os.path.join(TESTDATA, 'dirty-data-schema.json') + +GOOD_SCHEMA_V1_PATH = os.path.join(TESTDATA, 'good-schema-v1.json') +GOOD_SCHEMA_V2_PATH = os.path.join(TESTDATA, 'good-schema-v2.json') +BAD_SCHEMA_V1_PATH = os.path.join(TESTDATA, 'bad-schema-v1.json') +BAD_SCHEMA_V2_PATH = os.path.join(TESTDATA, 'bad-schema-v2.json') + RANDOMNAMES_SCHEMA_PATH = os.path.join( - os.path.dirname(clkhash.__file__), - 'data', - 'randomnames-schema.json' + TESTDATA, + 'randomnames-schema-v2.json' ) -SAMPLE_DATA_PATH_1 = os.path.join(os.path.dirname(__file__), 'testdata', 'dirty_1000_50_1.csv') -SAMPLE_DATA_PATH_2 = os.path.join(os.path.dirname(__file__), 'testdata', 'dirty_1000_50_2.csv') +SAMPLE_DATA_PATH_1 = os.path.join(TESTDATA, 'dirty_1000_50_1.csv') +SAMPLE_DATA_PATH_2 = os.path.join(TESTDATA, 'dirty_1000_50_2.csv') class temporary_file(object): diff --git a/tests/test_cli.py b/tests/test_cli.py index 5ce35f71..ee96b53c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,7 +18,7 @@ import clkhash.cli from clkhash import randomnames -from tests import temporary_file, create_temp_file, SIMPLE_SCHEMA_PATH, RANDOMNAMES_SCHEMA_PATH +from tests import * class CLITestHelper(unittest.TestCase): @@ -100,7 +100,9 @@ class BasicCLITests(unittest.TestCase): def test_list_commands(self): runner = CliRunner() result = runner.invoke(clkhash.cli.cli, []) - for expected_command in 'hash', 'upload', 'create', 'results', 'generate', 'benchmark': + expected_commands = ['benchmark', 'create', 'create-project', 'generate', + 'hash', 'upload', 'results', 'validate-schema'] + for expected_command in expected_commands: assert expected_command in result.output def test_version(self): @@ -135,6 +137,41 @@ def test_bench(self): assert 'hashes in' in result.output +class TestSchemaValidationCommand(unittest.TestCase): + + @staticmethod + def validate_schema(schema_path): + runner = CliRunner() + result = runner.invoke(clkhash.cli.cli, [ + 'validate-schema', schema_path + ]) + return result + + def test_good_v1_schema(self): + for schema_path in GOOD_SCHEMA_V1_PATH, SIMPLE_SCHEMA_PATH: + result = self.validate_schema(schema_path) + assert result.exit_code == 0 + assert 'schema is valid' in result.output + + def test_bad_v1_schema(self): + result = self.validate_schema(BAD_SCHEMA_V1_PATH) + assert result.exit_code == -1 + assert 'schema is not valid.' 
in result.output
+        assert "'l' is a required property" in result.output
+
+
+    def test_good_v2_schema(self):
+        for schema_path in GOOD_SCHEMA_V2_PATH, RANDOMNAMES_SCHEMA_PATH:
+            result = self.validate_schema(schema_path)
+            assert result.exit_code == 0
+            assert 'schema is valid' in result.output
+
+    def test_bad_v2_schema(self):
+        result = self.validate_schema(BAD_SCHEMA_V2_PATH)
+        assert result.exit_code == -1
+        assert 'schema is not valid.' in result.output
+
+
 @unittest.skipUnless("INCLUDE_CLI" in os.environ, "Set envvar INCLUDE_CLI to run. Disabled for jenkins")
 class TestHashCommand(unittest.TestCase):
diff --git a/tests/test_field_formats.py b/tests/test_field_formats.py
index 1ef756cd..3b55db8e 100644
--- a/tests/test_field_formats.py
+++ b/tests/test_field_formats.py
@@ -16,7 +16,7 @@ def test_string_regex(self):
                 pattern=r'[5-9',  # This is syntactically incorrect.
                 description='foo'),
             hashing=dict(
-                ngram=1, k=20))
+                ngram=1, strategy=dict(k=20)))

         # Make sure we don't accept bad regular expressions.
         with self.assertRaises(field_formats.InvalidSchemaError):
@@ -91,7 +91,7 @@ def test_string_nonregex_from_json_dict(self):
             hashing=dict(
                 ngram=1,
                 positional=True,
-                k=20))
+                strategy=dict(k=20)))

         spec = field_formats.spec_from_json_dict(spec_dict)

@@ -202,7 +202,7 @@ def test_string_default_encoding_nonregex(self):
             hashing=dict(
                 ngram=1,
                 positional=True,
-                k=20))
+                strategy=dict(k=20)))

         spec = field_formats.spec_from_json_dict(spec_dict)

@@ -222,7 +222,7 @@ def test_string_default_encoding_regex(self):
             hashing=dict(
                 ngram=1,
                 positional=True,
-                k=20))
+                strategy=dict(k=20)))

         spec = field_formats.spec_from_json_dict(spec_dict)

@@ -239,17 +239,21 @@ def test_string_default_encoding_regex(self):
         self.assertEqual(spec.hashing_properties.encoding, 'utf-8')

     def test_integer(self):
-        regex_spec = dict(
-            identifier='Z',
-            format=dict(
+        json_spec = {
+            'identifier': 'Z',
+            'format': {
                 # Missing 'minimum' and 'maximum'.
-                type='integer',
-                description='buzz'),
-            hashing=dict(
-                ngram=1, k=20,
-                positional=True))
+                'type': 'integer',
+                'description': 'buzz'
+            },
+            'hashing': {
+                'ngram': 1,
+                'strategy': {'k': 20},
+                'positional': True
+            }
+        }

-        spec = field_formats.spec_from_json_dict(regex_spec)
+        spec = field_formats.spec_from_json_dict(json_spec)

         # `minimum` and `maximum` should be None.
         self.assertIsNone(spec.minimum)
@@ -279,9 +283,10 @@ def test_integer(self):
         self.assertEqual('10', spec.format_value(int_str))

         # Ok, let's put a 'minimum' and 'maximum' in.
-        regex_spec['format']['minimum'] = 8
-        regex_spec['format']['maximum'] = 12
-        spec = field_formats.spec_from_json_dict(regex_spec)
+
+        json_spec['format']['minimum'] = 8
+        json_spec['format']['maximum'] = 12
+        spec = field_formats.spec_from_json_dict(json_spec)

         # These are too small, thus invalid. 
with self.assertRaises(field_formats.InvalidEntryError): @@ -311,24 +316,22 @@ def test_integer(self): self.assertEqual(spec.hashing_properties.k, 20) # check with missing values - regex_spec['hashing']['missingValue'] = dict(sentinel='None', replaceWith='42') - spec = field_formats.spec_from_json_dict(regex_spec) + json_spec['hashing']['missingValue'] = dict(sentinel='None', replaceWith='42') + spec = field_formats.spec_from_json_dict(json_spec) # validating the sentinel should work spec.validate('None') self.assertEqual('42', spec.hashing_properties.replace_missing_value('None')) def test_date(self): - regex_spec = dict( - identifier='dates', - format=dict( - type='date', - format='%Y-%m-%d', - description='phoenix dactylifera'), - hashing=dict( - ngram=0, - k=20)) + json_spec = { + 'identifier': 'dates', + 'format': { + 'type': 'date', 'format': '%Y-%m-%d', + 'description': 'phoenix dactylifera'}, + 'hashing': {'ngram': 0, 'strategy': {'k': 20}} + } - spec = field_formats.spec_from_json_dict(regex_spec) + spec = field_formats.spec_from_json_dict(json_spec) # These are valid dates. spec.validate('1946-06-14') @@ -407,7 +410,7 @@ def test_date_output_formatting(self): format=dict( type='date', format='%Y:%m-%d'), - hashing=dict(ngram=0, k=20)) + hashing=dict(ngram=0, strategy=dict(k=20))) spec = field_formats.spec_from_json_dict(regex_spec) from datetime import date @@ -418,15 +421,12 @@ def test_date_output_formatting(self): spec.format_value('yesterday') def test_enum(self): - spec_dict = dict( - identifier='testingAllTheEnums', - format=dict( - type='enum', - values=['dogs', 'cats', u'fërrets'], - description='fizz'), - hashing=dict( - ngram=2, - k=20)) + spec_dict = { + 'identifier': 'testingAllTheEnums', + 'format': { + 'type': 'enum', + 'values': ['dogs', 'cats', u'fërrets'], + 'description': 'fizz'}, 'hashing': {'ngram': 2, 'strategy': {'k': 20}}} spec = field_formats.spec_from_json_dict(spec_dict) diff --git a/tests/test_missingValue_integration.py b/tests/test_missingValue_integration.py index b0d749b7..1271d580 100644 --- a/tests/test_missingValue_integration.py +++ b/tests/test_missingValue_integration.py @@ -1,24 +1,63 @@ from clkhash import schema from clkhash.clk import generate_clks +import json def test_missing_value_integration(): # we create two clks, one from PII which contains the 'replaceWith' values, one which contains the sentinels. # if everything goes right, then the two clks will be identical. 
- schema_dict = dict(version=1, - clkConfig=dict(l=1024, k=20, hash=dict(type='doubleHash'), kdf=dict(type='HKDF')), - features=[ - dict(identifier='name', - format=dict(type='string', encoding='utf-8'), - hashing=dict(ngram=2, missingValue=dict(sentinel='null', replaceWith='Bob'))), - dict(identifier='age', - format=dict(type='integer'), - hashing=dict(ngram=1, missingValue=dict(sentinel='NA', replaceWith='42'))) - ]) + + schema_json = """ + { + "version": 2, + "clkConfig": { + "l": 1024, + "kdf": { + "type": "HKDF" + } + }, + "features": [ + { + "identifier": "name", + "format": { + "type": "string", + "encoding": "utf-8" + }, + "hashing": { + "ngram": 2, + "strategy": { + "k": 20 + }, + "missingValue": { + "sentinel": "null", + "replaceWith": "Bob" + } + } + }, + { + "identifier": "age", + "format": { + "type": "integer" + }, + "hashing": { + "ngram": 1, + "strategy": { + "k": 20 + }, + "missingValue": { + "sentinel": "NA", + "replaceWith": "42" + } + } + } + ] + } + """ + schema_dict = json.loads(schema_json) s = schema.from_json_dict(schema_dict) pii = [['Bob', '42'], ['null', 'NA']] clks = generate_clks(pii, schema=s, keys=('sec1', 'sec2')) assert len(clks) == 2 - assert clks[0] == clks[1] \ No newline at end of file + assert clks[0] == clks[1] diff --git a/tests/test_schema.py b/tests/test_schema.py index 9456430a..9300436f 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -114,7 +114,7 @@ def test_validation_of_illdefined_not_ignored_feature(self): schema.from_json_dict(schema_dict) exception = contextmanager.exception - self.assertIsInstance(exception.__cause__, ValidationError) + self.assertIsInstance(exception, SchemaError) class TestSchemaLoading(unittest.TestCase): diff --git a/tests/testdata/bad-schema-v2.json b/tests/testdata/bad-schema-v2.json new file mode 100644 index 00000000..7edc8211 --- /dev/null +++ b/tests/testdata/bad-schema-v2.json @@ -0,0 +1,63 @@ +{ + "version": 2, + "clkConfig": { + "l": 1024, + "kdf": { + "type": "HKDF", + "hash": "SHA256", + "salt": "SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==", + "info": "c2NoZW1hX2V4YW1wbGU=", + "keySize": 64 + } + }, + "features": [ + { + "identifier": "INDEX", + "ignored": false + }, + { + "identifier": "NAME freetext", + "format": { + "type": "string", + "encoding": "utf-8", + "case": "mixed", + "minLength": 3 + }, + "hashing": { + "ngram": 2, + "strategy": { + "k": 15 + }, + "hash": {"type": "doubleHash"} + } + }, + { + "identifier": "DOB YYYY/MM/DD", + "format": { + "type": "date", + "description": "Numbers separated by slashes, in the year, month, day order", + "format": "%Y/%m/%d" + }, + "hashing": { + "ngram": 1, + "positional": true, + "k": 30, + "hash": {"type": "doubleHash"} + } + }, + { + "identifier": "GENDER M or F", + "format": { + "type": "enum", + "values": ["M", "F"] + }, + "hashing": { + "ngram": 1, + "strategy": { + "k": 60 + }, + "hash": {"type": "doubleHash"} + } + } + ] +} diff --git a/tests/testdata/good-schema-v2.json b/tests/testdata/good-schema-v2.json index b1d9d517..4331585c 100644 --- a/tests/testdata/good-schema-v2.json +++ b/tests/testdata/good-schema-v2.json @@ -27,7 +27,9 @@ "hashing": { "hash": {"type": "doubleHash"}, "ngram": 2, - "numBits": 200 + "strategy": { + "numBits": 200 + } } }, { @@ -42,7 +44,9 @@ "hashing": { "hash": {"type": "blakeHash"}, "ngram": 2, - "k": 20 + "strategy": { + "k": 20 + } } }, { @@ -54,7 +58,8 @@ "case": "upper" }, "hashing": { - "ngram": 2 + "ngram": 2, + "strategy": {"numBits": 200} } }, { @@ -67,6 
+72,7 @@ }, "hashing": { "ngram": 1, + "strategy": {"numBits": 200}, "positional": true } }, @@ -79,6 +85,7 @@ }, "hashing": { "ngram": 1, + "strategy": {"numBits": 200}, "positional": true } }, @@ -90,7 +97,8 @@ "case": "lower" }, "hashing": { - "ngram": 2 + "ngram": 2, + "strategy": {"numBits": 200} } }, { @@ -101,7 +109,8 @@ "description": "'O' for other. If unknown leave empty." }, "hashing": { - "ngram": 1 + "ngram": 1, + "strategy": {"numBits": 100} } }, { @@ -114,6 +123,7 @@ }, "hashing": { "ngram": 1, + "strategy": {"numBits": 300}, "positional": true } }, @@ -127,7 +137,8 @@ "description": "Month and day of birth (eg: 1125)" }, "hashing": { - "ngram": 2 + "ngram": 2, + "strategy": {"numBits": 200} } }, { @@ -140,6 +151,7 @@ }, "hashing": { "ngram": 1, + "strategy": {"numBits": 200}, "positional": true } }, @@ -152,6 +164,7 @@ }, "hashing": { "ngram": 1, + "strategy": {"numBits": 100}, "positional": true } } diff --git a/tests/testdata/randomnames-schema-num-bits-v2.json b/tests/testdata/randomnames-schema-num-bits-v2.json index d5c164e9..fa2e403b 100644 --- a/tests/testdata/randomnames-schema-num-bits-v2.json +++ b/tests/testdata/randomnames-schema-num-bits-v2.json @@ -25,7 +25,9 @@ }, "hashing": { "ngram": 2, - "numBits": 100, + "strategy": { + "numBits": 100 + }, "hash": {"type": "doubleHash"} } }, @@ -39,7 +41,9 @@ "hashing": { "ngram": 1, "positional": true, - "numBits": 200, + "strategy": { + "numBits": 200 + }, "hash": {"type": "doubleHash"} } }, @@ -51,7 +55,9 @@ }, "hashing": { "ngram": 1, - "numBits": 400, + "strategy": { + "numBits": 400 + }, "hash": {"type": "doubleHash"} } } diff --git a/tests/testdata/randomnames-schema-v2.json b/tests/testdata/randomnames-schema-v2.json index 67a68fb4..abf7f55f 100644 --- a/tests/testdata/randomnames-schema-v2.json +++ b/tests/testdata/randomnames-schema-v2.json @@ -25,7 +25,9 @@ }, "hashing": { "ngram": 2, - "k": 15, + "strategy": { + "k": 15 + }, "hash": {"type": "doubleHash"} } }, @@ -39,7 +41,9 @@ "hashing": { "ngram": 1, "positional": true, - "k": 30, + "strategy": { + "k": 30 + }, "hash": {"type": "doubleHash"} } }, @@ -51,7 +55,9 @@ }, "hashing": { "ngram": 1, - "k": 60, + "strategy": { + "k": 60 + }, "hash": {"type": "doubleHash"} } }
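All of the schema fixture changes above make the same v2 adjustment: the per-feature 'k' and 'numBits' settings move inside a 'strategy' object. A sketch of checking such a schema programmatically, reusing the from_json_dict and SchemaError names that appear elsewhere in this changeset; the file path is only an example, and it is assumed here that from_json_dict validates by default.

    import json
    from clkhash import schema
    from clkhash.schema import SchemaError

    with open('tests/testdata/randomnames-schema-v2.json') as f:
        candidate = json.load(f)

    try:
        # Assumed behaviour: raises SchemaError if, for example,
        # 'k' sits outside the 'strategy' object in a v2 schema.
        schema.from_json_dict(candidate)
        print('schema is valid')
    except SchemaError as e:
        print('schema is not valid.', e)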