Skip to content

Commit 2f9f9f1

Browse files
authored
Updates to schema v2 (#255)
* Make the v2 schema stricter with additional properties * Fix the v2 testdata schemas to be compliant with the specification * Fix the hardcoded feature configs in the unit tests to be compliant with the spec * Bugfix in converting schemas from v1 to v2 * Update documentation regarding linkage schema v2 * Add a v2 schema with errors * Invalid schema exceptions keep more context * Add cli command to validate a schema and cli schema validation tests * Avoid a divide by zero when ngrams is empty * cli can throw nice errors when asked to hash with invalid schema * Travis: notebook execution runs in the integration test stage * Expose Schema at top level * Don't include a default hashing strategy in the schema * Update tutorial notebooks to use v2 schema (#257) * Include a notebook sanitizer so travis can test cells match. * Use the new version of anonlink in api tutorial. * fine-tuned schemas for jaw dropping performance demonstration * Schema id now includes version * Update author attribute * Note we don't update to latest jsonschema version as Windows exe built with PyInstaller failed. Closes #254
1 parent 29d8da7 commit 2f9f9f1

29 files changed

+1865
-744
lines changed

.travis.yml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ install:
2222
- travis_retry pip install -e .
2323

2424
script:
25-
- if [ "${INCLUDE_NB_TEST}" == "1" ]; then pytest --cov=clkhash --nbval-lax; else pytest --cov=clkhash; fi
25+
- pytest --cov=clkhash
2626
- codecov
2727

2828

@@ -48,28 +48,33 @@ jobs:
4848
- python: '3.6'
4949
env:
5050
- INCLUDE_CLI=1
51-
- INCLUDE_NB_TEST=1
5251
- python: '2.7'
5352
env:
5453
- INCLUDE_CLI=1
55-
5654
# OSX + Python is officially supported by Travis CI as of April 2011
5755
# https://docs.travis-ci.com/user/reference/osx/
5856
- os: osx
5957
osx_image: xcode8.3
6058
python: "3.6-dev"
6159

60+
- stage: Integration
61+
name: Test Notebooks
62+
python: 3.7
63+
before_install:
64+
- travis_retry pip install -U -r docs/doc-requirements.txt
65+
script:
66+
- pytest --nbval docs -x --sanitize-with docs/tutorial_sanitize.cfg
67+
6268
- stage: Integration
6369
python: '3.8-dev'
6470
env:
6571
- TEST_ENTITY_SERVICE=https://testing.es.data61.xyz
6672
- INCLUDE_CLI=1
6773
- stage: Integration
68-
python: '3.6'
74+
python: '3.7'
6975
env:
7076
- TEST_ENTITY_SERVICE=https://testing.es.data61.xyz
7177
- INCLUDE_CLI=1
72-
- INCLUDE_NB_TEST=1
7378
- stage: Integration
7479
python: '2.7'
7580
env:

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
## 0.13.0
22

3+
- Fix example and test linkage schemas using v2.
34
- Fix mismatch between double hash and blake hash key requirement.
45
- Update to use newer anonlink-entity-service api.
56
- Updates to dependencies.

clkhash/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import pkg_resources
22

33
from . import bloomfilter, field_formats, key_derivation, schema, randomnames, describe
4+
from .schema import Schema
45

56
try:
67
__version__ = pkg_resources.get_distribution('clkhash').version
78
except pkg_resources.DistributionNotFound:
89
__version__ = "development"
910

10-
__author__ = 'N1 Analytics'
11+
__author__ = "Data61"

clkhash/bloomfilter.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -319,9 +319,10 @@ def crypto_bloom_filter(record, # type: Sequence[Text]
319319
if fhp:
320320
ngrams = list(tokenize(field.format_value(entry)))
321321
hash_function = hashing_function_from_properties(fhp)
322-
bloomfilter |= hash_function(ngrams, key,
323-
fhp.ks(len(ngrams)),
324-
hash_l, fhp.encoding)
322+
if ngrams:
323+
bloomfilter |= hash_function(ngrams, key,
324+
fhp.ks(len(ngrams)),
325+
hash_l, fhp.encoding)
325326

326327
c1 = bloomfilter.count()
327328
bloomfilter = fold_xor(bloomfilter, schema.xor_folds)

clkhash/cli.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
run_get_status, project_create, run_create,
1616
server_get_status, ServiceError,
1717
format_run_status, watch_run_status)
18+
from clkhash.schema import SchemaError
1819

1920
DEFAULT_SERVICE_URL = 'https://es.data61.xyz'
2021

@@ -68,8 +69,11 @@ def hash(pii_csv, keys, schema, clk_json, quiet, no_header, check_header, valida
6869
6970
Use "-" for CLK_JSON to write JSON to stdout.
7071
"""
71-
72-
schema_object = clkhash.schema.from_json_file(schema_file=schema)
72+
try:
73+
schema_object = clkhash.schema.from_json_file(schema_file=schema)
74+
except SchemaError as e:
75+
log(str(e))
76+
raise SystemExit(-1)
7377
header = True
7478
if not check_header:
7579
header = 'ignore'
@@ -92,7 +96,7 @@ def hash(pii_csv, keys, schema, clk_json, quiet, no_header, check_header, valida
9296
log("CLK data written to {}".format(clk_json.name))
9397

9498

95-
@cli.command('status', short_help='Get status of entity service')
99+
@cli.command('status', short_help='get status of entity service')
96100
@click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
97101
@click.option('-o', '--output', type=click.File('w'), default='-')
98102
@click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
@@ -141,7 +145,7 @@ def status(server, output, verbose):
141145
@click.option('--name', type=str, help="Name to give this project")
142146
@click.option('--parties', default=2, type=int,
143147
help="Number of parties in the project")
144-
@click.option('-o','--output', type=click.File('w'), default='-')
148+
@click.option('-o', '--output', type=click.File('w'), default='-')
145149
@click.option('-v', '--verbose', is_flag=True, help="Script is more talkative")
146150
def create_project(type, schema, server, name, parties, output, verbose):
147151
"""Create a new project on an entity matching server.
@@ -171,7 +175,7 @@ def create_project(type, schema, server, name, parties, output, verbose):
171175
except ServiceError as e:
172176
log("Unexpected response - {}".format(e.status_code))
173177
log(e.text)
174-
raise SystemExit
178+
raise SystemExit(-1)
175179
else:
176180
log("Project created")
177181

@@ -318,6 +322,28 @@ def generate_default_schema(output):
318322
shutil.copyfile(original_path, output)
319323

320324

325+
@cli.command('validate-schema', short_help="validate linkage schema")
326+
@click.argument('schema', type=click.File('r', lazy=True))
327+
def validate_schema(schema):
328+
"""Validate a linkage schema
329+
330+
Given a file containing a linkage schema, verify that the schema is valid; otherwise
331+
print detailed errors.
332+
"""
333+
334+
try:
335+
clkhash.schema.from_json_file(
336+
schema_file=schema,
337+
validate=True
338+
)
339+
340+
log("schema is valid", color='green')
341+
342+
except SchemaError as e:
343+
log(str(e))
344+
raise SystemExit(-1)
345+
346+
321347
if __name__ == "__main__":
322348
freeze_support()
323349
cli()

clkhash/data/randomnames-schema.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
{
32
"version": 1,
43
"clkConfig": {

clkhash/field_formats.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,13 @@ class InvalidEntryError(ValueError):
2525

2626

2727
class InvalidSchemaError(ValueError):
28-
""" The schema is not valid.
28+
"""Raised if the schema of a field specification is invalid.
2929
30-
This exception is raised if, for example, a regular expression
31-
included in the schema is not syntactically correct.
30+
For example, a regular expression included in the schema is not
31+
syntactically correct.
3232
"""
33+
json_field_spec = None # type: Optional[dict]
34+
field_spec_index = None # type: Optional[int]
3335

3436

3537
class MissingValueSpec(object):
@@ -161,19 +163,17 @@ def fhp_from_json_dict(
161163
"""
162164
Make a :class:`FieldHashingProperties` object from a dictionary.
163165
164-
:param dict json_dict:
165-
The dictionary must have an 'ngram' key
166-
and one of k or num_bits. It may have a
167-
'positional' key; if missing a default is used.
168-
The encoding is
169-
always set to the default value.
170-
:return: A :class:`FieldHashingProperties` instance.
166+
:param dict json_dict:
167+
Conforming to the `hashingConfig` definition
168+
in the `v2` linkage schema.
169+
:return: A :class:`FieldHashingProperties` instance.
171170
"""
171+
hashing_strategy = json_dict['strategy']
172172
h = json_dict.get('hash', {'type': 'blakeHash'})
173-
num_bits = json_dict.get('numBits')
174-
k = json_dict.get('k')
175-
if not num_bits and not k:
176-
num_bits = 200 # default for v2 schema
173+
174+
num_bits = hashing_strategy.get('numBits')
175+
k = hashing_strategy.get('k')
176+
177177
return FieldHashingProperties(
178178
ngram=json_dict['ngram'],
179179
positional=json_dict.get(
@@ -263,7 +263,6 @@ def validate(self, str_in):
263263
e_new.field_spec = self
264264
raise_from(e_new, err)
265265

266-
267266
def is_missing_value(self, str_in):
268267
# type: (Text) -> bool
269268
""" tests if 'str_in' is the sentinel value for this field
@@ -441,6 +440,7 @@ def from_json_dict(cls,
441440
except (SyntaxError, re.error) as e:
442441
msg = "Invalid regular expression '{}.'".format(pattern)
443442
e_new = InvalidSchemaError(msg)
443+
e_new.json_field_spec = json_dict
444444
raise_from(e_new, e)
445445
result.regex_based = True
446446

@@ -843,9 +843,10 @@ def spec_from_json_dict(
843843
json_dict # type: Dict[str, Any]
844844
):
845845
# type: (...) -> FieldSpec
846-
""" Turns a dictionary into the appropriate object.
846+
""" Turns a dictionary into the appropriate FieldSpec object.
847847
848848
:param dict json_dict: A dictionary with properties.
849+
:raises InvalidSchemaError:
849850
:returns: An initialised instance of the appropriate FieldSpec
850851
subclass.
851852
"""

0 commit comments

Comments
 (0)