diff --git a/INSTALL.rst b/INSTALL.rst index 11e4437..9f96252 100644 --- a/INSTALL.rst +++ b/INSTALL.rst @@ -1,2 +1,6 @@ Installation ============ + +json-merger is on PyPI so all you need is: :: + + pip install json-merger diff --git a/README.rst b/README.rst index fe7719f..bce35de 100644 --- a/README.rst +++ b/README.rst @@ -41,9 +41,10 @@ :target: https://github.com/inspirehep/json-merger/blob/master/LICENSE -Invenio module that is able to merge json record objects. - -*This is an experimental developer preview release.* +Module for merging JSON Objects. * Free software: GPLv2 license * Documentation: https://pythonhosted.org/json-merger/ + +* For the simple use-case: ``pip install json-merger`` +* To install contrib dependencies: ``pip install json-merger[contrib]`` diff --git a/docs/api.rst b/docs/api.rst index e7bc7c0..128162a 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -27,4 +27,29 @@ API Docs json_merger ----------- - +.. autoclass:: json_merger.Merger + +json_merger.config +------------------ +.. automodule:: json_merger.config + :members: + +json_merger.comparator +---------------------- +.. automodule:: json_merger.comparator + :members: + +json_merger.stats +----------------- +.. automodule:: json_merger.stats + :members: + +json_merger.errors +------------------ +.. automodule:: json_merger.errors + :members: + +json_merger.conflict +-------------------- +.. automodule:: json_merger.conflict + :members: diff --git a/docs/conf.py b/docs/conf.py index e0b61be..62d9bab 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,6 +52,7 @@ def _warn_node(self, msg, node): 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', ] # Add any paths that contain templates here, relative to this directory. 
@@ -137,8 +138,8 @@ def _warn_node(self, msg, node): html_theme = 'alabaster' html_theme_options = { - 'description': 'Invenio module that is able to merge json record objects.', - 'github_user': 'inveniosoftware', + 'description': 'Module for merging JSON objects.', + 'github_user': 'inspirehep', 'github_repo': 'json-merger', 'github_button': False, 'github_banner': True, @@ -336,3 +337,6 @@ def _warn_node(self, msg, node): # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'https://docs.python.org/': None} + +# Also document __init__ +autoclass_content = 'both' diff --git a/json_merger/__init__.py b/json_merger/__init__.py index c719690..898ec9a 100644 --- a/json_merger/__init__.py +++ b/json_merger/__init__.py @@ -22,17 +22,406 @@ # waive the privileges and immunities granted to it by virtue of its status # as an Intergovernmental Organization or submit itself to any jurisdiction. -"""Invenio module that is able to merge json record objects.""" +"""Module for merging JSON objects. + +To use this module you need to first import the main class: + +>>> from json_merger import Merger + +Then, import the configuration options: + +>>> from json_merger.config import UnifierOps, DictMergerOps + +The Basic Use Case +------------------ + +Let's assume we have JSON records that don't have any list fields -- +They have string keys and as values other objects or primitive types. +In order to perform a merge we assume we have a lowest common ancestor +(``root``), a current version (``head``) and another version which we want to +integrate into our record (``update``). + +>>> root = {'name': 'John'} # A common ancestor of our person record +>>> head = {'name': 'Johnny', 'age': 32} # The current version of the record. +>>> update = {'name': 'Jonathan', 'address': 'Home'} # An updated version. + +In this case we want to use the merger to compute one of the possible versions. 
+ +We create a merger instance in which we provide the default operation for +non-list fields and the one for list fields. + +>>> m = Merger(root, head, update, DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST) +... # Ignore UnifierOps for now. +>>> # We might get some exceptions +>>> from json_merger.errors import MergeError +>>> try: +... m.merge() +... except MergeError: +... pass # We don't care about this now. +>>> m.merged_root == { +... 'name': 'Johnny', +... 'age': 32, +... 'address': 'Home', +... } +True + +The merged version kept the ``age`` field from the ``head`` object and the +``address`` field from the ``update`` object. The ``name`` field was different, +but because the strategy was ``FALLBACK_KEEP_HEAD`` the end result kept the +value from the ``head`` variable. To keep the ``update`` one, one can +use ``FALLBACK_KEEP_UPDATE``: + +>>> m = Merger(root, head, update, DictMergerOps.FALLBACK_KEEP_UPDATE, +... UnifierOps.KEEP_ONLY_HEAD_ENTITIES) +>>> raised_something = False +>>> try: +... m.merge() +... except MergeError: +... raised_something = True +>>> m.merged_root == { +... 'name': 'Jonathan', +... 'age': 32, +... 'address': 'Home', +... } +True + +If this type of conflict occurs, the merger will also populate a ``conflicts`` +field. In this case the conflict holds the alternative name for our record. +Also, because a conflict occurred, the merge method also raised a MergeError. + +For all the types of conflict that can be raised by the ``merge`` method +also check the :class:`json_merger.conflict.ConflictType` documentation. + +>>> from json_merger.conflict import Conflict, ConflictType +>>> m.conflicts[0] == Conflict(ConflictType.SET_FIELD, ('name', ), 'Johnny') +True +>>> raised_something +True + +Merging Lists With Base Values +------------------------------ + +For this example we are going to assume we want to merge sets of badges +that a person can receive. 
+ +>>> root = {'badges': ['bad', 'random']} +>>> head = {'badges': ['cool', 'nice', 'random']} +>>> update = {'badges': ['fun', 'nice', 'healthy']} + +The most simple options are to either keep only the badges available in head +or only the badges available in the update. This can be done by specifying one +of: + + * ``UnifierOps.KEEP_ONLY_HEAD_ENTITIES`` + * ``UnifierOps.KEEP_ONLY_UPDATE_ENTITIES`` + +>>> m = Merger(root, head, update, DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_ONLY_HEAD_ENTITIES) +>>> m.merge() # No conflict here +>>> m.merged_root['badges'] == ['cool', 'nice', 'random'] +True +>>> m = Merger(root, head, update, DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) +>>> m.merge() +>>> m.merged_root['badges'] == ['fun', 'nice', 'healthy'] +True + +If we want to do a union of the elements we can use: + + * ``UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST`` + * ``UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST`` + +>>> m = Merger(root, head, update, DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST) +>>> m.merge() # No conflict here +>>> m.merged_root['badges'] == ['cool', 'fun', 'nice', 'random', 'healthy'] +True +>>> m = Merger(root, head, update, DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST) +>>> m.merge() +>>> m.merged_root['badges'] == ['fun', 'cool', 'nice', 'healthy', 'random'] +True + +These options keep the order relations between the entities. For example, +both ``'fun'`` and ``'cool'`` were placed before the ``'nice'`` entity but +between them there isn't any restriction. In such cases, for +``KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST`` we first pick the elements +that occur only in the `head` list and for +``KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST`` we first pick the ones that +occur only in the `update` list. 
If no such ordering is possible we first +add the elements found in the prioritized list and then the remaining ones. +Also, the method will raise a REORDER conflict. + +>>> m = Merger([], [1, 2, 5, 3], [3, 1, 2, 4], +... DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST) +>>> try: +... m.merge() +... except MergeError: +... pass +>>> m.merged_root == [1, 2, 5, 3, 4] +True +>>> m.conflicts == [Conflict(ConflictType.REORDER, (), None)] +True +>>> m = Merger([], [1, 2, 5, 3], [3, 1, 2, 4], +... DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST) +>>> try: +... m.merge() +... except MergeError: +... pass +>>> m.merged_root == [3, 1, 2, 4, 5] +True +>>> m.conflicts == [Conflict(ConflictType.REORDER, (), None)] +True + +In the case in which ``root`` is represented by the latest automatic update +of a record (e.g. crawling some metadata source), +``head`` by manual edits of ``root`` and ``update`` by a new automatic +update, we might want to preserve only the entities in ``update`` but +notify the user in case some manual addition was removed. + + * ``UnifierOps.KEEP_UPDATE_ENTITIES_CONFLICT_ON_HEAD_DELETE`` + +>>> root = {'badges': ['bad', 'random']} +>>> head = {'badges': ['cool', 'nice', 'random']} +>>> update = {'badges': ['fun', 'nice', 'healthy']} +>>> m = Merger(root, head, update, DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_UPDATE_ENTITIES_CONFLICT_ON_HEAD_DELETE) +>>> try: +... m.merge() +... except MergeError: +... pass +>>> m.merged_root['badges'] == ['fun', 'nice', 'healthy'] +True +>>> m.conflicts == [Conflict(ConflictType.ADD_BACK_TO_HEAD, +... ('badges', ), 'cool')] +True + +In this case, only ``'cool'`` was added "manually" and removed by the update. + + +Merging Lists Of Objects +------------------------ + +Assume the most complex case in which we need to merge lists of objects which +can also contain nested lists. + +>>> root = { +... 'people': [ +... 
{'name': 'John', 'age': 13}, +... {'name': 'Peter'}, +... {'name': 'Max'} +... ]} +>>> head = { +... 'people': [ +... {'name': 'John', 'age': 14, +... 'group': {'id': 'grp01'}, +... 'person_id': '42', +... 'badges': [{'t': 'b0', 'e': True}, {'t': 'b1'}, {'t': 'b2'}]}, +... {'name': 'Peter', 'age': 15, +... 'badges': [{'t': 'b0'}, {'t': 'b1'}, {'t': 'b2'}]}, +... {'name': 'Max', 'age': 16} +... ]} +>>> update = { +... 'people': [ +... {'name': 'Max', 'address': 'work'}, +... {'name': 'Johnnie', 'address': 'home', +... 'group': {'id': 'GRP01'}, +... 'person_id': '42', +... 'age': 15, +... 'badges': [{'t': 'b1'}, {'t': 'b2'}, {'t': 'b0', 'e': False}]}, +... ]} + +First of all we would like to define how two person records represent the same +entity. In this demo data model we can say that two records represent the +same person if any of the following is true: + + * They have the same ``name`` + * They have the same lowercased group id AND the same person_id + +Then we define two badges as equal if they have the same ``t`` attribute. + +In order to define a custom mode of linking records you can add comparator +classes for any of the list fields via the comparators keyword argument. +To define a simple comparison that checks field equality you +can use :class:`json_merger.comparator.PrimaryKeyComparator` + +In this case the fields from above look like this: + +>>> from json_merger.comparator import PrimaryKeyComparator +>>> class PersonComparator(PrimaryKeyComparator): +... primary_key_fields = ['name', ['group.id', 'person_id']] +... normalization_functions = {'group.id': str.lower} +>>> class BadgesComparator(PrimaryKeyComparator): +... primary_key_fields = ['t'] + +Note: + You need to use a comparator class and not a comparator instance when + defining the equality of two objects. + +Next we would like to define how to do the merging: + + * In case of conflict keep ``head`` values. + * For every list try to keep only the update entities. 
+ * For the badges list keep both entities with priority to the ``update`` + values. + +>>> comparators = {'people': PersonComparator, +... 'people.badges': BadgesComparator} +>>> list_merge_ops = { +... 'people.badges': UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST +... } +>>> m = Merger(root, head, update, +... DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_ONLY_UPDATE_ENTITIES, +... comparators=comparators, +... list_merge_ops=list_merge_ops) +>>> try: +... m.merge() +... except MergeError: +... pass +>>> m.merged_root == { +... 'people': [ +... {'name': 'Max', 'address': 'work', 'age': 16}, +... {'name': 'Johnnie', # Only update edited it. +... 'address': 'home', +... 'group': {'id': 'grp01'}, # From KEEP_HEAD +... 'person_id': '42', +... 'age': 14, # From KEEP_HEAD +... 'badges': [{'t': 'b1'}, {'t': 'b2'}, +... {'t': 'b0', 'e': True}], # From KEEP_HEAD +... }, +... ]} +True + +Merging Data Lists +------------------ + +If you want to merge arrays of raw data (that do not encode any entities), +you can use the ``data_lists`` keyword argument. This argument treats +list indices as dictionary keys. + +>>> root = {'f': {'matrix': [[0, 0], [0, 0]]}} +>>> head = {'f': {'matrix': [[1, 1], [0, 0]]}} +>>> update = {'f': {'matrix': [[0, 0], [1, 1]]}} +>>> m = Merger(root, head, update, +... DictMergerOps.FALLBACK_KEEP_HEAD, +... UnifierOps.KEEP_ONLY_UPDATE_ENTITIES, +... data_lists=['f.matrix']) +>>> m.merge() +>>> m.merged_root == {'f': {'matrix': [[1, 1], [1, 1]]}} +True + +Extending Comparators +--------------------- + +The internal API uses classes that extend +:class:`json_merger.comparator.BaseComparator` in order to check the semantic +equality of JSON objects. The internals call the ``get_matches`` method which +is implemented in terms of the ``equal`` method. The most simple method to +extend this class is to override the ``equal`` method. + +>>> from json_merger.comparator import BaseComparator +>>> class CustomComparator(BaseComparator): +... 
def equal(self, obj1, obj2): +... return abs(obj1 - obj2) < 0.2 +>>> comp = CustomComparator([1, 2], [1, 2, 1.1]) +>>> comp.get_matches('l1', 0) # elements matching l1[0] from l2 +[(0, 1), (2, 1.1)] + +If you want to implement another type of assignment you can compute all the +matches and store them in the ``matches`` set by overriding the +``process_lists`` method. You need to put pairs of matching indices between +l1 and l2. + +>>> from json_merger.comparator import BaseComparator +>>> class CustomComparator(BaseComparator): +... def process_lists(self): +... self.matches.add((0, 0)) +... self.matches.add((0, 1)) +>>> comp = CustomComparator(['foo', 'bar'], ['bar', 'foo']) +>>> comp.get_matches('l1', 0) # elements matching l1[0] from l2 +[(0, 'bar'), (1, 'foo')] + +[contrib] Distance Function Matching +------------------------------------ + +To implement fuzzy matching we also allow matching by using a distance +function. This ensures a 1:1 mapping between the entities by minimizing +the total distance between all linked entities. To mark two of them +as equal you can provide a threshold for that distance. (This is why +it's best to normalize it between 0 and 1). Also, for speeding +up the matching you also can hint possible matches by bucketing matching +elements using a normalization function. In the next example we would +match some points in the coordinate system, each of them lying in a specific +square. The distance that we are going to use is the euclidean distance. +We will normalize the points to their integer counterpart. + +>>> from json_merger.contrib.inspirehep.comparators import ( +... DistanceFunctionComparator) +>>> from math import sqrt +>>> class PointComparator(DistanceFunctionComparator): +... distance_function = lambda p1, p2: sqrt((p1[0] - p2[0]) ** 2 + +... (p1[1] - p2[1]) ** 2) +... normalization_functions = [lambda p: (int(p[0]), int(p[1]))] +... 
threshold = 0.5 +>>> l1 = [(1.1, 1.1), (1.2, 1.2), (2.1, 2.1)] +>>> l2 = [(1.11, 1.11), (1.25, 1.25), (2.15, 2.15)] +>>> comp = PointComparator(l1, l2) +>>> comp.get_matches('l1', 0) # elements matching l1[0] from l2 +[(0, (1.11, 1.11))] +>>> # match only the closest element, not everything under threshold. +>>> comp.get_matches('l1', 1) +[(1, (1.25, 1.25))] +>>> comp.get_matches('l1', 2) +[(2, (2.15, 2.15))] + +[contrib] Custom Person Name Distance +------------------------------------- + +We also provide a person name distance based on edit distance normalized +between 0 and 1. You just need to provide a function for tokenizing a full +name into NameToken or NameInitial - check ``simple_tokenize`` in the +contrib directory. This distance function matches initials with full +regular tokens and works with any name permutation. Also, this distance +calculator assumes the full name is inside the ``full_name`` field of a +dictionary. If you have the name in a different field you can just override +the class and call ``super`` on objects having the name in the ``full_name`` +field. + +>>> from json_merger.contrib.inspirehep.author_util import ( +... AuthorNameDistanceCalculator, simple_tokenize) +>>> dst = AuthorNameDistanceCalculator(tokenize_function=simple_tokenize) +>>> dst({'full_name': u'Doe, J.'}, {'full_name': u'John, Doe'}) < 0.1 +True + +Also we have functions for normalizing an author name with different +heuristics to speed up the distance function matching. + +>>> from json_merger.contrib.inspirehep.author_util import ( +... AuthorNameNormalizer) +>>> identity = AuthorNameNormalizer(simple_tokenize) +>>> identity({'full_name': 'Doe, Johnny Peter'}) +('doe', 'johnny', 'peter') +>>> one_fst_name = AuthorNameNormalizer(simple_tokenize, +... first_names_number=1) +>>> one_fst_name({'full_name': 'Doe, Johnny Peter'}) +('doe', 'johnny') +>>> last_name_one_initial = AuthorNameNormalizer(simple_tokenize, +... first_names_number=1, +... 
first_name_to_initial=True) +>>> last_name_one_initial({'full_name': 'Doe, Johnny Peter'}) +('doe', 'j') + +These instances can be used as class parameters for +``DistanceFunctionComparator`` +""" from __future__ import absolute_import, print_function -from .dict_merger import DictMergerOps -from .list_unify import UnifierOps -from .errors import MergeError -from .merger import ListAlignMerger, UpdateMerger +from .merger import Merger from .version import __version__ -__all__ = ('__version__', - 'DictMergerOps', 'UnifierOps', - 'ListAlignMerger', 'UpdateMerger', - 'MergeError') +__all__ = ('__version__', 'Merger') diff --git a/json_merger/comparator.py b/json_merger/comparator.py index b0d02b2..c4fce06 100644 --- a/json_merger/comparator.py +++ b/json_merger/comparator.py @@ -29,17 +29,28 @@ class BaseComparator(object): + """Abstract base class for Entity Comparison.""" def __init__(self, l1, l2): + """ + Args: + l1: First list of entities. + l2: Second list of entities. + """ self.l1 = l1 self.l2 = l2 + self.matches = set() self.process_lists() def process_lists(self): """Do any preprocessing of the lists.""" - pass + for l1_idx, obj1 in enumerate(self.l1): + for l2_idx, obj2 in enumerate(self.l2): + if self.equal(obj1, obj2): + self.matches.add((l1_idx, l2_idx)) - def equal(self, idx_l1, idx_l2): + def equal(self, obj1, obj2): + """Implementation of object equality.""" raise NotImplementedError() def get_matches(self, src, src_idx): @@ -55,8 +66,8 @@ def get_matches(self, src, src_idx): else: target_list = self.l1 comparator = { - 'l1': lambda s_idx, t_idx: self.equal(s_idx, t_idx), - 'l2': lambda s_idx, t_idx: self.equal(t_idx, s_idx) + 'l1': lambda s_idx, t_idx: (s_idx, t_idx) in self.matches, + 'l2': lambda s_idx, t_idx: (t_idx, s_idx) in self.matches, }[src] return [(trg_idx, obj) for trg_idx, obj in enumerate(target_list) @@ -64,25 +75,28 @@ def get_matches(self, src, src_idx): class PrimaryKeyComparator(BaseComparator): - """Renders two objects as equal if 
they have the same primary key. + """Considers two objects as equal if they have the same primary key. If two objects have at least one of the configured primary_key_fields equal then they are equal. A primary key field can be any of: - string: Two objects are equal if the values at the given key paths - are equal. Example: - For 'key1.key2' the objects are equal if - obj1['key1']['key2'] == obj2['key1']['key2']. - list: Two objects are equal if all the values at the key paths - in the list are equal. Example: - For ['key1', 'key2.key3'] the objects are equal if - obj1['key1'] == obj2['key1'] and - obj1['key2']['key3'] == obj2['key2']['key3']. + + string: Two objects are equal if the values at the given key paths + are equal. Example: + For 'key1.key2' the objects are equal if + obj1['key1']['key2'] == obj2['key1']['key2']. + list: Two objects are equal if all the values at the key paths + in the list are equal. Example: + For ['key1', 'key2.key3'] the objects are equal if + obj1['key1'] == obj2['key1'] and + obj1['key2']['key3'] == obj2['key2']['key3']. For normalizing the fields in the objects to be compared, one can add a normalization function for each field in the normalization_functions - dict. Example: + dict. 
+ + Example: Setting the normalization_functions field to: - {'key1': str.lower} + ``{'key1': str.lower}`` would normalize obj1 = {'key1': 'ID123'} and obj2 = {'key1': 'id123'} to obj1 = {'key1': 'id123'} and obj2 = {'key1': 'id123'} @@ -101,10 +115,7 @@ def _have_field_equal(self, obj1, obj2, field): fn = self.normalization_functions.get(field, lambda x: x) return fn(o1) == fn(o2) - def equal(self, idx_l1, idx_l2): - obj1 = self.l1[idx_l1] - obj2 = self.l2[idx_l2] - + def equal(self, obj1, obj2): if obj1 == obj2: return True @@ -120,6 +131,7 @@ def equal(self, idx_l1, idx_l2): class DefaultComparator(BaseComparator): + """Two objects are the same entity if they are fully equal.""" - def equal(self, idx_l1, idx_l2): - return self.l1[idx_l1] == self.l2[idx_l2] + def equal(self, obj1, obj2): + return obj1 == obj2 diff --git a/json_merger/config.py b/json_merger/config.py new file mode 100644 index 0000000..024a8ba --- /dev/null +++ b/json_merger/config.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Inspirehep. +# Copyright (C) 2016 CERN. +# +# Inspirehep is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# Inspirehep is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Inspirehep; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. 
+ + +class DictMergerOps(object): + """Possible strategies for merging two base values. + + Attributes: + FALLBACK_KEEP_HEAD: In case of conflict keep the `head` value. + + FALLBACK_KEEP_UPDATE: In case of conflict keep the `update` value. + """ + allowed_ops = [ + 'FALLBACK_KEEP_HEAD', + 'FALLBACK_KEEP_UPDATE' + ] + +for mode in DictMergerOps.allowed_ops: + setattr(DictMergerOps, mode, mode) + + +class UnifierOps(object): + """ + Attributes: + KEEP_ONLY_HEAD_ENTITIES: Merge entities in `update` with their match + in `head` having as a base the match in `root`. + + KEEP_ONLY_UPDATE_ENTITIES: Merge entities in 'head' with their match + in `update` having as a base the match in `root`. + + KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST: Perform an union of all + entities from `head` and `update` and merge the matching ones. + Also, preserve the order relations between the entities in both + lists. If two entities can have the same position first pick the + one that is present in the `head` object. + + KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST: Same behavior as + KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST but first pick the + `update` entities. + + KEEP_UPDATE_ENTITIES_CONFLICT_ON_HEAD_DELETE: If an entity was added + in the diff between the `root` and `head` lists but it's not + present in the `update` list then raise a conflict. + """ + allowed_ops = [ + 'KEEP_ONLY_HEAD_ENTITIES', + 'KEEP_ONLY_UPDATE_ENTITIES', + 'KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST', + 'KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST', + 'KEEP_UPDATE_ENTITIES_CONFLICT_ON_HEAD_DELETE', + ] + +for mode in UnifierOps.allowed_ops: + setattr(UnifierOps, mode, mode) diff --git a/json_merger/conflict.py b/json_merger/conflict.py index fa05f9e..eb5c200 100644 --- a/json_merger/conflict.py +++ b/json_merger/conflict.py @@ -29,6 +29,23 @@ class ConflictType(object): + """Types of Conflict. + + Attributes: + REORDER: The list specified by the path might need to be reordered. 
+ + MANUAL_MERGE: The triple specified as the conflict body needs to be + manually merged and added to the conflict path. + + ADD_BACK_TO_HEAD: The object specified as the conflict body might + need to be added back to the list specified in the conflict's path. + + SET_FIELD: The object specified as the conflict body needs to be + added at the path specified in the conflict object. + + REMOVE_FIELD: The value or object present at the path specified in + the path conflict needs to be removed. + """ pass _CONFLICTS = ( @@ -43,7 +60,20 @@ class ConflictType(object): class Conflict(tuple): - """Immutable representation of a conflict.""" + """Immutable and Hashable representation of a conflict. + + Attributes: + conflict_type: A :class:`json_merger.conflict.ConflictType` member. + + path: A tuple containing the path to the conflicting field. + + body: Optional value representing the body of the conflict. + + Note: + Even if the conflict body can be any arbitrary object, this is saved + internally as an immutable object so that a Conflict instance can be + safely used in sets or as a dict key. 
+ """ # Based on http://stackoverflow.com/a/4828108 # Compatible with Python<=2.6 @@ -59,7 +89,9 @@ def __new__(cls, conflict_type, path, body): body = property(lambda self: thaw(self[2])) def with_prefix(self, root_path): + """Returns a new conflict with a prepended prefix as a path.""" return Conflict(self.conflict_type, root_path + self.path, self.body) def to_json(self): + """Serializes the conflict to a JSON string.""" return json.dumps([self.conflict_type, self.path, self.body]) diff --git a/json_merger/contrib/inspirehep/author_util.py b/json_merger/contrib/inspirehep/author_util.py index e821b79..a065c74 100644 --- a/json_merger/contrib/inspirehep/author_util.py +++ b/json_merger/contrib/inspirehep/author_util.py @@ -77,16 +77,6 @@ def simple_tokenize(name): 'nonlastnames': first_names} -class AuthorIdNormalizer(object): - """Callable that normalizes an author by extracting an UID field.""" - - def __init__(self, id_field): - self.id_field = id_field - - def __call__(self, author): - return author.get(self.id_field) - - class AuthorNameNormalizer(object): """Callable that normalizes an author name given a tokenizer function.""" @@ -95,16 +85,18 @@ def __init__(self, tokenize_function, first_name_to_initial=False): """Initialize the normalizer. - :param tokenize_function: - A function that receives an author name and parses it out in the - following format: - {'lastnames': NameToken instance list, - 'nonlastnames': NameToken instance list} - :param first_names_number: - Max number of first names too keep in the normalized name. - If None, keep all first names - :param first_name_to_initial: - If set to True, all first names will be transformed into initials. + Args: + tokenize_function: + A function that receives an author name and parses it out in + the following format: + {'lastnames': NameToken instance list, + 'nonlastnames': NameToken instance list} + first_names_number: + Max number of first names to keep in the normalized name. 
+ If None, keep all first names + first_name_to_initial: + If set to True, all first names will be transformed into + initials. """ self.tokenize_function = tokenize_function @@ -125,35 +117,45 @@ def __call__(self, author): class AuthorNameDistanceCalculator(object): """Callable that calculates a distance between two author's names.""" - def __init__(self, tokenize_function, match_on_initial_penalization=0.05): + def __init__(self, tokenize_function, match_on_initial_penalization=0.05, + full_name_field='full_name'): """Initialize the distance calculator. - :param tokenize_function: - A function that receives an author name and parses it out in the - following format: - {'lastnames': NameToken instance list, - 'nonlastnames': NameToken instance list} - :param match_on_initial_penalization: - The cost value of a match between an initial and a full name - starting with the same letter. + Args: + tokenize_function: A function that receives an author name and + parses it out in the following format: + {'lastnames': NameToken instance list, + 'nonlastnames': NameToken instance list} + match_on_initial_penalization: + The cost value of a match between an initial and a full name + starting with the same letter. + full_name_field: + The field in which an author record keeps the full name. + Note: + The default match_on_initial_penalization had the best results + on a test suite based on production data. """ self.tokenize_function = tokenize_function self.match_on_initial_penalization = match_on_initial_penalization + self.name_field = full_name_field def __call__(self, author1, author2): - if 'full_name' not in author1: + # Return 1.0 on missing features. 
+ if self.name_field not in author1: return 1.0 - if 'full_name' not in author2: + if self.name_field not in author2: return 1.0 - name_a1 = unidecode(author1['full_name']) - name_a2 = unidecode(author2['full_name']) + # Normalize to ASCII + name_a1 = unidecode(author1[self.name_field]) + name_a2 = unidecode(author2[self.name_field]) tokens_a1 = self.tokenize_function(name_a1) tokens_a2 = self.tokenize_function(name_a2) tokens_a1 = tokens_a1['lastnames'] + tokens_a1['nonlastnames'] tokens_a2 = tokens_a2['lastnames'] + tokens_a2['nonlastnames'] + # Match all names by editdistance. dist_matrix = [ [token_distance(t1, t2, self.match_on_initial_penalization) for t2 in tokens_a2] for t1 in tokens_a1] @@ -167,6 +169,8 @@ def __call__(self, author1, author2): if (not isinstance(tokens_a1[idx_a1], NameInitial) or not isinstance(tokens_a2[idx_a2], NameInitial)): matched_only_initials = False + + # Johnny, D will not be equal with Donny, J if matched_only_initials: return 1.0 diff --git a/json_merger/contrib/inspirehep/comparators.py b/json_merger/contrib/inspirehep/comparators.py index c4f254e..680c29f 100644 --- a/json_merger/contrib/inspirehep/comparators.py +++ b/json_merger/contrib/inspirehep/comparators.py @@ -26,9 +26,6 @@ from json_merger.comparator import BaseComparator -from .author_util import ( - simple_tokenize, AuthorIdNormalizer, AuthorNameDistanceCalculator, - AuthorNameNormalizer) from .match import distance_function_match @@ -41,22 +38,9 @@ def process_lists(self): if self.distance_function is None: raise NotImplementedError('You need to provide a distance ' 'function') + # Get the unbound version of the distance function. 
+ dist_fn = self.__class__.__dict__['distance_function'] self.matches = set(distance_function_match(self.l1, self.l2, self.threshold, - self.distance_function, + dist_fn, self.norm_functions)) - - def equal(self, idx_l1, idx_l2): - return (idx_l1, idx_l2) in self.matches - - -class AuthorComparator(DistanceFunctionComparator): - norm_functions = [ - AuthorIdNormalizer('inspire_id'), - AuthorIdNormalizer('orcid'), - AuthorNameNormalizer(simple_tokenize), - AuthorNameNormalizer(simple_tokenize, 1), - AuthorNameNormalizer(simple_tokenize, 1, True) - ] - distance_function = AuthorNameDistanceCalculator(simple_tokenize) - threshold = 0.12 diff --git a/json_merger/dict_merger.py b/json_merger/dict_merger.py index 1b4db27..e6a1ca3 100644 --- a/json_merger/dict_merger.py +++ b/json_merger/dict_merger.py @@ -30,6 +30,7 @@ from dictdiffer import ADD, CHANGE, REMOVE, patch from dictdiffer.merge import Merger, UnresolvedConflictsException +from .config import DictMergerOps from .conflict import Conflict, ConflictType from .errors import MergeError from .nothing import NOTHING @@ -50,7 +51,7 @@ def _get_list_fields(obj, res, key_path=()): def patch_to_conflict_set(patch): - """Translate a dictdiffer conflict into a json_merger one.""" + """Translates a dictdiffer conflict into a json_merger one.""" patch_type, dotted_key, value = patch key_path = tuple(k for k in dotted_key.split('.') if k) @@ -71,19 +72,6 @@ def patch_to_conflict_set(patch): return conflicts -_OPERATIONS = [ - 'FALLBACK_KEEP_HEAD', - 'FALLBACK_KEEP_UPDATE', -] - - -class DictMergerOps(object): - pass - -for mode in _OPERATIONS: - setattr(DictMergerOps, mode, mode) - - class SkipListsMerger(object): """3-way Merger that ignores list fields.""" diff --git a/json_merger/errors.py b/json_merger/errors.py index 24872ec..a81ad47 100644 --- a/json_merger/errors.py +++ b/json_merger/errors.py @@ -26,8 +26,14 @@ class MergeError(Exception): + """Merging Error.""" def __init__(self, message, content): + """ + 
Attributes: + message: Error message. + content: List of conflicts that occured when merging. + """ super(MergeError, self).__init__(message) self.message = message self.content = content diff --git a/json_merger/graph_builder.py b/json_merger/graph_builder.py index 5f1df3a..6ea37f0 100644 --- a/json_merger/graph_builder.py +++ b/json_merger/graph_builder.py @@ -24,15 +24,19 @@ from __future__ import absolute_import, print_function +from collections import deque + import six from .comparator import DefaultComparator from .nothing import NOTHING +from .stats import ListMatchStats FIRST = 'first' class BeforeNodes(object): + """Edge in the match graph.""" def __init__(self, head_node=None, update_node=None): self.head_node = head_node @@ -43,64 +47,6 @@ def __repr__(self): self.head_node, self.update_node) -class ListMatchStats(object): - - def __init__(self, lst, root): - self.lst = lst - self.root = root - - self.in_result_idx = set() - self.not_in_result_idx = set(range(len(lst))) - self.not_in_result_root_match_idx = set() - self.root_matches = {} - - def move_to_result(self, lst_idx): - self.in_result_idx.add(lst_idx) - self.not_in_result_idx.remove(lst_idx) - - if lst_idx in self.not_in_result_root_match_idx: - self.not_in_result_root_match_idx.remove(lst_idx) - - def add_root_match(self, lst_idx, root_idx): - self.root_matches[lst_idx] = root_idx - if lst_idx in self.in_result_idx: - return - - self.not_in_result_root_match_idx.add(lst_idx) - - @property - def not_in_result_not_root_match_idx(self): - return self.not_in_result_idx.difference( - self.not_in_result_root_match_idx) - - @property - def in_result(self): - return [self.lst[e] for e in self.in_result_idx] - - @property - def not_in_result(self): - return [self.lst[e] for e in self.not_in_result_idx] - - @property - def not_in_result_root_match(self): - return [self.lst[e] for e in self.not_in_result_root_match_idx] - - @property - def not_in_result_not_root_match(self): - return [self.lst[e] for e in 
self.not_in_result_not_root_match_idx] - - @property - def not_in_result_root_match_pairs(self): - return [(self.lst[e], self.root[self.root_matches[e]]) - for e in self.not_in_result_root_match_idx] - - @property - def not_matched_root_objects(self): - matched_root_idx = set(self.root_matches.values()) - return [o for idx, o in enumerate(self.root) - if idx not in matched_root_idx] - - class ListMatchGraphBuilder(object): def __init__(self, root, head, update, sources, @@ -159,34 +105,35 @@ def _push_node(self, root_elem, head_elem, update_elem): if update_idx >= 0: self._update_idx_to_node[update_idx] = node_id - def _get_nodes(self, head_elem, update_elem): - """Get nodes to which either head_elem or update_elem point to.""" - head_idx, head_obj = head_elem - update_idx, update_obj = update_elem - res = set() - - if head_idx in self._head_idx_to_node: - res.add(self._head_idx_to_node[head_idx]) - if update_idx in self._update_idx_to_node: - res.add(self._update_idx_to_node[update_idx]) - - return res - - def _pop_node(self, node_id): - """Remove a node from the graph.""" - root_idx, head_idx, update_idx = self._node_src_indices[node_id] - del self._node_src_indices[node_id] - del self.node_data[node_id] - - if head_idx in self._head_idx_to_node: - del self._head_idx_to_node[head_idx] - if update_idx in self._update_idx_to_node: - del self._update_idx_to_node[update_idx] + def _get_matches(self, source, source_idx, source_obj): + other_two = {'head': ['root', 'update'], + 'update': ['root', 'head'], + 'root': ['head', 'update']} + indices = {'root': {}, 'head': {}, 'update': {}} + indices[source][source_idx] = source_obj + + # Start a BFS of matching elements. 
+ q = deque([(source, source_idx)]) + while q: + curr_src, curr_idx = q.popleft() + for target in other_two[curr_src]: + comparator, cmp_list = self.comparators[(target, curr_src)] + # cmp_list is either 'l1' or 'l2' + # (the parameter for the comparator class convention) + matches = comparator.get_matches(cmp_list, curr_idx) + for target_idx, target_obj in matches: + if target_idx in indices[target]: + continue + indices[target][target_idx] = target_obj + q.append((target, target_idx)) + result = {} + for lst, res_indices in indices.items(): + if not res_indices: + result[lst] = [(-1, NOTHING)] + else: + result[lst] = sorted(res_indices.items()) - def _get_matches(self, target, source, source_idx): - comparator, src_list = self.comparators[(target, source)] - matches = comparator.get_matches(src_list, source_idx) - return matches if matches else [(-1, NOTHING)] + return result['root'], result['head'], result['update'] def _add_matches(self, root_elems, head_elems, update_elems): matches = [(r, h, u) @@ -194,69 +141,41 @@ def _add_matches(self, root_elems, head_elems, update_elems): for h in head_elems for u in update_elems] if len(matches) == 1: - root_elem, head_elem, update_elem = matches[0] - node_ids = self._get_nodes(head_elem, update_elem) - # If this single match overrides a previous node entry we remove - # add this match as a multiple_match_choice and mark the node - # for removal. We will later remove the node from the graph so - # that future collisions with this node will be caught.
- if not node_ids: - self._push_node(root_elem, head_elem, update_elem) - else: - self._dirty_nodes.update(node_ids) - self.multiple_match_choice_idx.add( - (root_elem[0], head_elem[0], update_elem[0])) + self._push_node(*matches[0]) else: - match_indices = [(r[0], h[0], u[0]) for r, h, u in matches] - self.multiple_match_choice_idx.update(match_indices) + self.multiple_match_choice_idx.update([(r[0], h[0], u[0]) + for r, h, u in matches]) def _populate_nodes(self): - if 'head' in self.sources: - for head_idx, head_obj in enumerate(self.head): - head_elems = [(head_idx, head_obj)] - root_elems = self._get_matches('root', 'head', head_idx) - update_elems = self._get_matches('update', 'head', head_idx) - self._add_matches(root_elems, head_elems, update_elems) - - if 'update' in self.sources: - for update_idx, update_obj in enumerate(self.update): - # Already added this node in the graph, continue. - if update_idx in self._update_idx_to_node: - continue - - update_elems = [(update_idx, update_obj)] - root_elems = self._get_matches('root', 'update', update_idx) - head_elems = self._get_matches('head', 'update', update_idx) - self._add_matches(root_elems, head_elems, update_elems) - - for node_id in self._dirty_nodes: - self.multiple_match_choice_idx.add(self._node_src_indices[node_id]) - self._pop_node(node_id) - for r_idx, h_idx, u_idx in self.multiple_match_choice_idx: - r_obj = self.root[r_idx] if r_idx >= 0 else None - h_obj = self.head[h_idx] if h_idx >= 0 else None - u_obj = self.update[u_idx] if u_idx >= 0 else None - self.multiple_match_choices.append((r_obj, h_obj, u_obj)) - - def _build_stats(self): - for node_id, indices in self._node_src_indices.items(): - root_idx, head_idx, update_idx = indices - + for idx, obj in enumerate(self.head): + r_elems, h_elems, u_elems = self._get_matches('head', idx, obj) + if 'head' in self.sources: + self._add_matches(r_elems, h_elems, u_elems) + if len(r_elems) == 1 and r_elems[0][0] >= 0: + 
self.head_stats.add_root_match(idx, r_elems[0][0]) + + for idx, obj in enumerate(self.update): + r_elems, h_elems, u_elems = self._get_matches('update', idx, obj) + # Add the node to the graph only if not already added. + if ('update' in self.sources and + idx not in self._update_idx_to_node): + self._add_matches(r_elems, h_elems, u_elems) + if len(r_elems) == 1 and r_elems[0][0] >= 0: + self.update_stats.add_root_match(idx, r_elems[0][0]) + + # Add stats from built nodes. + for root_idx, head_idx, update_idx in self._node_src_indices.values(): if head_idx >= 0: self.head_stats.move_to_result(head_idx) if update_idx >= 0: self.update_stats.move_to_result(update_idx) - for idx in range(len(self.head)): - matches = self._get_matches('root', 'head', idx) - # Matches[0][0] is the index in the root list of the first match. - if len(matches) == 1 and matches[0][0] >= 0: - self.head_stats.add_root_match(idx, matches[0][0]) - - for idx in range(len(self.update)): - matches = self._get_matches('root', 'update', idx) - if len(matches) == 1 and matches[0][0] >= 0: - self.update_stats.add_root_match(idx, matches[0][0]) + # Move the unique multiple match indices to conflicts. + for r_idx, h_idx, u_idx in self.multiple_match_choice_idx: + r_obj = self.root[r_idx] if r_idx >= 0 else None + h_obj = self.head[h_idx] if h_idx >= 0 else None + u_obj = self.update[u_idx] if u_idx >= 0 else None + self.multiple_match_choices.append((r_obj, h_obj, u_obj)) def _get_next_node(self, source, indices): if source not in self.sources: @@ -272,7 +191,6 @@ def _get_next_node(self, source, indices): def build_graph(self): self._populate_nodes() - self._build_stats() # Link a dummy first node before the first element of the sources # lists.
diff --git a/json_merger/list_unify.py b/json_merger/list_unify.py index e5e4979..22f1d3d 100644 --- a/json_merger/list_unify.py +++ b/json_merger/list_unify.py @@ -27,27 +27,13 @@ from __future__ import absolute_import, print_function from .comparator import DefaultComparator +from .config import UnifierOps from .conflict import Conflict, ConflictType from .errors import MergeError from .graph_builder import ( ListMatchGraphBuilder, sort_cyclic_graph_best_effort, toposort ) -_OPERATIONS = [ - 'KEEP_ONLY_HEAD_ENTITIES', - 'KEEP_ONLY_UPDATE_ENTITIES', - 'KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST', - 'KEEP_UPDATE_AND_HEAD_ENTITIES_UPDATE_FIRST', - 'KEEP_UPDATE_ENTITIES_CONFLICT_ON_HEAD_DELETE', -] - - -class UnifierOps(object): - pass - -for mode in _OPERATIONS: - setattr(UnifierOps, mode, mode) - _SOURCES = { UnifierOps.KEEP_ONLY_UPDATE_ENTITIES: ['update'], UnifierOps.KEEP_ONLY_HEAD_ENTITIES: ['head'], @@ -73,7 +59,7 @@ class ListUnifier(object): def __init__(self, root, head, update, operation, comparator_cls=DefaultComparator): - if operation not in _OPERATIONS: + if operation not in UnifierOps.allowed_ops: raise ValueError('Operation %r not permitted' % operation) self.root = root diff --git a/json_merger/merger.py b/json_merger/merger.py index d65fe98..57bd3bc 100644 --- a/json_merger/merger.py +++ b/json_merger/merger.py @@ -22,29 +22,86 @@ # waive the privileges and immunities granted to it by virtue of its status # as an Intergovernmental Organization or submit itself to any jurisdiction. 
-"""Module that is able to merge JSON record objects.""" +"""Definition for JSON merger class.""" from __future__ import absolute_import, print_function import copy from .comparator import DefaultComparator -from .dict_merger import DictMergerOps, SkipListsMerger +from .dict_merger import SkipListsMerger from .errors import MergeError -from .list_unify import ListUnifier, UnifierOps +from .list_unify import ListUnifier from .utils import ( get_conf_set_for_key_path, get_dotted_key_path, get_obj_at_key_path, set_obj_at_key_path ) -PLACEHOLDER_STR = "#$PLACEHOLDER$#" +PLACEHOLDER_STR = '#$PLACEHOLDER$#' -class ListAlignMerger(object): +class Merger(object): + """Class that merges two JSON objects that share a common ancestor. + + This class treats by default all lists as being lists of entities and + offers support for matching their elements by their content, by specifing + per-field comparator classes. + """ def __init__(self, root, head, update, default_dict_merge_op, default_list_merge_op, list_merge_ops=None, comparators=None, data_lists=None): + """ + Args: + root: A common ancestor of the two objects being merged. + + head: One of the objects that is being merged. Refers to the + version that is currently in use. (e.g. a displayed database + record) + + update: The second object that is being merged. Refers to an update + that needs to be integrated with the in-use version. + + default_dict_merge_op + (:class:`json_merger.config.DictMergerOps` class attribute): + Default strategy for merging regular non list JSON values + (strings, numbers, other objects). + + default_list_merge_op + (:class:`json_merger.config.UnifierOps` class attribute): + Default strategy for merging two lists of entities. + + list_merge_ops: Defines custom strategies for merging lists of + entities. 
+ + Dict formatted as: + * keys -- a config string + * values -- a class attribute of + :class:`json_merger.config.UnifierOps` + + comparators: Defines classes used for rendering entities in list + fields as equal. + + Dict formatted as: + * keys -- a config string + * values -- a class that extends + :class:`json_merger.comparator.BaseComparator` + + data_lists: List of config strings defining the lists that are not + treated as lists of entities. + + Note: + A configuration string represents the path towards a list field in + the object separated with dots. + + Example: + Configuration strings can be: + + For ``{'lst': [{'id': 0, 'tags': ['foo', 'bar']}]}``: + + * the config string for the top level list is ``'lst'`` + * the config string for the tags lists is ``'lst.tags'`` + """ self.comparators = comparators or {} self.data_lists = set(data_lists or []) self.list_merge_ops = list_merge_ops or {} @@ -66,74 +123,112 @@ def __init__(self, root, head, update, self.aligned_update = copy.deepcopy(update) def merge(self): - self.merged_root = self._recursive_merge(self.root, self.head, - self.update) - if self.conflicts: - raise MergeError('Conflicts Occured in Merge Process', - self.conflicts) + """Populates result members. - def _merge_objects(self, root, head, update, key_path): - data_lists = get_conf_set_for_key_path(self.data_lists, key_path) - object_merger = SkipListsMerger(root, head, update, - self.default_dict_merge_op, data_lists) - try: - object_merger.merge() - except MergeError as e: - self.conflicts.extend(c.with_prefix(key_path) for c in e.content) - return object_merger + Performs the merge algorithm using the specified config and fills in + the members that provide stats about the merging procedure.
- def _build_aligned_lists_and_stats(self, list_unifier, key_path): - root_list = [] - head_list = [] - update_list = [] - for root_obj, head_obj, update_obj in list_unifier.unified: - # Cast NOTHING objects to a placeholder so we reserialize back to - # JSON if needed. - root_list.append(root_obj or PLACEHOLDER_STR) - head_list.append(head_obj or PLACEHOLDER_STR) - update_list.append(update_obj or PLACEHOLDER_STR) + Attributes: + merged_root: The result of the merge. - # If we can't set that key path a list to be merged wasn't there - # In the first place. - self.aligned_root = set_obj_at_key_path(self.aligned_root, - key_path, root_list, False) - self.aligned_head = set_obj_at_key_path(self.aligned_head, - key_path, head_list, False) - self.aligned_update = set_obj_at_key_path(self.aligned_update, - key_path, update_list, False) - self.head_stats[key_path] = list_unifier.head_stats - self.update_stats[key_path] = list_unifier.update_stats + aligned_root, aligned_head, aligned_update: Copies of root, head + and update in which all matched list entities have the same + list index for easier diff viewing. - def _unify_lists(self, root, head, update, key_path): - dotted_key_path = get_dotted_key_path(key_path, True) + head_stats, update_stats: Stats for each list field present in the + head or update objects. Instance of + :class:`json_merger.stats.ListMatchStats` - operation = self.list_merge_ops.get(dotted_key_path, - self.default_list_merge_op) - comparator_cls = self.comparators.get(dotted_key_path, - DefaultComparator) - list_unifier = ListUnifier(root, head, update, - operation, comparator_cls) - try: - list_unifier.unify() - except MergeError as e: - self.conflicts.extend(c.with_prefix(key_path) for c in e.content) + conflicts: List of :class:`json_merger.conflict.Conflict` instances + that occurred during the merge. - return list_unifier + Raises: + :class:`json_merger.errors.MergeError` : If conflicts occur during + the call.
+ + Example: + >>> from json_merger import Merger + >>> # We compare people by their name + >>> from json_merger.comparator import PrimaryKeyComparator + >>> from json_merger.config import DictMergerOps, UnifierOps + >>> from json_merger.errors import MergeError + >>> # Use this only for doctest :) + >>> from pprint import pprint as pp + >>> + >>> root = {'people': [{'name': 'Jimmy', 'age': 30}]} + >>> head = {'people': [{'name': 'Jimmy', 'age': 31}, + ... {'name': 'George'}]} + >>> update = {'people': [{'name': 'John'}, + ... {'name': 'Jimmy', 'age': 32}]} + >>> + >>> class NameComparator(PrimaryKeyComparator): + ... # Two objects are the same entity if they have the + ... # same name. + ... primary_key_fields = ['name'] + >>> m = Merger(root, head, update, + ... DictMergerOps.FALLBACK_KEEP_HEAD, + ... UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST, + ... comparators = {'people': NameComparator}) + >>> # We do a merge + >>> try: + ... m.merge() + ... except MergeError as e: + ... # Conflicts are the same thing as the exception content. + ...
assert e.content == m.conflicts + >>> # This is how the lists are aligned: + >>> pp(m.aligned_root['people'], width=60) + ['#$PLACEHOLDER$#', + {'age': 30, 'name': 'Jimmy'}, + '#$PLACEHOLDER$#'] + >>> pp(m.aligned_head['people'], width=60) + ['#$PLACEHOLDER$#', + {'age': 31, 'name': 'Jimmy'}, + {'name': 'George'}] + >>> pp(m.aligned_update['people'], width=60) + [{'name': 'John'}, + {'age': 32, 'name': 'Jimmy'}, + '#$PLACEHOLDER$#'] + >>> # This is the end result of the merge: + >>> pp(m.merged_root, width=60) + {'people': [{'name': 'John'}, + {'age': 31, 'name': 'Jimmy'}, + {'name': 'George'}]} + >>> # With some conflicts: + >>> pp(m.conflicts, width=60) + [('SET_FIELD', ('people', 1, 'age'), 32)] + >>> # And some stats: + >>> pp(m.head_stats[('people',)].in_result) + [{'age': 31, 'name': 'Jimmy'}, {'name': 'George'}] + >>> pp(m.update_stats[('people',)].not_in_result) + [] + + Note: + Even if conflicts occur, merged_root, aligned_root, aligned_head + and aligned_update are always populated by following the + strategies set for the merger instance. + """ + self.merged_root = self._recursive_merge(self.root, self.head, + self.update) + if self.conflicts: + raise MergeError('Conflicts Occurred in Merge Process', + self.conflicts) def _recursive_merge(self, root, head, update, key_path=()): - dotted_key_path = get_dotted_key_path(key_path, True) + dotted_key_path = get_dotted_key_path(key_path, filter_int_keys=True) + if (isinstance(head, list) and isinstance(update, list) and dotted_key_path not in self.data_lists): - # We are aligning bare lists so the key path is an empty tuple. - lists = [()] + # In this case we are merging two lists of objects. + lists_to_unify = [()] if not isinstance(root, list): root = [] else: + # Otherwise we merge everything but the lists using DictMergerOps.
m = self._merge_objects(root, head, update, key_path) root = m.merged_root - lists = m.skipped_lists + lists_to_unify = m.skipped_lists - for list_field in lists: + for list_field in lists_to_unify: absolute_key_path = key_path + list_field root_l = get_obj_at_key_path(root, list_field, []) @@ -155,13 +250,51 @@ def _recursive_merge(self, root, head, update, key_path=()): return root + def _merge_objects(self, root, head, update, key_path): + data_lists = get_conf_set_for_key_path(self.data_lists, key_path) + object_merger = SkipListsMerger(root, head, update, + self.default_dict_merge_op, data_lists) + try: + object_merger.merge() + except MergeError as e: + self.conflicts.extend(c.with_prefix(key_path) for c in e.content) + return object_merger -class UpdateMerger(ListAlignMerger): + def _unify_lists(self, root, head, update, key_path): + dotted_key_path = get_dotted_key_path(key_path, True) - def __init__(self, root, head, update, - list_merge_ops=None, comparators=None, data_lists=None): - super(UpdateMerger, self).__init__( - root, head, update, - DictMergerOps.FALLBACK_KEEP_HEAD, - UnifierOps.KEEP_ONLY_UPDATE_ENTITIES, - list_merge_ops, comparators, data_lists) + operation = self.list_merge_ops.get(dotted_key_path, + self.default_list_merge_op) + comparator_cls = self.comparators.get(dotted_key_path, + DefaultComparator) + list_unifier = ListUnifier(root, head, update, + operation, comparator_cls) + try: + list_unifier.unify() + except MergeError as e: + self.conflicts.extend(c.with_prefix(key_path) for c in e.content) + + return list_unifier + + def _build_aligned_lists_and_stats(self, list_unifier, key_path): + root_list = [] + head_list = [] + update_list = [] + for root_obj, head_obj, update_obj in list_unifier.unified: + # Cast NOTHING objects to a placeholder so we reserialize back to + # JSON if needed. 
+ root_list.append(root_obj or PLACEHOLDER_STR) + head_list.append(head_obj or PLACEHOLDER_STR) + update_list.append(update_obj or PLACEHOLDER_STR) + + # Try to put back the list if the key path existed in the first place. + self.aligned_root = set_obj_at_key_path(self.aligned_root, + key_path, root_list, False) + self.aligned_head = set_obj_at_key_path(self.aligned_head, + key_path, head_list, False) + self.aligned_update = set_obj_at_key_path(self.aligned_update, + key_path, update_list, False) + + # Also copy over the stats. + self.head_stats[key_path] = list_unifier.head_stats + self.update_stats[key_path] = list_unifier.update_stats diff --git a/json_merger/stats.py b/json_merger/stats.py new file mode 100644 index 0000000..a0e930d --- /dev/null +++ b/json_merger/stats.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Inspirehep. +# Copyright (C) 2016 CERN. +# +# Inspirehep is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# Inspirehep is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Inspirehep; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. 
+ +from __future__ import absolute_import, print_function + + +class ListMatchStats(object): + """Class for holding list entity matching stats.""" + + def __init__(self, lst, root): + """ + Args: + lst: The list of elements that needs to be matched. + root: The ancestor of the list of elements that needs to be + matched. + + Attributes: + in_result_idx: Indices of elements in lst that are present in the + end result. + + in_result: Elements in lst that are present in the end result. + + not_in_result_idx: Indices of elements in lst that are not present + in the end result. + + not_in_result: Elements in lst that are not present in the end + result. + + not_in_result_root_match_idx: Indices of elements that are not in + the end result but were matched with root elements. + + not_in_result_root_match: Elements that are not in the end result + but were matched with root elements. + + not_in_result_not_root_match_idx: Indices of elements that are not + in the end result and were not matched with any root elements. + + not_in_result_not_root_match: Elements that are not in the end + result and were not matched with any root elements. + + not_in_result_root_match_pairs: Pairs of (lst, root) elements + that are not in the end result but were matched. 
+ """ + self.lst = lst + self.root = root + + self.in_result_idx = set() + self.not_in_result_root_match_idx = set() + self.root_matches = {} + + def move_to_result(self, lst_idx): + """Moves element from lst available at lst_idx.""" + self.in_result_idx.add(lst_idx) + + if lst_idx in self.not_in_result_root_match_idx: + self.not_in_result_root_match_idx.remove(lst_idx) + + def add_root_match(self, lst_idx, root_idx): + """Adds a match for the elements avaialble at lst_idx and root_idx.""" + self.root_matches[lst_idx] = root_idx + if lst_idx in self.in_result_idx: + return + + self.not_in_result_root_match_idx.add(lst_idx) + + @property + def not_in_result_idx(self): + return set(range(len(self.lst))).difference(self.in_result_idx) + + @property + def not_in_result_not_root_match_idx(self): + return self.not_in_result_idx.difference( + self.not_in_result_root_match_idx) + + @property + def in_result(self): + return [self.lst[e] for e in self.in_result_idx] + + @property + def not_in_result(self): + return [self.lst[e] for e in self.not_in_result_idx] + + @property + def not_in_result_root_match(self): + return [self.lst[e] for e in self.not_in_result_root_match_idx] + + @property + def not_in_result_not_root_match(self): + return [self.lst[e] for e in self.not_in_result_not_root_match_idx] + + @property + def not_in_result_root_match_pairs(self): + return [(self.lst[e], self.root[self.root_matches[e]]) + for e in self.not_in_result_root_match_idx] + + @property + def not_matched_root_objects(self): + matched_root_idx = set(self.root_matches.values()) + return [o for idx, o in enumerate(self.root) + if idx not in matched_root_idx] diff --git a/run-tests.sh b/run-tests.sh index f0ecc42..243bdef 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -23,9 +23,9 @@ # as an Intergovernmental Organization or submit itself to any jurisdiction. 
-#pydocstyle json_merger && \ isort -rc -c -df -m5 **/*.py && \ check-manifest --ignore ".travis-*" && \ +rm -rf docs/_build/ && \ sphinx-build -qnNW docs docs/_build/html && \ python setup.py test && \ sphinx-build -qnNW -b doctest docs docs/_build/doctest diff --git a/setup.py b/setup.py index 8d5b990..4637836 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ # waive the privileges and immunities granted to it by virtue of its status # as an Intergovernmental Organization or submit itself to any jurisdiction. -"""Invenio module that is able to merge json record objects.""" +"""Python module that is able to merge json record objects.""" import os import sys diff --git a/tests/acceptance/test_merger_update.py b/tests/acceptance/test_merger_update.py index 0afc64e..e0a4128 100644 --- a/tests/acceptance/test_merger_update.py +++ b/tests/acceptance/test_merger_update.py @@ -29,8 +29,13 @@ import pytest -from json_merger import UnifierOps, UpdateMerger, MergeError -from json_merger.contrib.inspirehep.comparators import AuthorComparator +from json_merger import Merger +from json_merger.config import DictMergerOps, UnifierOps +from json_merger.errors import MergeError +from json_merger.contrib.inspirehep.comparators import ( + DistanceFunctionComparator) +from json_merger.contrib.inspirehep.author_util import ( + simple_tokenize, AuthorNameDistanceCalculator, AuthorNameNormalizer) from json_merger.comparator import PrimaryKeyComparator from json_merger.conflict import Conflict, ConflictType @@ -46,11 +51,25 @@ class AffiliationComparator(PrimaryKeyComparator): primary_key_fields = ['value'] +class AuthorComparator(DistanceFunctionComparator): + norm_functions = [ + # Better hints can be given by normalizing by primary key, + # (e.g. recid, orcid, ...) + # but this type of normalization is not implemented in contrib. 
+ AuthorNameNormalizer(simple_tokenize), + AuthorNameNormalizer(simple_tokenize, 1), + AuthorNameNormalizer(simple_tokenize, 1, True) + ] + distance_function = AuthorNameDistanceCalculator(simple_tokenize) + threshold = 0.12 + + COMPARATORS = { 'authors': AuthorComparator, 'authors.affiliations': AffiliationComparator, 'titles': TitleComparator } + LIST_MERGE_OPS = { 'titles': UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST, 'authors.affiliations': UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST @@ -82,9 +101,11 @@ def _deserialize_conflict(conflict_type, path, body): 'title_change']) def test_author_typo_scenarios(update_fixture_loader, scenario): root, head, update, exp, desc = update_fixture_loader.load_test(scenario) - merger = UpdateMerger(root, head, update, - comparators=COMPARATORS, - list_merge_ops=LIST_MERGE_OPS) + merger = Merger(root, head, update, + DictMergerOps.FALLBACK_KEEP_HEAD, + UnifierOps.KEEP_ONLY_UPDATE_ENTITIES, + comparators=COMPARATORS, + list_merge_ops=LIST_MERGE_OPS) if exp.get('conflicts'): with pytest.raises(MergeError) as excinfo: merger.merge() diff --git a/tests/fixtures/enrich_scenarios/description.txt b/tests/fixtures/enrich_scenarios/description.txt deleted file mode 100644 index 2e92fe6..0000000 --- a/tests/fixtures/enrich_scenarios/description.txt +++ /dev/null @@ -1,15 +0,0 @@ -In this suite we test the following workflow: -* A record harvested from source "A" is inserted into the database. -* The database record is annotaded via external sources (e.g. Catalogers). -* We receive an update from source "B" and we only want to add extra fields - while not losing any of the data added by "A" as source of truth. - -The expected behavior is: -* Record order in list fields is given the source of truth record. - Update won't perform any: - * Additions - * Reorderings - * _Deletions_ -* Any field present in "A" will be locked to the value added in "A". -* Any field addition added in "B" will be present in the end result. 
-* Any field change that is not present in "A" will be updated by the "B" update. diff --git a/tests/unit/test_comparator.py b/tests/unit/test_comparator.py index 263bfb0..9478d6b 100644 --- a/tests/unit/test_comparator.py +++ b/tests/unit/test_comparator.py @@ -37,12 +37,9 @@ class MyComp(PrimaryKeyComparator): lst = [{'id': 0}, {'id': 1}, {'f': {'id': 0}}, {'f': {'id': 1}}] inst = MyComp(lst, lst) - for idx1 in range(len(lst)): - for idx2 in range(len(lst)): - if idx1 == idx2: - assert inst.equal(idx1, idx2) - else: - assert not inst.equal(idx1, idx2) + for i, obj in enumerate(lst): + assert inst.get_matches('l1', i) == [(i, obj)] + assert inst.get_matches('l2', i) == [(i, obj)] def test_list_of_primary_keys(): @@ -59,11 +56,14 @@ class MyComp(PrimaryKeyComparator): inst = MyComp(lst1, lst2) - assert not inst.equal(0, 0) - assert not inst.equal(1, 1) - assert not inst.equal(2, 3) + assert not inst.get_matches('l1', 0) + assert not inst.get_matches('l1', 1) + assert not inst.get_matches('l2', 0) + assert not inst.get_matches('l2', 1) + assert not inst.get_matches('l2', 3) - assert inst.equal(2, 2) + assert inst.get_matches('l1', 2) == [(2, lst2[2])] + assert inst.get_matches('l2', 2) == [(2, lst1[2])] def test_list_of_primary_keys_normalization(): @@ -81,8 +81,11 @@ class MyComp(PrimaryKeyComparator): inst = MyComp(lst1, lst2) - assert not inst.equal(0, 0) - assert not inst.equal(1, 1) - assert not inst.equal(2, 3) + assert not inst.get_matches('l1', 0) + assert not inst.get_matches('l1', 1) + assert not inst.get_matches('l2', 0) + assert not inst.get_matches('l2', 1) + assert not inst.get_matches('l2', 3) - assert inst.equal(2, 2) + assert inst.get_matches('l1', 2) == [(2, lst2[2])] + assert inst.get_matches('l2', 2) == [(2, lst1[2])] diff --git a/tests/unit/test_dict_merger.py b/tests/unit/test_dict_merger.py index 3e1727a..a4d5280 100644 --- a/tests/unit/test_dict_merger.py +++ b/tests/unit/test_dict_merger.py @@ -28,8 +28,9 @@ import pytest +from 
json_merger.config import DictMergerOps from json_merger.conflict import Conflict, ConflictType -from json_merger.dict_merger import DictMergerOps, SkipListsMerger +from json_merger.dict_merger import SkipListsMerger from json_merger.errors import MergeError from json_merger.nothing import NOTHING diff --git a/tests/unit/test_list_align.py b/tests/unit/test_list_align.py index 4909701..edcabc7 100644 --- a/tests/unit/test_list_align.py +++ b/tests/unit/test_list_align.py @@ -29,9 +29,11 @@ import pytest +from json_merger.config import UnifierOps from json_merger.conflict import ConflictType +from json_merger.comparator import PrimaryKeyComparator from json_merger.errors import MergeError -from json_merger.list_unify import ListUnifier, UnifierOps +from json_merger.list_unify import ListUnifier from json_merger.nothing import NOTHING @@ -223,3 +225,27 @@ def test_stats(): assert sorted(u.update_stats.not_in_result_root_match) == [] assert sorted(u.update_stats.not_in_result_not_root_match) == [] assert sorted(u.update_stats.not_matched_root_objects) == [2, 10] + + +def test_transitive_equality(): + class Comp(PrimaryKeyComparator): + primary_key_fields = ['id0', 'id1'] + + only0 = {'id0': 0} + only1 = {'id1': 1} + both = {'id0': 0, 'id1': 1} + + u = ListUnifier([only0], [both], [only1], + UnifierOps.KEEP_ONLY_UPDATE_ENTITIES, Comp) + u.unify() + assert u.unified == [(only0, both, only1)] + + u = ListUnifier([only0], [only1], [both], + UnifierOps.KEEP_ONLY_HEAD_ENTITIES, Comp) + u.unify() + assert u.unified == [(only0, only1, both)] + + u = ListUnifier([only0], [only1], [both], + UnifierOps.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST, Comp) + u.unify() + assert u.unified == [(only0, only1, both)] diff --git a/tests/unit/test_merger.py b/tests/unit/test_merger.py index cb346d4..1012134 100644 --- a/tests/unit/test_merger.py +++ b/tests/unit/test_merger.py @@ -31,11 +31,10 @@ import pytest +from json_merger.config import DictMergerOps, UnifierOps from json_merger.conflict 
import Conflict, ConflictType -from json_merger.dict_merger import DictMergerOps from json_merger.errors import MergeError -from json_merger.list_unify import UnifierOps -from json_merger.merger import ListAlignMerger +from json_merger.merger import Merger def test_merge_bare_int_lists(): @@ -43,9 +42,9 @@ def test_merge_bare_int_lists(): h = [1, 2, 3, 4] u = [1, 2, 5] - m = ListAlignMerger(r, h, u, - DictMergerOps.FALLBACK_KEEP_HEAD, - UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) + m = Merger(r, h, u, + DictMergerOps.FALLBACK_KEEP_HEAD, + UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) m.merge() assert m.merged_root == [1, 2, 5] @@ -55,9 +54,9 @@ def test_merge_bare_str_lists(): h = ['1', '2', '3', '4'] u = ['1', '2', '5'] - m = ListAlignMerger(r, h, u, - DictMergerOps.FALLBACK_KEEP_HEAD, - UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) + m = Merger(r, h, u, + DictMergerOps.FALLBACK_KEEP_HEAD, + UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) m.merge() assert m.merged_root == ['1', '2', '5'] @@ -67,9 +66,9 @@ def test_merge_nested_lists(): h = [[1], [2], [3], [4]] u = [[1], [2], [5]] - m = ListAlignMerger(r, h, u, - DictMergerOps.FALLBACK_KEEP_HEAD, - UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) + m = Merger(r, h, u, + DictMergerOps.FALLBACK_KEEP_HEAD, + UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) m.merge() assert m.merged_root == [[1], [2], [5]] @@ -80,9 +79,9 @@ def test_merge_root_is_not_list(): h = [[1], [2, 3], [5]] u = [[1], [2, 3], [5]] - m = ListAlignMerger(r, h, u, - DictMergerOps.FALLBACK_KEEP_HEAD, - UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) + m = Merger(r, h, u, + DictMergerOps.FALLBACK_KEEP_HEAD, + UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) m.merge() # Here the lists are aligned as entities and lists of entities. 
assert m.merged_root == [[1], [2, 3], [5]] @@ -93,9 +92,9 @@ def test_merge_list_with_string(): h = [1, 2, 3] u = 'a given string' - m = ListAlignMerger(r, h, u, - DictMergerOps.FALLBACK_KEEP_HEAD, - UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) + m = Merger(r, h, u, + DictMergerOps.FALLBACK_KEEP_HEAD, + UnifierOps.KEEP_ONLY_UPDATE_ENTITIES) with pytest.raises(MergeError) as excinfo: m.merge()