Skip to content

Commit

Permalink
global: add pre-commit with ruff
Browse files Browse the repository at this point in the history
  • Loading branch information
PascalEgn committed Aug 6, 2024
1 parent 0f8ca9f commit fce85a7
Show file tree
Hide file tree
Showing 26 changed files with 904 additions and 666 deletions.
2 changes: 1 addition & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[run]
relative_files = True
relative_files = True
1 change: 0 additions & 1 deletion .github/workflows/push-master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,3 @@ jobs:
needs: [python2_tests, python3_tests]
uses: ./.github/workflows/publish.yml
secrets: inherit

4 changes: 2 additions & 2 deletions .github/workflows/test-python-2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ jobs:
run: |
${{ matrix.python }} --version
${{ matrix.pip }} freeze
- name: Run flake8 checks
shell: bash
run: flake8 inspire_json_merger tests

- name: Run tests
run: |
./run-tests.sh
2 changes: 1 addition & 1 deletion .github/workflows/test-python-3.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Run flake8 checks
shell: bash
run: flake8 inspire_json_merger tests

- name: Run tests
run: |
./run-tests.sh
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- id: fix-byte-order-marker
- id: mixed-line-ending
- id: name-tests-test
args: [ --pytest-test-first ]
exclude: '^(?!factories/)'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.5.6
hooks:
- id: ruff
args: [ --fix ]
52 changes: 38 additions & 14 deletions inspire_json_merger/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@

from __future__ import absolute_import, division, print_function

from inspire_utils.record import get_value
from inspire_utils.helpers import force_list
from inspire_utils.record import get_value
from json_merger.merger import MergeError, Merger

from inspire_json_merger.config import (
Expand All @@ -35,7 +35,6 @@
PublisherOnPublisherOperations,
)
from inspire_json_merger.postprocess import postprocess_results

from inspire_json_merger.utils import filter_conflicts, filter_records


Expand All @@ -61,9 +60,13 @@ def merge(root, head, update, head_source=None, configuration=None):
if not configuration:
configuration = get_configuration(head, update, head_source)
conflicts = []
root, head, update = filter_records(root, head, update, filters=configuration.pre_filters)
root, head, update = filter_records(
root, head, update, filters=configuration.pre_filters
)
merger = Merger(
root=root, head=head, update=update,
root=root,
head=head,
update=update,
default_dict_merge_op=configuration.default_dict_merge_op,
default_list_merge_op=configuration.default_list_merge_op,
list_dict_ops=configuration.list_dict_ops,
Expand Down Expand Up @@ -95,7 +98,7 @@ def get_configuration(head, update, head_source=None):
MergerConfigurationOperations: an object containing
the rules needed to merge HEAD and UPDATE
"""
head_source = (head_source or get_head_source(head))
head_source = head_source or get_head_source(head)
update_source = get_acquisition_source(update)

if is_manual_merge(head, update):
Expand All @@ -118,12 +121,15 @@ def get_configuration(head, update, head_source=None):

def get_head_source(json_obj):
def no_freetext_in_publication_info(obj):
return 'publication_info' in obj and \
any('pubinfo_freetext' not in pubinfo for pubinfo in obj.get('publication_info'))
return 'publication_info' in obj and any(
'pubinfo_freetext' not in pubinfo for pubinfo in obj.get('publication_info')
)

def no_arxiv_in_dois(obj):
return 'dois' in obj and \
any(source.lower() != 'arxiv' for source in force_list(get_value(obj, 'dois.source')))
return 'dois' in obj and any(
source.lower() != 'arxiv'
for source in force_list(get_value(obj, 'dois.source'))
)

if no_freetext_in_publication_info(json_obj) or no_arxiv_in_dois(json_obj):
return 'publisher'
Expand All @@ -141,15 +147,33 @@ def get_acquisition_source(json_obj):


def is_manual_merge(head, update):
return ('control_number' in update and 'control_number' in head and update['control_number'] != head['control_number'])
return (
'control_number' in update
and 'control_number' in head
and update['control_number'] != head['control_number']
)


def is_erratum(update):
erratum_keywords = {"erratum", "corrigendum", "publisher's note", "publisher correction"}
erratum_keywords = {
"erratum",
"corrigendum",
"publisher's note",
"publisher correction",
}
journal_titles_list = get_value(update, "titles.title", [])
journal_titles_string = " ".join(journal_titles_list).lower()
title_contains_erratum_keyword = any([keyword in journal_titles_string for keyword in erratum_keywords])
title_starts_with_correction_to = any(journal_title.lower().startswith('correction to:') for journal_title in journal_titles_list)
title_contains_erratum_keyword = any(
[keyword in journal_titles_string for keyword in erratum_keywords]
)
title_starts_with_correction_to = any(
journal_title.lower().startswith('correction to:')
for journal_title in journal_titles_list
)
erratum_in_dois_material = 'erratum' in get_value(update, "dois.material", [])
if title_contains_erratum_keyword or title_starts_with_correction_to or erratum_in_dois_material:
if (
title_contains_erratum_keyword
or title_starts_with_correction_to
or erratum_in_dois_material
):
return True
46 changes: 26 additions & 20 deletions inspire_json_merger/comparators.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@
NameInitial,
NameToken,
)
from json_merger.contrib.inspirehep.comparators import \
DistanceFunctionComparator
from json_merger.contrib.inspirehep.comparators import DistanceFunctionComparator

from inspire_json_merger.utils import scan_author_string_for_phrases

Expand All @@ -53,6 +52,7 @@ def author_tokenize(name):

class IDNormalizer(object):
"""Callable that can be used to normalize by a given id for authors."""

def __init__(self, id_type):
self.id_type = id_type

Expand All @@ -75,19 +75,25 @@ class AuthorComparator(DistanceFunctionComparator):
AuthorNameNormalizer(author_tokenize, asciify=True),
AuthorNameNormalizer(author_tokenize, first_names_number=1),
AuthorNameNormalizer(author_tokenize, first_names_number=1, asciify=True),
AuthorNameNormalizer(author_tokenize, first_names_number=1, first_name_to_initial=True),
AuthorNameNormalizer(author_tokenize, first_names_number=1, first_name_to_initial=True, asciify=True),
AuthorNameNormalizer(
author_tokenize, first_names_number=1, first_name_to_initial=True
),
AuthorNameNormalizer(
author_tokenize,
first_names_number=1,
first_name_to_initial=True,
asciify=True,
),
]


def get_pk_comparator(primary_key_fields, normalization_functions=None):
class Ret(PrimaryKeyComparator):
__doc__ = (
'primary_key_fields:%s, normalization_functions:%s' % (
primary_key_fields,
normalization_functions,
)
__doc__ = 'primary_key_fields:%s, normalization_functions:%s' % (
primary_key_fields,
normalization_functions,
)

Ret.primary_key_fields = primary_key_fields
Ret.normalization_functions = normalization_functions or {}
return Ret
Expand All @@ -112,19 +118,19 @@ class Ret(PrimaryKeyComparator):
SchemaValueComparator = get_pk_comparator([['schema', 'value']])


PublicationInfoComparator = get_pk_comparator([
['journal_title', 'journal_volume', 'material', 'cnum']
])
PublicationInfoComparator = get_pk_comparator(
[['journal_title', 'journal_volume', 'material', 'cnum']]
)

FigureComparator = get_pk_comparator([
['key', 'material']
])
FigureComparator = get_pk_comparator([['key', 'material']])

DocumentComparator = get_pk_comparator([
['source', 'description', 'material'],
['source', 'fulltext', 'material'],
['source', 'original_url', 'material'],
])
DocumentComparator = get_pk_comparator(
[
['source', 'description', 'material'],
['source', 'fulltext', 'material'],
['source', 'original_url', 'material'],
]
)

PersistentIdentifierComparator = get_pk_comparator(['value', 'material'])

Expand Down
42 changes: 18 additions & 24 deletions inspire_json_merger/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,21 @@

from __future__ import absolute_import, division, print_function

from json_merger.config import DictMergerOps as D, UnifierOps as U
from json_merger.config import DictMergerOps as D
from json_merger.config import UnifierOps as U

from inspire_json_merger.comparators import COMPARATORS, GROBID_ON_ARXIV_COMPARATORS
from inspire_json_merger.pre_filters import (
clean_root_for_acquisition_source,
filter_curated_references,
filter_documents_same_source,
filter_figures_same_source,
filter_curated_references,
filter_publisher_references,
update_authors_with_ordering_info,
remove_references_from_update,
clean_root_for_acquisition_source,
remove_root,
update_authors_with_ordering_info,
update_material,
remove_root
)
from .comparators import COMPARATORS, GROBID_ON_ARXIV_COMPARATORS

"""
This module provides different sets of rules that `inspire_json_merge`
Expand All @@ -58,7 +59,7 @@ class ArxivOnArxivOperations(MergerConfigurationOperations):
filter_documents_same_source,
filter_figures_same_source,
filter_curated_references,
update_authors_with_ordering_info
update_authors_with_ordering_info,
]
conflict_filters = [
'_collections',
Expand Down Expand Up @@ -104,7 +105,8 @@ class ArxivOnArxivOperations(MergerConfigurationOperations):
'persistent_identifiers': D.FALLBACK_KEEP_HEAD,
'preprint_date': D.FALLBACK_KEEP_HEAD,
'publication_info': D.FALLBACK_KEEP_HEAD,
# Fields bellow are merged and conflicts are ignored, so for those fields we stay with previous merge behaviour.
# Fields below are merged and conflicts are ignored,
# so for those fields we stay with previous merge behaviour.
'abstracts': D.FALLBACK_KEEP_UPDATE,
'acquisition_source': D.FALLBACK_KEEP_UPDATE,
'arxiv_eprints': D.FALLBACK_KEEP_UPDATE,
Expand All @@ -115,7 +117,6 @@ class ArxivOnArxivOperations(MergerConfigurationOperations):
'license': D.FALLBACK_KEEP_UPDATE,
'number_of_pages': D.FALLBACK_KEEP_UPDATE,
'public_notes': D.FALLBACK_KEEP_UPDATE,

}


Expand All @@ -126,7 +127,7 @@ class ArxivOnPublisherOperations(MergerConfigurationOperations):
filter_figures_same_source,
filter_publisher_references,
update_authors_with_ordering_info,
clean_root_for_acquisition_source
clean_root_for_acquisition_source,
]
default_list_merge_op = U.KEEP_ONLY_HEAD_ENTITIES
conflict_filters = [
Expand Down Expand Up @@ -205,7 +206,7 @@ class PublisherOnArxivOperations(MergerConfigurationOperations):
filter_figures_same_source,
filter_curated_references,
update_authors_with_ordering_info,
clean_root_for_acquisition_source
clean_root_for_acquisition_source,
]
conflict_filters = [
'_collections',
Expand Down Expand Up @@ -255,7 +256,7 @@ class PublisherOnPublisherOperations(MergerConfigurationOperations):
filter_figures_same_source,
filter_curated_references,
update_authors_with_ordering_info,
clean_root_for_acquisition_source
clean_root_for_acquisition_source,
]
conflict_filters = [
'_collections',
Expand Down Expand Up @@ -294,7 +295,8 @@ class PublisherOnPublisherOperations(MergerConfigurationOperations):
'curated': D.FALLBACK_KEEP_HEAD,
'preprint_date': D.FALLBACK_KEEP_HEAD,
'report_numbers': D.FALLBACK_KEEP_HEAD,
# Fields bellow are merged and conflicts are ignored, so for those fields we stay with previous merge behaviour.
# Fields below are merged and conflicts are ignored,
# so for those fields we stay with previous merge behaviour.
'_files': D.FALLBACK_KEEP_UPDATE,
'acquisition_source': D.FALLBACK_KEEP_UPDATE,
'authors.raw_affiliations': D.FALLBACK_KEEP_UPDATE,
Expand All @@ -310,23 +312,15 @@ class PublisherOnPublisherOperations(MergerConfigurationOperations):
class GrobidOnArxivAuthorsOperations(MergerConfigurationOperations):
default_list_merge_op = U.KEEP_ONLY_HEAD_ENTITIES
default_dict_merge_op = D.FALLBACK_KEEP_HEAD
list_dict_ops = {
'authors.raw_affiliations': D.FALLBACK_KEEP_UPDATE
}
list_merge_ops = {
'authors.raw_affiliations': U.KEEP_ONLY_UPDATE_ENTITIES
}
list_dict_ops = {'authors.raw_affiliations': D.FALLBACK_KEEP_UPDATE}
list_merge_ops = {'authors.raw_affiliations': U.KEEP_ONLY_UPDATE_ENTITIES}
comparators = GROBID_ON_ARXIV_COMPARATORS
conflict_filters = ["authors.full_name"]


class ErratumOnPublisherOperations(MergerConfigurationOperations):
comparators = COMPARATORS
pre_filters = [
update_material,
update_authors_with_ordering_info,
remove_root
]
pre_filters = [update_material, update_authors_with_ordering_info, remove_root]
default_list_merge_op = U.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST
default_dict_merge_op = D.FALLBACK_KEEP_HEAD
list_merge_ops = {
Expand Down
Loading

0 comments on commit fce85a7

Please sign in to comment.