Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LinkML based writer support, with explicit definitions of multiple writers #148

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/string-linkml-writer/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ingest_title: 'String DB'
ingest_url: 'https://string-db.org'
description: 'STRING: functional protein association networks'
rights: 'https://string-db.org/cgi/access.pl?footer_active_subpage=licensing'
24 changes: 24 additions & 0 deletions examples/string-linkml-writer/protein-links-detailed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import uuid

from biolink_model.datamodel.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Gene

from koza.cli_utils import get_koza_app

# Transform STRING protein-links rows into Biolink Gene nodes and
# gene-to-gene interaction edges, routing them to separate named writers.
koza_app = get_koza_app('protein-links-detailed')
koza_map = koza_app.get_map('entrez-2-string')

while (row := koza_app.get_row()) is not None:
    # Map STRING protein identifiers to NCBI gene ids via the entrez-2-string map.
    gene_a = Gene(id="NCBIGene:" + koza_map[row["protein1"]]["entrez"])
    gene_b = Gene(id="NCBIGene:" + koza_map[row["protein2"]]["entrez"])

    pairwise_gene_to_gene_interaction = PairwiseGeneToGeneInteraction(
        # NOTE(review): uuid1 embeds host MAC/timestamp; uuid4 is the usual
        # choice for opaque ids -- confirm before changing
        id="uuid:" + str(uuid.uuid1()),
        subject=gene_a.id,
        object=gene_b.id,
        predicate="biolink:interacts_with",
        knowledge_level="not_provided",
        agent_type="not_provided",
    )
    # Route nodes and edges to their dedicated writers (declared under
    # 'writers' in protein-links-detailed.yaml).
    koza_app.write(gene_a, gene_b, writer="nodes")
    koza_app.write(pairwise_gene_to_gene_interaction, writer="edges")
    # Removed stray no-arg koza_app.write(): it sent an empty entity tuple to
    # the default writer on every row -- leftover from before named writers.
58 changes: 58 additions & 0 deletions examples/string-linkml-writer/protein-links-detailed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
name: 'protein-links-detailed'

delimiter: ' '

files:
- './examples/data/string.tsv'
- './examples/data/string2.tsv'

metadata: !include './examples/string-linkml-writer/metadata.yaml'

columns:
- 'protein1'
- 'protein2'
- 'neighborhood'
- 'fusion'
- 'cooccurence'
- 'coexpression'
- 'experimental'
- 'database'
- 'textmining'
- 'combined_score' : 'int'

filters:
- inclusion: 'include'
column: 'combined_score'
filter_code: 'lt'
value: 700

depends_on:
- './examples/maps/entrez-2-string.yaml'

writers:
"nodes":
filename: 'protein_links_nodes.tsv'
linkml_schema: 'biolink_model.schema:biolink_model.yaml'
classes:
- 'Gene'
"edges":
filename: 'protein_links_edges.tsv'
linkml_schema: 'biolink_model.schema:biolink_model.yaml'
classes:
- 'PairwiseGeneToGeneInteraction'

#node_properties:
# - 'id'
# - 'category'
# - 'provided_by'
#
#edge_properties:
# - 'id'
# - 'subject'
# - 'predicate'
# - 'object'
# - 'category'
# - 'relation'
# - 'provided_by'

transform_mode: 'loop'
17 changes: 16 additions & 1 deletion src/koza/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import yaml

from linkml.validator import validate
from openpyxl.styles.builtins import output
from pydantic import ValidationError
from sssom.cli import output_format_option

from koza.converter.kgx_converter import KGXConverter
from koza.utils.exceptions import MapItemException, NextRowException
Expand Down Expand Up @@ -42,6 +44,7 @@ def __init__(
self._map_cache: Dict[str, Dict] = {}
self.curie_cleaner: CurieCleaner = CurieCleaner()
self.writer: KozaWriter = self._get_writer()
self.writers: Dict[str,KozaWriter] = self._get_writers()
self.logger = logger
self.outfiles = []
if hasattr(self.writer, 'nodes_file_name'):
Expand Down Expand Up @@ -148,7 +151,7 @@ def next_row():
"""
raise NextRowException

def write(self, *entities):
def write(self, *entities, writer=None):
# If a schema/validator is defined, validate before writing
# if self.validate:
if hasattr(self, 'schema'):
Expand All @@ -170,6 +173,18 @@ def write(self, *entities):

self.writer.write(entities)

def _get_writers(self) -> Dict[str, KozaWriter]:
    """Build the named writers declared under ``writers`` in the source config.

    Returns a mapping of writer name -> KozaWriter instance. Currently a
    stub: named-writer construction is not implemented yet, so this always
    returns an empty dict and ``write(..., writer=...)`` has nothing to
    dispatch to.

    TODO: instantiate one writer per entry in ``self.source.config.writers``
    (filename, linkml_schema, classes), rooted at ``self.output_dir``.
    """
    return {}
def _get_writer(self) -> Union[TSVWriter, JSONLWriter]:
writer_params = [
self.output_dir,
Expand Down
87 changes: 87 additions & 0 deletions src/koza/io/writer/linkml_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import json
from pathlib import Path
from typing import Union, List
from linkml_runtime import SchemaView
from linkml_runtime.utils.formatutils import camelcase, uncamelcase, underscore

from koza.io.utils import build_export_row
from koza.io.writer.writer import KozaWriter
from koza.model.config.source_config import OutputFormat
from koza.model.config.sssom_config import SSSOMConfig


class LinkMLWriter(KozaWriter):
    """Write instances of LinkML-schema classes to a TSV or JSONL file.

    Columns are the induced slots of the configured classes. Rows are
    buffered in memory and flushed on ``finalize()`` so the header can be
    restricted to slots that were actually used.
    """

    def __init__(self,
                 output_dir: Union[str, Path],
                 filename: str,
                 schemaview: SchemaView,
                 class_names: List[str],
                 sssom_config: SSSOMConfig = None
                 ):
        self.fh = open(Path(output_dir, filename), 'w')
        self.sv = schemaview
        self.slots = self.get_slot_names(class_names)
        self.sssom_config = sssom_config
        self.rows = []
        self.used_slots = set()
        # tsv vs jsonl is inferred from the filename suffix
        self.output_format = self.get_output_format(filename)
        # TODO: pass delimiter and list_delimiter as arguments
        # (was assigned twice; collapsed to a single assignment)
        self.delimiter = "\t"
        self.list_delimiter = "|"

    def write(self, record):
        """Buffer one record (a pydantic instance) for output at finalize()."""
        # TODO: add assertion about the class of record?
        export_row = build_export_row(record.dict(), list_delimiter=self.list_delimiter)
        self.rows.append(export_row)
        self.used_slots.update(export_row.keys())

    def finalize(self):
        """Write header (tsv only) and all buffered rows, then close the file."""
        # todo: sort the slots in an external function that looks at the schema,
        # applies sensible defaults about identifier, type designator & label slots, etc
        ordered_slots = self.sort_slots(self.used_slots)
        if self.output_format == OutputFormat.tsv:
            # write the header
            self.fh.write(self.delimiter.join(ordered_slots) + "\n")
        for export_row in self.rows:
            if self.output_format == OutputFormat.tsv:
                # BUGFIX: missing slots must serialize as "" -- str.join raises
                # TypeError when handed None values.
                ordered_values = [export_row.get(slot, "") for slot in ordered_slots]
                self.fh.write(self.delimiter.join(ordered_values) + "\n")
            elif self.output_format == OutputFormat.jsonl:
                self.fh.write(json.dumps(export_row) + "\n")

        self.fh.close()

    def sort_slots(self, slots: List[str]) -> List[str]:
        # TODO: generalize this a little more, at least biolink vs sssom, also try using rank
        # sort the slots with a specific order for some slots and the rest alphabetically
        specific_order = ['id', 'category', 'subject', 'predicate', 'object']
        ordered_slots = [slot for slot in specific_order if slot in slots]
        remaining_slots = sorted(set(slots) - set(specific_order))
        return ordered_slots + remaining_slots

    def get_class(self, cn: str) -> str:
        """Get class from SchemaView, being flexible about how the class name is formatted."""
        class_definition = self.sv.get_class(cn)
        if class_definition is None:
            class_definition = self.sv.get_class(camelcase(cn))
        if class_definition is None:
            class_definition = self.sv.get_class(uncamelcase(cn))
        if class_definition is None:
            raise ValueError(f"Class {cn} not found in schema")
        return class_definition

    def get_slot_names(self, class_names: List[str]) -> List[str]:
        """Union of induced slot names for the given classes, underscore-cased."""
        sv = self.sv
        slots = set()
        for cn in class_names:
            class_definition = self.get_class(cn)
            for slot in sv.class_induced_slots(class_definition.name):
                slots.add(slot.name)
        # convert to underscore (e.g. "has count" -> "has_count")
        return [underscore(slot) for slot in slots]

    def get_output_format(self, filename: str) -> OutputFormat:
        # e.g. "foo.tsv" -> OutputFormat.tsv; raises ValueError on unknown suffix
        return OutputFormat(Path(filename).suffix[1:])
1 change: 1 addition & 0 deletions src/koza/io/writer/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ def write(self, entities: Iterable):
@abstractmethod
def finalize(self):
    """Flush any buffered output and release resources (e.g. close file handles)."""
    pass

9 changes: 8 additions & 1 deletion src/koza/model/config/source_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from typing import Dict, List, Union, Optional
import yaml

from pydantic import StrictFloat, StrictInt, StrictStr
from pydantic import StrictFloat, StrictInt, StrictStr, BaseModel, Field
from pydantic.dataclasses import dataclass

from koza.model.config.pydantic_config import PYDANTIC_CONFIG
Expand Down Expand Up @@ -128,6 +128,11 @@ class DatasetDescription:
# license: Optional[str] = None # Possibly redundant, same as rights
rights: Optional[str] = None # License information for the data source

@dataclass(config=PYDANTIC_CONFIG)
class LinkMLWriter:
    """Configuration for one named LinkML-based writer (an entry under ``writers``
    in a source config).

    NOTE(review): the original declared this as both a pydantic dataclass and a
    BaseModel subclass; pydantic v2 rejects applying the dataclass decorator to
    a BaseModel, so it is now a plain pydantic dataclass like the other config
    classes in this module.
    """

    filename: str = Field(..., description="The file that this writer should write to")
    linkml_schema: str = Field(..., description="Path to the schema file? url? python package? TODO: figure this out")  # TODO
    # default_factory avoids a shared mutable [] default
    classes: List[str] = Field(default_factory=list, description="List of classes within the schema that will contribute")

@dataclass(config=PYDANTIC_CONFIG)
class SourceConfig:
Expand Down Expand Up @@ -180,6 +185,7 @@ class SourceConfig:
transform_mode: TransformMode = TransformMode.flat
global_table: Optional[Union[str, Dict]] = None
local_table: Optional[Union[str, Dict]] = None
writers: Optional[Dict[str,LinkMLWriter]] = None

def extract_archive(self):
archive_path = Path(self.file_archive).parent # .absolute()
Expand Down Expand Up @@ -321,3 +327,4 @@ class MapFileConfig(SourceConfig):
curie_prefix: Optional[str] = None
add_curie_prefix_to_columns: Optional[List[str]] = None
depends_on: Optional[List[str]] = None

73 changes: 73 additions & 0 deletions tests/unit/test_linkml_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import uuid

import pytest
from unittest.mock import mock_open, patch

from biolink_model.datamodel.pydanticmodel_v2 import GeneToPhenotypicFeatureAssociation
from linkml_runtime import SchemaView
import importlib

from pyparsing import pyparsing_test

from koza.io.writer.linkml_writer import LinkMLWriter
from koza.model.config.source_config import OutputFormat


# setup the tests by creating a schemaview for biolink model

@pytest.fixture
def sv():
    """SchemaView over the Biolink Model schema packaged with biolink_model."""
    schema_path = importlib.resources.files("biolink_model.schema").joinpath("biolink_model.yaml")
    return SchemaView(schema_path)

@pytest.fixture
def d2p_writer(sv):
    """A LinkMLWriter emitting DiseaseToPhenotypicFeatureAssociation rows as TSV."""
    writer_args = dict(
        output_dir=".",
        filename="test.tsv",
        schemaview=sv,
        class_names=["DiseaseToPhenotypicFeatureAssociation"],
    )
    return LinkMLWriter(**writer_args)

def test_get_slot_names(d2p_writer):
    """The induced slots must include the core association and frequency slots."""
    expected_slot_subset = {
        'id', 'category', 'subject', 'predicate', 'object',
        'frequency_qualifier', 'has_total', 'has_count', 'has_percentage',
        'qualifiers', 'primary_knowledge_source', 'aggregator_knowledge_source',
    }
    missing_slots = expected_slot_subset.difference(d2p_writer.slots)
    assert not missing_slots, f"Missing slots: {missing_slots}"

@pytest.mark.parametrize(
    "value, class_name", [
        ("GeneToDiseaseAssociation", "gene to disease association"),
        ("gene to disease association", "gene to disease association")
    ]
)
def test_get_class(d2p_writer, value, class_name):
    # get_class should resolve both CamelCase and spaced ("natural") class names
    # to the same schema class definition
    assert d2p_writer.get_class(value).name == class_name

def test_get_output_format(d2p_writer):
    """The output format is derived from the filename suffix."""
    for name, expected in (("test.tsv", OutputFormat.tsv), ("test.jsonl", OutputFormat.jsonl)):
        assert d2p_writer.get_output_format(name) == expected

# test the write method using a mock file handle (no file is actually written)
def test_write(d2p_writer):
    mock_file = mock_open()
    # Replace the real handle opened by the fixture with a mock so that
    # finalize() writes into the mock and the output can be inspected.
    d2p_writer.fh = mock_file()
    with patch("builtins.open", mock_file):
        d2p_writer.write(GeneToPhenotypicFeatureAssociation(
            id="1",
            subject='MONDO:0005148',
            predicate='biolink:has_phenotype',
            object='HP:0007354',
            primary_knowledge_source='infores:hpo-annotations',
            knowledge_level='not_provided',
            agent_type='not_provided'
        ))
        d2p_writer.finalize()

    # assert the expected header
    written_lines = mock_file().write.call_args_list
    written_header = written_lines[0][0][0].replace('\n','').split('\t')
    # publications, qualifiers and type are not actually used, their presence here comes from kgx/biolink assumptions in build_export_row that need to be fixed
    assert written_header == ['id', 'category', 'subject', 'predicate', 'object', 'agent_type', 'knowledge_level', 'primary_knowledge_source', 'publications', 'qualifiers','type']
    # assert the expected data row
    written_data_row = written_lines[1][0][0].replace('\n','').split('\t')
    assert written_data_row == ['1', 'biolink:GeneToPhenotypicFeatureAssociation', 'MONDO:0005148', 'biolink:has_phenotype', 'HP:0007354', 'not_provided', 'not_provided', 'infores:hpo-annotations', '', '', '']
Loading