Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LinkML based writer support, with explicit definitions of multiple writers #148

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/string-linkml-writer/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ingest_title: 'String DB'
ingest_url: 'https://string-db.org'
description: 'STRING: functional protein association networks'
rights: 'https://string-db.org/cgi/access.pl?footer_active_subpage=licensing'
24 changes: 24 additions & 0 deletions examples/string-linkml-writer/protein-links-detailed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import uuid

from biolink_model.datamodel.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Gene

from koza.cli_utils import get_koza_app

# Transform STRING protein-links rows into Biolink Gene nodes and
# gene-to-gene interaction edges, routing them to separate named writers.
koza_app = get_koza_app('protein-links-detailed')
koza_map = koza_app.get_map('entrez-2-string')

while (row := koza_app.get_row()) is not None:
    # Map STRING protein identifiers to NCBI gene ids via the entrez-2-string map.
    gene_a = Gene(id="NCBIGene:" + koza_map[row["protein1"]]["entrez"])
    gene_b = Gene(id="NCBIGene:" + koza_map[row["protein2"]]["entrez"])

    pairwise_gene_to_gene_interaction = PairwiseGeneToGeneInteraction(
        # NOTE(review): uuid1 embeds host MAC/timestamp; uuid4 is the usual
        # choice for opaque ids -- confirm before changing
        id="uuid:" + str(uuid.uuid1()),
        subject=gene_a.id,
        object=gene_b.id,
        predicate="biolink:interacts_with",
        knowledge_level="not_provided",
        agent_type="not_provided",
    )
    # Route nodes and edges to their dedicated writers (declared under
    # 'writers' in protein-links-detailed.yaml).
    koza_app.write(gene_a, gene_b, writer="nodes")
    koza_app.write(pairwise_gene_to_gene_interaction, writer="edges")
    # Removed stray no-arg koza_app.write(): it sent an empty entity tuple to
    # the default writer on every row -- leftover from before named writers.
58 changes: 58 additions & 0 deletions examples/string-linkml-writer/protein-links-detailed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
name: 'protein-links-detailed'

delimiter: ' '

files:
- './examples/data/string.tsv'
- './examples/data/string2.tsv'

metadata: !include './examples/string-linkml-writer/metadata.yaml'

columns:
- 'protein1'
- 'protein2'
- 'neighborhood'
- 'fusion'
- 'cooccurence'
- 'coexpression'
- 'experimental'
- 'database'
- 'textmining'
- 'combined_score' : 'int'

filters:
- inclusion: 'include'
column: 'combined_score'
filter_code: 'lt'
value: 700

depends_on:
- './examples/maps/entrez-2-string.yaml'

writers:
"nodes":
filename: 'protein_links_nodes.tsv'
linkml_schema: 'biolink_model.schema:biolink_model.yaml'
classes:
- 'Gene'
"edges":
filename: 'protein_links_edges.tsv'
linkml_schema: 'biolink_model.schema:biolink_model.yaml'
classes:
- 'PairwiseGeneToGeneInteraction'

#node_properties:
# - 'id'
# - 'category'
# - 'provided_by'
#
#edge_properties:
# - 'id'
# - 'subject'
# - 'predicate'
# - 'object'
# - 'category'
# - 'relation'
# - 'provided_by'

transform_mode: 'loop'
17 changes: 16 additions & 1 deletion src/koza/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import yaml

from linkml.validator import validate
from openpyxl.styles.builtins import output
from pydantic import ValidationError
from sssom.cli import output_format_option

from koza.converter.kgx_converter import KGXConverter
from koza.utils.exceptions import MapItemException, NextRowException
Expand Down Expand Up @@ -42,6 +44,7 @@ def __init__(
self._map_cache: Dict[str, Dict] = {}
self.curie_cleaner: CurieCleaner = CurieCleaner()
self.writer: KozaWriter = self._get_writer()
self.writers: Dict[str,KozaWriter] = self._get_writers()
self.logger = logger
self.outfiles = []
if hasattr(self.writer, 'nodes_file_name'):
Expand Down Expand Up @@ -148,7 +151,7 @@ def next_row():
"""
raise NextRowException

def write(self, *entities):
def write(self, *entities, writer=None):
# If a schema/validator is defined, validate before writing
# if self.validate:
if hasattr(self, 'schema'):
Expand All @@ -170,6 +173,18 @@ def write(self, *entities):

self.writer.write(entities)

def _get_writers(self) -> Dict[str, KozaWriter]:
    """Build the named writers declared under ``writers`` in the source config.

    Returns a mapping of writer name -> KozaWriter instance. Currently a
    stub: named-writer construction is not implemented yet, so this always
    returns an empty dict and ``write(..., writer=...)`` has nothing to
    dispatch to.

    TODO: instantiate one writer per entry in ``self.source.config.writers``
    (filename, linkml_schema, classes), rooted at ``self.output_dir``.
    """
    return {}
def _get_writer(self) -> Union[TSVWriter, JSONLWriter]:
writer_params = [
self.output_dir,
Expand Down
87 changes: 87 additions & 0 deletions src/koza/io/writer/linkml_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import json
from pathlib import Path
from typing import Union, List
from linkml_runtime import SchemaView
from linkml_runtime.utils.formatutils import camelcase, uncamelcase, underscore

from koza.io.utils import build_export_row
from koza.io.writer.writer import KozaWriter
from koza.model.config.source_config import OutputFormat
from koza.model.config.sssom_config import SSSOMConfig


class LinkMLWriter(KozaWriter):
    """Write instances of LinkML-schema classes to a TSV or JSONL file.

    Columns are the induced slots of the configured classes. Rows are
    buffered in memory and flushed on ``finalize()`` so the header can be
    restricted to slots that were actually used.
    """

    def __init__(self,
                 output_dir: Union[str, Path],
                 filename: str,
                 schemaview: SchemaView,
                 class_names: List[str],
                 sssom_config: SSSOMConfig = None
                 ):
        self.fh = open(Path(output_dir, filename), 'w')
        self.sv = schemaview
        self.slots = self.get_slot_names(class_names)
        self.sssom_config = sssom_config
        self.rows = []
        self.used_slots = set()
        # tsv vs jsonl is inferred from the filename suffix
        self.output_format = self.get_output_format(filename)
        # TODO: pass delimiter and list_delimiter as arguments
        # (was assigned twice; collapsed to a single assignment)
        self.delimiter = "\t"
        self.list_delimiter = "|"

    def write(self, record):
        """Buffer one record (a pydantic instance) for output at finalize()."""
        # TODO: add assertion about the class of record?
        export_row = build_export_row(record.dict(), list_delimiter=self.list_delimiter)
        self.rows.append(export_row)
        self.used_slots.update(export_row.keys())

    def finalize(self):
        """Write header (tsv only) and all buffered rows, then close the file."""
        # todo: sort the slots in an external function that looks at the schema,
        # applies sensible defaults about identifier, type designator & label slots, etc
        ordered_slots = self.sort_slots(self.used_slots)
        if self.output_format == OutputFormat.tsv:
            # write the header
            self.fh.write(self.delimiter.join(ordered_slots) + "\n")
        for export_row in self.rows:
            if self.output_format == OutputFormat.tsv:
                # BUGFIX: missing slots must serialize as "" -- str.join raises
                # TypeError when handed None values.
                ordered_values = [export_row.get(slot, "") for slot in ordered_slots]
                self.fh.write(self.delimiter.join(ordered_values) + "\n")
            elif self.output_format == OutputFormat.jsonl:
                self.fh.write(json.dumps(export_row) + "\n")

        self.fh.close()

    def sort_slots(self, slots: List[str]) -> List[str]:
        # TODO: generalize this a little more, at least biolink vs sssom, also try using rank
        # sort the slots with a specific order for some slots and the rest alphabetically
        specific_order = ['id', 'category', 'subject', 'predicate', 'object']
        ordered_slots = [slot for slot in specific_order if slot in slots]
        remaining_slots = sorted(set(slots) - set(specific_order))
        return ordered_slots + remaining_slots

    def get_class(self, cn: str) -> str:
        """Get class from SchemaView, being flexible about how the class name is formatted."""
        class_definition = self.sv.get_class(cn)
        if class_definition is None:
            class_definition = self.sv.get_class(camelcase(cn))
        if class_definition is None:
            class_definition = self.sv.get_class(uncamelcase(cn))
        if class_definition is None:
            raise ValueError(f"Class {cn} not found in schema")
        return class_definition

    def get_slot_names(self, class_names: List[str]) -> List[str]:
        """Union of induced slot names for the given classes, underscore-cased."""
        sv = self.sv
        slots = set()
        for cn in class_names:
            class_definition = self.get_class(cn)
            for slot in sv.class_induced_slots(class_definition.name):
                slots.add(slot.name)
        # convert to underscore (e.g. "has count" -> "has_count")
        return [underscore(slot) for slot in slots]

    def get_output_format(self, filename: str) -> OutputFormat:
        # e.g. "foo.tsv" -> OutputFormat.tsv; raises ValueError on unknown suffix
        return OutputFormat(Path(filename).suffix[1:])
1 change: 1 addition & 0 deletions src/koza/io/writer/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ def write(self, entities: Iterable):
@abstractmethod
def finalize(self):
    """Flush any buffered output and release resources (e.g. close file handles)."""
    pass

9 changes: 8 additions & 1 deletion src/koza/model/config/source_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from typing import Dict, List, Union, Optional
import yaml

from pydantic import StrictFloat, StrictInt, StrictStr
from pydantic import StrictFloat, StrictInt, StrictStr, BaseModel, Field
from pydantic.dataclasses import dataclass

from koza.model.config.pydantic_config import PYDANTIC_CONFIG
Expand Down Expand Up @@ -128,6 +128,11 @@ class DatasetDescription:
# license: Optional[str] = None # Possibly redundant, same as rights
rights: Optional[str] = None # License information for the data source

@dataclass(config=PYDANTIC_CONFIG)
class LinkMLWriter:
    """Configuration for one named LinkML-based writer (an entry under ``writers``
    in a source config).

    NOTE(review): the original declared this as both a pydantic dataclass and a
    BaseModel subclass; pydantic v2 rejects applying the dataclass decorator to
    a BaseModel, so it is now a plain pydantic dataclass like the other config
    classes in this module.
    """

    filename: str = Field(..., description="The file that this writer should write to")
    linkml_schema: str = Field(..., description="Path to the schema file? url? python package? TODO: figure this out")  # TODO
    # default_factory avoids a shared mutable [] default
    classes: List[str] = Field(default_factory=list, description="List of classes within the schema that will contribute")

@dataclass(config=PYDANTIC_CONFIG)
class SourceConfig:
Expand Down Expand Up @@ -180,6 +185,7 @@ class SourceConfig:
transform_mode: TransformMode = TransformMode.flat
global_table: Optional[Union[str, Dict]] = None
local_table: Optional[Union[str, Dict]] = None
writers: Optional[Dict[str,LinkMLWriter]] = None

def extract_archive(self):
archive_path = Path(self.file_archive).parent # .absolute()
Expand Down Expand Up @@ -321,3 +327,4 @@ class MapFileConfig(SourceConfig):
curie_prefix: Optional[str] = None
add_curie_prefix_to_columns: Optional[List[str]] = None
depends_on: Optional[List[str]] = None

73 changes: 73 additions & 0 deletions tests/unit/test_linkml_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import uuid

import pytest
from unittest.mock import mock_open, patch

from biolink_model.datamodel.pydanticmodel_v2 import GeneToPhenotypicFeatureAssociation
from linkml_runtime import SchemaView
import importlib

from pyparsing import pyparsing_test

from koza.io.writer.linkml_writer import LinkMLWriter
from koza.model.config.source_config import OutputFormat


# setup the tests by creating a schemaview for biolink model

@pytest.fixture
def sv():
    """SchemaView over the Biolink Model schema packaged with biolink_model."""
    schema_path = importlib.resources.files("biolink_model.schema").joinpath("biolink_model.yaml")
    return SchemaView(schema_path)

@pytest.fixture
def d2p_writer(sv):
    """A LinkMLWriter emitting DiseaseToPhenotypicFeatureAssociation rows as TSV."""
    writer_args = dict(
        output_dir=".",
        filename="test.tsv",
        schemaview=sv,
        class_names=["DiseaseToPhenotypicFeatureAssociation"],
    )
    return LinkMLWriter(**writer_args)

def test_get_slot_names(d2p_writer):
    """The induced slots must include the core association and frequency slots."""
    expected_slot_subset = {
        'id', 'category', 'subject', 'predicate', 'object',
        'frequency_qualifier', 'has_total', 'has_count', 'has_percentage',
        'qualifiers', 'primary_knowledge_source', 'aggregator_knowledge_source',
    }
    missing_slots = expected_slot_subset.difference(d2p_writer.slots)
    assert not missing_slots, f"Missing slots: {missing_slots}"

@pytest.mark.parametrize(
    "value, class_name", [
        ("GeneToDiseaseAssociation", "gene to disease association"),
        ("gene to disease association", "gene to disease association")
    ]
)
def test_get_class(d2p_writer, value, class_name):
    # get_class should resolve both CamelCase and spaced ("natural") class names
    # to the same schema class definition
    assert d2p_writer.get_class(value).name == class_name

def test_get_output_format(d2p_writer):
    """The output format is derived from the filename suffix."""
    for name, expected in (("test.tsv", OutputFormat.tsv), ("test.jsonl", OutputFormat.jsonl)):
        assert d2p_writer.get_output_format(name) == expected

# test the write method using a mock file handle (no file is actually written)
def test_write(d2p_writer):
    mock_file = mock_open()
    # Replace the real handle opened by the fixture with a mock so that
    # finalize() writes into the mock and the output can be inspected.
    d2p_writer.fh = mock_file()
    with patch("builtins.open", mock_file):
        d2p_writer.write(GeneToPhenotypicFeatureAssociation(
            id="1",
            subject='MONDO:0005148',
            predicate='biolink:has_phenotype',
            object='HP:0007354',
            primary_knowledge_source='infores:hpo-annotations',
            knowledge_level='not_provided',
            agent_type='not_provided'
        ))
        d2p_writer.finalize()

    # assert the expected header
    written_lines = mock_file().write.call_args_list
    written_header = written_lines[0][0][0].replace('\n','').split('\t')
    # publications, qualifiers and type are not actually used, their presence here comes from kgx/biolink assumptions in build_export_row that need to be fixed
    assert written_header == ['id', 'category', 'subject', 'predicate', 'object', 'agent_type', 'knowledge_level', 'primary_knowledge_source', 'publications', 'qualifiers','type']
    # assert the expected data row
    written_data_row = written_lines[1][0][0].replace('\n','').split('\t')
    assert written_data_row == ['1', 'biolink:GeneToPhenotypicFeatureAssociation', 'MONDO:0005148', 'biolink:has_phenotype', 'HP:0007354', 'not_provided', 'not_provided', 'infores:hpo-annotations', '', '', '']
Loading