fix problem with db tables derived from files with underscores in the filename
cldf · Mar 18, 2024 · 27b44ce · 27b44ce
1 parent 944a86e
commit 27b44ce
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+
+Fixed a bug whereby the names of components whose CSV filenames contain underscores were not
+translated appropriately when creating the SQLite db. (Note that this fix is required for the
+ParameterNetwork component in CLDF 1.3.)
+
 
 ## [1.37.0] - 2024-01-22
 

diff --git a/src/pycldf/db.py b/src/pycldf/db.py
@@ -108,12 +108,39 @@ def translate(d: typing.Dict[str, TableTranslation], table: str, col=None) -> st
             # A simple, translateable column name.
             return d[table].columns[col]
         if '_' in col:
-            t, _, c = col.partition('_')
+            parts = col.split('_')
+            t = '_'.join(parts[:-1])
+            c = parts[-1]
             if t in table and t in d and c in d[t].columns:
                 # A generated column name of an association table.
                 return '_'.join([d[t].name or t, d[t].columns[c]])
         return col
-    return '_'.join([(d[t].name or t) if t in d else t for t in table.split('_')])
+
+    # To handle association tables, we proceed as follows:
+    # 1. We split the full table name on underscores - the separators of sub-table names in
+    # association tables.
+    # 2. Since regular table names may contain underscores as well, we look for the longest
+    # leading run of _-separated name parts which, re-joined with underscores, appears in the
+    # translation dict, and translate it.
+    # 3. We repeat step 2 on the remaining name parts until at most one part is left; a single
+    # leftover part is translated if it appears in the translation dict and kept as is otherwise.
+    def t(n):
+        if n in d:
+            return d[n].name or n
+        tables, comps = [], n.split('_')
+        while comps and len(comps) > 1:
+            for i in range(len(comps), 0, -1):
+                nc = '_'.join(comps[:i])
+                if nc in d:
+                    tables.append(d[nc].name or nc)
+                    comps = comps[i:]
+                    break
+            else:
+                raise ValueError(n)  # pragma: no cover
+        if comps:
+            assert len(comps) == 1
+            tables.append(d[comps[0]].name or comps[0] if comps[0] in d else comps[0])
+        return '_'.join(tables)
+
+    return t(table)
 
 
 def clean_bibtex_key(s):

diff --git a/tests/test_db.py b/tests/test_db.py
@@ -44,15 +44,35 @@ def test_db_write(tmp_path, data):
 
 def test_db_write_extra_tables(md):
     ds = Generic.in_dir(md.parent)
-    ds.add_table('extra.csv', 'ID', 'Name', {'name': 'x', 'separator': '#'})
-    ds.write(md, **{'extra.csv': [dict(ID=1, Name='Name', x=['a', 'b', 'c'])]})
+    ds.add_table(
+        'ext_ra.csv',
+        {'name': 'ID', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#id'},
+        'Name', {'name': 'x', 'separator': '#'})
+    ds.add_component('ParameterTable')
+    ds.add_component(
+        'ParameterNetwork',
+        {
+            'name': 'Source',
+            'separator': ';',
+            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source'},
+        {'name': 'ex', 'separator': ' '},
+    )
+    ds.add_foreign_key('ParameterNetwork', 'ex', 'ext_ra.csv', 'ID')
+    ds.write(md, **{
+        'ParameterTable': [dict(ID='p')],
+        'ParameterNetwork': [
+            dict(ID='e', Target_Parameter_ID='p', Source_Parameter_ID='p', ex=['1'])],
+        'ext_ra.csv': [dict(ID='1', Name='Name', x=['a', 'b', 'c'])]})
 
     db = Database(ds, fname=md.parent / 'db.sqlite')
     db.write_from_tg()
-    rows = db.query("""select x from "extra.csv" """)
+    rows = db.query("""select x from "ext_ra.csv" """)
     assert len(rows) == 1
     assert rows[0][0] == 'a#b#c'
-    assert db.split_value('extra.csv', 'x', rows[0][0]) == ['a', 'b', 'c']
+    assert db.split_value('ext_ra.csv', 'x', rows[0][0]) == ['a', 'b', 'c']
+    # We check that association tables where filenames for both components contain underscores work:
+    rows = db.query("""select count(*) from "ParameterNetwork_ext_ra.csv" """)
+    assert len(rows) == 1
 
 
 def test_db_write_extra_columns(md):
@@ -130,6 +150,8 @@ def translations():
     return {
         'forms.csv': TableTranslation(columns={'pk': 'id'}),
         'parameters.csv': TableTranslation(name='PTable', columns={'pk': 'id'}),
+        'a_b.csv': TableTranslation(name='ab'),
+        'c_d.csv': TableTranslation(name='cd'),
     }
 
 
@@ -141,6 +163,8 @@ def translations():
         ('forms.csv_parameters.csv', None, 'forms.csv_PTable'),
         ('forms.csv_parameters.csv', 'parameters.csv_pk', 'PTable_id'),
         ('forms.csv_parameters.csv', 'forms.csv_pk', 'forms.csv_id'),
+        ('a_b.csv_c_d.csv', None, 'ab_cd'),
+        ('a_b.csv_cd.csv', None, 'ab_cd.csv'),
     ]
 )
 def test_translate(translations, table, col, expected):