[ALS-7113] Enhance sequential loader

Luke Sikina committed Feb 13, 2025
1 parent d6f1ec4 commit f570131
Showing 24 changed files with 901 additions and 83 deletions.
1 change: 1 addition & 0 deletions docker/pic-sure-hpds-etl/Dockerfile
@@ -28,6 +28,7 @@ COPY --from=build /app/docker/pic-sure-hpds-etl/RekeyDataset-jar-with-dependenci
COPY --from=build /app/docker/pic-sure-hpds-etl/RemoveConceptFromMetadata-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/HideAnnotationCategoryValue-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/SequentialLoader-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/LowRAMMultiCSVLoader-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/create_key.sh .

ENTRYPOINT ["sh", "-c", "java $JAVA_OPTS -Xmx${HEAPSIZE:-2048}m -jar ${LOADER_NAME:-CSVLoader}-jar-with-dependencies.jar"]
28 changes: 24 additions & 4 deletions etl/pom.xml
@@ -135,7 +135,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.CSVLoaderNewSearch</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv.CSVLoaderNewSearch</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -155,7 +155,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.CSVLoader</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv.CSVLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -175,7 +175,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.SQLLoader</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.sql.SQLLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -195,7 +195,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.SequentialLoader</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.sequential.SequentialLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -210,6 +210,26 @@
<goal>single</goal>
</goals>
</execution>
<execution>
<id>buildLiteCSVLoader</id>
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.litecsv.LowRAMMultiCSVLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<classifier>LowRAMMultiCSVLoader</classifier>
<finalName>LowRAMMultiCSVLoader</finalName>
</configuration>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
<execution>
<id>buildMultialleleCounter</id>
<configuration>
RemapPatientIds.java
@@ -1,39 +1,22 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype.util;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Map.Entry;
import java.util.concurrent.ExecutionException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.ImmutableList;

import de.siegmar.fastcsv.reader.CsvContainer;
import de.siegmar.fastcsv.reader.CsvReader;
import de.siegmar.fastcsv.reader.CsvRow;
import de.siegmar.fastcsv.writer.CsvWriter;
import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;

public class RemapPatientIds {
protected static LoadingCache<String, PhenoCube<?>> store;
DumpSourceCSV.java
@@ -21,7 +21,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

public class DumpSourceCSV {
protected static LoadingCache<String, PhenoCube<?>> store;
FixCategoricalConcepts.java
@@ -18,7 +18,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

@SuppressWarnings({"unchecked", "rawtypes"})
public class FixCategoricalConcepts {
RekeyDataset.java
@@ -18,7 +18,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

@SuppressWarnings({"unchecked"})
public class RekeyDataset {
RemapIds.java
@@ -18,7 +18,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

@SuppressWarnings({"unchecked", "rawtypes"})
public class RemapIds {
RenameCategories.java
@@ -20,7 +20,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

public class RenameCategories {

LoadingStore.java
@@ -1,4 +1,4 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;
package edu.harvard.hms.dbmi.avillach.hpds.etl;

import java.io.*;
import java.nio.file.Files;
CSVLoader.java
@@ -1,9 +1,10 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv;

import java.io.*;
import java.util.Date;
import java.util.concurrent.ExecutionException;

import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
CSVLoaderNewSearch.java
@@ -1,7 +1,8 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv;

import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
@@ -17,16 +18,6 @@ public class CSVLoaderNewSearch {

private static final Logger log = LoggerFactory.getLogger(CSVLoaderNewSearch.class);

private static final int PATIENT_NUM = 0;

private static final int CONCEPT_PATH = 1;

private static final int NUMERIC_VALUE = 2;

private static final int TEXT_VALUE = 3;

private static final int DATETIME = 4;

private static boolean DO_VARNAME_ROLLUP = false;

private static final String HPDS_DIRECTORY = "/opt/local/hpds/";
@@ -56,24 +47,24 @@ private static void initialLoad() throws IOException {

private static void processRecord(final PhenoCube[] currentConcept, CSVRecord record) {
if (record.size() < 4) {
log.info("Record number {} had less records than we expected so we are skipping it.", record.getRecordNumber());
log.info("Record number {} had less records than we exgpected so we are skipping it.", record.getRecordNumber());
return;
}

String conceptPath = getSanitizedConceptPath(record);
String numericValue = record.get(NUMERIC_VALUE);
String conceptPath = CSVParserUtil.parseConceptPath(record, DO_VARNAME_ROLLUP);
String numericValue = record.get(CSVParserUtil.NUMERIC_VALUE);
boolean isAlpha = (numericValue == null || numericValue.isEmpty());
String value = isAlpha ? record.get(TEXT_VALUE) : numericValue;
String value = isAlpha ? record.get(CSVParserUtil.TEXT_VALUE) : numericValue;
currentConcept[0] = getPhenoCube(currentConcept[0], conceptPath, isAlpha);

if (value != null && !value.trim().isEmpty() &&
((isAlpha && currentConcept[0].vType == String.class) || (!isAlpha && currentConcept[0].vType == Double.class))) {
value = value.trim();
currentConcept[0].setColumnWidth(isAlpha ? Math.max(currentConcept[0].getColumnWidth(), value.getBytes().length) : Double.BYTES);
int patientId = Integer.parseInt(record.get(PATIENT_NUM));
int patientId = Integer.parseInt(record.get(CSVParserUtil.PATIENT_NUM));
Date date = null;
if (record.size() > 4 && record.get(DATETIME) != null && !record.get(DATETIME).isEmpty()) {
date = new Date(Long.parseLong(record.get(DATETIME)));
if (record.size() > 4 && record.get(CSVParserUtil.DATETIME) != null && !record.get(CSVParserUtil.DATETIME).isEmpty()) {
date = new Date(Long.parseLong(record.get(CSVParserUtil.DATETIME)));
}
currentConcept[0].add(patientId, isAlpha ? value : Double.parseDouble(value), date);
store.allIds.add(patientId);
@@ -91,28 +82,4 @@ private static PhenoCube getPhenoCube(PhenoCube currentConcept, String conceptPa

return currentConcept;
}

private static String getSanitizedConceptPath(CSVRecord record) {
String conceptPathFromRow = record.get(CONCEPT_PATH);
String[] segments = conceptPathFromRow.split("\\\\");
for (int x = 0; x < segments.length; x++) {
segments[x] = segments[x].trim();
}
conceptPathFromRow = String.join("\\", segments) + "\\";
conceptPathFromRow = conceptPathFromRow.replaceAll("\\ufffd", "");
String textValueFromRow = record.get(TEXT_VALUE) == null ? null : record.get(TEXT_VALUE).trim();
if (textValueFromRow != null) {
textValueFromRow = textValueFromRow.replaceAll("\\ufffd", "");
}
String conceptPath;

if (DO_VARNAME_ROLLUP) {
conceptPath = conceptPathFromRow.endsWith("\\" + textValueFromRow + "\\") ? conceptPathFromRow.replaceAll("\\\\[^\\\\]*\\\\$", "\\\\") : conceptPathFromRow;
} else {
conceptPath = conceptPathFromRow;
}
return conceptPath;
}


}
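The getSanitizedConceptPath logic removed above is not gone: the same trimming, \ufffd-stripping, and rollup regex reappear in the new shared CSVParserUtil below, where other loaders can presumably reuse them.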
CSVParserUtil.java
@@ -0,0 +1,55 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv;

import org.apache.commons.csv.CSVRecord;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.Date;
import java.util.stream.Collectors;

public class CSVParserUtil {
public static final int PATIENT_NUM = 0;
public static final int CONCEPT_PATH = 1;
public static final int NUMERIC_VALUE = 2;
public static final int TEXT_VALUE = 3;
public static final int DATETIME = 4;

public static String parseConceptPath(CSVRecord record, boolean doVarNameRollup) {
String conceptPathFromRow = record.get(CONCEPT_PATH);
conceptPathFromRow = Arrays.stream(conceptPathFromRow.split("\\\\"))
.map(String::trim)
.collect(Collectors.joining("\\")) + "\\";
conceptPathFromRow = stripWeirdUnicodeChars(conceptPathFromRow);

// \\ufffd = �
String textValueFromRow = stripWeirdUnicodeChars(trim(record.get(TEXT_VALUE)));
if (doVarNameRollup && conceptPathFromRow.endsWith("\\" + textValueFromRow + "\\")) {
// This regex deletes the last node from the concept path, i.e. "rolling it up"
return conceptPathFromRow.replaceAll("\\\\[^\\\\]*\\\\$", "\\\\");
} else {
return conceptPathFromRow;
}
}

private static String stripWeirdUnicodeChars(@Nonnull String raw) {
return raw.replaceAll("\\ufffd", "");
}

public static String trim(@Nullable String maybeString) {
return maybeString == null ? "" : maybeString.trim();
}

public static @Nullable Date parseDate(CSVRecord record) {
Date date = null;
try {
if (record.size() > 4 && record.get(DATETIME) != null && !record.get(DATETIME).isEmpty()) {
date = new Date(Long.parseLong(record.get(DATETIME)));
}
} catch (NumberFormatException e) {
return null;
}

return date;
}
}
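A minimal sketch of how this helper behaves, assuming CSVParserUtil is on the classpath and using commons-csv (already imported by these loaders) to build the record; the row values are hypothetical:

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

import java.io.IOException;
import java.io.StringReader;

// Hypothetical driver for the new helper. The row layout matches the loaders'
// column order: PATIENT_NUM, CONCEPT_PATH, NUMERIC_VALUE, TEXT_VALUE, DATETIME.
public class CSVParserUtilDemo {
    public static void main(String[] args) throws IOException {
        String row = "42,\\demographics\\ sex \\Male\\,,Male,1739404800000";
        CSVRecord record = CSVFormat.DEFAULT.parse(new StringReader(row)).getRecords().get(0);

        // Segments are trimmed and re-joined: prints \demographics\sex\Male\
        System.out.println(CSVParserUtil.parseConceptPath(record, false));

        // With rollup, the trailing node equal to TEXT_VALUE is dropped: prints \demographics\sex\
        System.out.println(CSVParserUtil.parseConceptPath(record, true));

        // parseDate catches NumberFormatException and returns null rather than
        // failing the row; here the epoch-millis value parses cleanly.
        System.out.println(CSVParserUtil.parseDate(record));
    }
}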
IngestStatus.java
@@ -0,0 +1,6 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.litecsv;

import java.nio.file.Path;

public record IngestStatus(Path file, long lineCount, int conceptCount, long duration) {
}
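The litecsv loader itself is not expanded in this view, but here is a hedged sketch of how a per-file IngestStatus might be created and reported; the path, counts, and millisecond units are all assumptions:

import java.nio.file.Path;

// Hypothetical consumer of the new IngestStatus record; the accessors are the
// auto-generated record methods (file(), lineCount(), conceptCount(), duration()).
public class IngestStatusDemo {
    public static void main(String[] args) {
        IngestStatus status = new IngestStatus(
                Path.of("/opt/local/hpds/example.csv"), // assumed input location
                1_000_000L, // lines read from the CSV
                250,        // distinct concept paths encountered
                92_000L     // elapsed time, assumed to be millis
        );
        System.out.printf("Ingested %s: %d lines, %d concepts in %d ms%n",
                status.file(), status.lineCount(), status.conceptCount(), status.duration());
    }
}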