[ALS-7113] Enhance sequential loader

Luke Sikina committed Feb 13, 2025
1 parent d6f1ec4 commit f570131
Showing 24 changed files with 901 additions and 83 deletions.
1 change: 1 addition & 0 deletions docker/pic-sure-hpds-etl/Dockerfile
@@ -28,6 +28,7 @@ COPY --from=build /app/docker/pic-sure-hpds-etl/RekeyDataset-jar-with-dependenci
COPY --from=build /app/docker/pic-sure-hpds-etl/RemoveConceptFromMetadata-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/HideAnnotationCategoryValue-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/SequentialLoader-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/LowRAMMultiCSVLoader-jar-with-dependencies.jar .
COPY --from=build /app/docker/pic-sure-hpds-etl/create_key.sh .

ENTRYPOINT ["sh", "-c", "java $JAVA_OPTS -Xmx${HEAPSIZE:-2048}m -jar ${LOADER_NAME:-CSVLoader}-jar-with-dependencies.jar"]
28 changes: 24 additions & 4 deletions etl/pom.xml
@@ -135,7 +135,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.CSVLoaderNewSearch</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv.CSVLoaderNewSearch</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -155,7 +155,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.CSVLoader</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv.CSVLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -175,7 +175,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.SQLLoader</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.sql.SQLLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -195,7 +195,7 @@
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.SequentialLoader</mainClass>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.sequential.SequentialLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
@@ -210,6 +210,26 @@
<goal>single</goal>
</goals>
</execution>
<execution>
<id>buildLiteCSVLoader</id>
<configuration>
<archive>
<manifest>
<mainClass>edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.litecsv.LowRAMMultiCSVLoader</mainClass>
</manifest>
</archive>
<outputDirectory>${project.basedir}/../docker/pic-sure-hpds-etl</outputDirectory>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<classifier>LowRAMMultiCSVLoader</classifier>
<finalName>LowRAMMultiCSVLoader</finalName>
</configuration>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
<execution>
<id>buildMultialleleCounter</id>
<configuration>
RemapPatientIds.java
@@ -1,39 +1,22 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype.util;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Map.Entry;
import java.util.concurrent.ExecutionException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.ImmutableList;

import de.siegmar.fastcsv.reader.CsvContainer;
import de.siegmar.fastcsv.reader.CsvReader;
import de.siegmar.fastcsv.reader.CsvRow;
import de.siegmar.fastcsv.writer.CsvWriter;
import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;

public class RemapPatientIds {
protected static LoadingCache<String, PhenoCube<?>> store;
DumpSourceCSV.java
@@ -21,7 +21,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

public class DumpSourceCSV {
protected static LoadingCache<String, PhenoCube<?>> store;
FixCategoricalConcepts.java
@@ -18,7 +18,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

@SuppressWarnings({"unchecked", "rawtypes"})
public class FixCategoricalConcepts {
RekeyDataset.java
@@ -18,7 +18,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

@SuppressWarnings({"unchecked"})
public class RekeyDataset {
RemapIds.java
@@ -18,7 +18,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.KeyAndValue;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

@SuppressWarnings({"unchecked", "rawtypes"})
public class RemapIds {
RenameCategories.java
@@ -20,7 +20,7 @@
import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.LoadingStore;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;

public class RenameCategories {

LoadingStore.java
@@ -1,4 +1,4 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;
package edu.harvard.hms.dbmi.avillach.hpds.etl;

import java.io.*;
import java.nio.file.Files;
CSVLoader.java
@@ -1,9 +1,10 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv;

import java.io.*;
import java.util.Date;
import java.util.concurrent.ExecutionException;

import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
CSVLoaderNewSearch.java
@@ -1,7 +1,8 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv;

import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto;
import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.LoadingStore;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
@@ -17,16 +18,6 @@ public class CSVLoaderNewSearch {

private static final Logger log = LoggerFactory.getLogger(CSVLoaderNewSearch.class);

private static final int PATIENT_NUM = 0;

private static final int CONCEPT_PATH = 1;

private static final int NUMERIC_VALUE = 2;

private static final int TEXT_VALUE = 3;

private static final int DATETIME = 4;

private static boolean DO_VARNAME_ROLLUP = false;

private static final String HPDS_DIRECTORY = "/opt/local/hpds/";
@@ -56,24 +47,24 @@ private static void initialLoad() throws IOException {

private static void processRecord(final PhenoCube[] currentConcept, CSVRecord record) {
if (record.size() < 4) {
log.info("Record number {} had less records than we expected so we are skipping it.", record.getRecordNumber());
log.info("Record number {} had less records than we exgpected so we are skipping it.", record.getRecordNumber());
return;
}

String conceptPath = getSanitizedConceptPath(record);
String numericValue = record.get(NUMERIC_VALUE);
String conceptPath = CSVParserUtil.parseConceptPath(record, DO_VARNAME_ROLLUP);
String numericValue = record.get(CSVParserUtil.NUMERIC_VALUE);
boolean isAlpha = (numericValue == null || numericValue.isEmpty());
String value = isAlpha ? record.get(TEXT_VALUE) : numericValue;
String value = isAlpha ? record.get(CSVParserUtil.TEXT_VALUE) : numericValue;
currentConcept[0] = getPhenoCube(currentConcept[0], conceptPath, isAlpha);

if (value != null && !value.trim().isEmpty() &&
((isAlpha && currentConcept[0].vType == String.class) || (!isAlpha && currentConcept[0].vType == Double.class))) {
value = value.trim();
currentConcept[0].setColumnWidth(isAlpha ? Math.max(currentConcept[0].getColumnWidth(), value.getBytes().length) : Double.BYTES);
int patientId = Integer.parseInt(record.get(PATIENT_NUM));
int patientId = Integer.parseInt(record.get(CSVParserUtil.PATIENT_NUM));
Date date = null;
if (record.size() > 4 && record.get(DATETIME) != null && !record.get(DATETIME).isEmpty()) {
date = new Date(Long.parseLong(record.get(DATETIME)));
if (record.size() > 4 && record.get(CSVParserUtil.DATETIME) != null && !record.get(CSVParserUtil.DATETIME).isEmpty()) {
date = new Date(Long.parseLong(record.get(CSVParserUtil.DATETIME)));
}
currentConcept[0].add(patientId, isAlpha ? value : Double.parseDouble(value), date);
store.allIds.add(patientId);
@@ -91,28 +82,4 @@ private static PhenoCube getPhenoCube(PhenoCube currentConcept, String conceptPa

return currentConcept;
}

private static String getSanitizedConceptPath(CSVRecord record) {
String conceptPathFromRow = record.get(CONCEPT_PATH);
String[] segments = conceptPathFromRow.split("\\\\");
for (int x = 0; x < segments.length; x++) {
segments[x] = segments[x].trim();
}
conceptPathFromRow = String.join("\\", segments) + "\\";
conceptPathFromRow = conceptPathFromRow.replaceAll("\\ufffd", "");
String textValueFromRow = record.get(TEXT_VALUE) == null ? null : record.get(TEXT_VALUE).trim();
if (textValueFromRow != null) {
textValueFromRow = textValueFromRow.replaceAll("\\ufffd", "");
}
String conceptPath;

if (DO_VARNAME_ROLLUP) {
conceptPath = conceptPathFromRow.endsWith("\\" + textValueFromRow + "\\") ? conceptPathFromRow.replaceAll("\\\\[^\\\\]*\\\\$", "\\\\") : conceptPathFromRow;
} else {
conceptPath = conceptPathFromRow;
}
return conceptPath;
}


}
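The getSanitizedConceptPath logic removed above is not gone: the same trimming, \ufffd-stripping, and rollup regex reappear in the new shared CSVParserUtil below, where other loaders can presumably reuse them.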
CSVParserUtil.java
@@ -0,0 +1,55 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv;

import org.apache.commons.csv.CSVRecord;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.Date;
import java.util.stream.Collectors;

public class CSVParserUtil {
public static final int PATIENT_NUM = 0;
public static final int CONCEPT_PATH = 1;
public static final int NUMERIC_VALUE = 2;
public static final int TEXT_VALUE = 3;
public static final int DATETIME = 4;

public static String parseConceptPath(CSVRecord record, boolean doVarNameRollup) {
String conceptPathFromRow = record.get(CONCEPT_PATH);
conceptPathFromRow = Arrays.stream(conceptPathFromRow.split("\\\\"))
.map(String::trim)
.collect(Collectors.joining("\\")) + "\\";
conceptPathFromRow = stripWeirdUnicodeChars(conceptPathFromRow);

// \\ufffd = �
String textValueFromRow = stripWeirdUnicodeChars(trim(record.get(TEXT_VALUE)));
if (doVarNameRollup && conceptPathFromRow.endsWith("\\" + textValueFromRow + "\\")) {
// This regex deletes the last node from the concept path, i.e. "rolling it up"
return conceptPathFromRow.replaceAll("\\\\[^\\\\]*\\\\$", "\\\\");
} else {
return conceptPathFromRow;
}
}

private static String stripWeirdUnicodeChars(@Nonnull String raw) {
return raw.replaceAll("\\ufffd", "");
}

public static String trim(@Nullable String maybeString) {
return maybeString == null ? "" : maybeString.trim();
}

public static @Nullable Date parseDate(CSVRecord record) {
Date date = null;
try {
if (record.size() > 4 && record.get(DATETIME) != null && !record.get(DATETIME).isEmpty()) {
date = new Date(Long.parseLong(record.get(DATETIME)));
}
} catch (NumberFormatException e) {
return null;
}

return date;
}
}
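A minimal sketch of how this helper behaves, assuming CSVParserUtil is on the classpath and using commons-csv (already imported by these loaders) to build the record; the row values are hypothetical:

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

import java.io.IOException;
import java.io.StringReader;

// Hypothetical driver for the new helper. The row layout matches the loaders'
// column order: PATIENT_NUM, CONCEPT_PATH, NUMERIC_VALUE, TEXT_VALUE, DATETIME.
public class CSVParserUtilDemo {
    public static void main(String[] args) throws IOException {
        String row = "42,\\demographics\\ sex \\Male\\,,Male,1739404800000";
        CSVRecord record = CSVFormat.DEFAULT.parse(new StringReader(row)).getRecords().get(0);

        // Segments are trimmed and re-joined: prints \demographics\sex\Male\
        System.out.println(CSVParserUtil.parseConceptPath(record, false));

        // With rollup, the trailing node equal to TEXT_VALUE is dropped: prints \demographics\sex\
        System.out.println(CSVParserUtil.parseConceptPath(record, true));

        // parseDate catches NumberFormatException and returns null rather than
        // failing the row; here the epoch-millis value parses cleanly.
        System.out.println(CSVParserUtil.parseDate(record));
    }
}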
IngestStatus.java
@@ -0,0 +1,6 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.litecsv;

import java.nio.file.Path;

public record IngestStatus(Path file, long lineCount, int conceptCount, long duration) {
}
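The litecsv loader itself is not expanded in this view, but here is a hedged sketch of how a per-file IngestStatus might be created and reported; the path, counts, and millisecond units are all assumptions:

import java.nio.file.Path;

// Hypothetical consumer of the new IngestStatus record; the accessors are the
// auto-generated record methods (file(), lineCount(), conceptCount(), duration()).
public class IngestStatusDemo {
    public static void main(String[] args) {
        IngestStatus status = new IngestStatus(
                Path.of("/opt/local/hpds/example.csv"), // assumed input location
                1_000_000L, // lines read from the CSV
                250,        // distinct concept paths encountered
                92_000L     // elapsed time, assumed to be millis
        );
        System.out.printf("Ingested %s: %d lines, %d concepts in %d ms%n",
                status.file(), status.lineCount(), status.conceptCount(), status.duration());
    }
}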