-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ALS-7113] Enhance sequential loader
- Loading branch information
Luke Sikina
committed
Feb 13, 2025
1 parent
d6f1ec4
commit f570131
Showing
24 changed files
with
901 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
17 changes: 0 additions & 17 deletions
17
etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/util/RemapPatientIds.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...lach/hpds/etl/phenotype/LoadingStore.java → .../dbmi/avillach/hpds/etl/LoadingStore.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 2 additions & 1 deletion
3
...villach/hpds/etl/phenotype/CSVLoader.java → ...ach/hpds/etl/phenotype/csv/CSVLoader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/csv/CSVParserUtil.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.csv; | ||
|
||
import org.apache.commons.csv.CSVRecord; | ||
|
||
import javax.annotation.Nonnull; | ||
import javax.annotation.Nullable; | ||
import java.util.Arrays; | ||
import java.util.Date; | ||
import java.util.stream.Collectors; | ||
|
||
public class CSVParserUtil { | ||
public static final int PATIENT_NUM = 0; | ||
public static final int CONCEPT_PATH = 1; | ||
public static final int NUMERIC_VALUE = 2; | ||
public static final int TEXT_VALUE = 3; | ||
public static final int DATETIME = 4; | ||
|
||
public static String parseConceptPath(CSVRecord record, boolean doVarNameRollup) { | ||
String conceptPathFromRow = record.get(CONCEPT_PATH); | ||
conceptPathFromRow = Arrays.stream(conceptPathFromRow.split("\\\\")) | ||
.map(String::trim) | ||
.collect(Collectors.joining("\\")) + "\\"; | ||
conceptPathFromRow = stripWeirdUnicodeChars(conceptPathFromRow); | ||
|
||
// \\ufffd = � | ||
String textValueFromRow = stripWeirdUnicodeChars(trim(record.get(TEXT_VALUE))); | ||
if (doVarNameRollup && conceptPathFromRow.endsWith("\\" + textValueFromRow + "\\")) { | ||
// This regex deletes the last node from the concept path, i.e. "rolling it up" | ||
return conceptPathFromRow.replaceAll("\\\\[^\\\\]*\\\\$", "\\\\"); | ||
} else { | ||
return conceptPathFromRow; | ||
} | ||
} | ||
|
||
private static String stripWeirdUnicodeChars(@Nonnull String raw) { | ||
return raw.replaceAll("\\ufffd", ""); | ||
} | ||
|
||
public static String trim(@Nullable String maybeString) { | ||
return maybeString == null ? "" : maybeString.trim(); | ||
} | ||
|
||
public static @Nullable Date parseDate(CSVRecord record) { | ||
Date date = null; | ||
try { | ||
if (record.size() > 4 && record.get(DATETIME) != null && !record.get(DATETIME).isEmpty()) { | ||
date = new Date(Long.parseLong(record.get(DATETIME))); | ||
} | ||
} catch (NumberFormatException e) { | ||
return null; | ||
} | ||
|
||
return date; | ||
} | ||
} |
6 changes: 6 additions & 0 deletions
6
etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/litecsv/IngestStatus.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.litecsv; | ||
|
||
import java.nio.file.Path; | ||
|
||
/**
 * Immutable value carrying the figures recorded for one file.
 * NOTE(review): presumably the outcome of ingesting that file; confirm against the code that creates it.
 *
 * @param file the path this status refers to
 * @param lineCount the line count recorded for the file
 * @param conceptCount the concept count recorded for the file
 * @param duration the recorded duration; the time unit is not visible here (TODO confirm)
 */
public record IngestStatus(
    Path file,
    long lineCount,
    int conceptCount,
    long duration
) {
}
Oops, something went wrong.