Commit 40cb814
[ALS-7113] Enhance sequential loader
Luke Sikina committed on Feb 13, 2025
1 parent: d6f1ec4
Showing 11 changed files with 776 additions and 44 deletions.
etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/CSVParserUtil.java
55 additions & 0 deletions

@@ -0,0 +1,55 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;

import org.apache.commons.csv.CSVRecord;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.Date;
import java.util.stream.Collectors;

public class CSVParserUtil {
    public static final int PATIENT_NUM = 0;
    public static final int CONCEPT_PATH = 1;
    public static final int NUMERIC_VALUE = 2;
    public static final int TEXT_VALUE = 3;
    public static final int DATETIME = 4;
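    // Expected CSV column layout: patient_num, concept_path, numeric_value, text_value, datetime (epoch millis)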

    public static String parseConceptPath(CSVRecord record, boolean doVarNameRollup) {
        String conceptPathFromRow = record.get(CONCEPT_PATH);
        conceptPathFromRow = Arrays.stream(conceptPathFromRow.split("\\\\"))
            .map(String::trim)
            .collect(Collectors.joining("\\")) + "\\";
        conceptPathFromRow = stripWeirdUnicodeChars(conceptPathFromRow);

        // \ufffd renders as � (the Unicode replacement character); strip it from text values too
        String textValueFromRow = stripWeirdUnicodeChars(trim(record.get(TEXT_VALUE)));
        if (doVarNameRollup && conceptPathFromRow.endsWith("\\" + textValueFromRow + "\\")) {
            // This regex deletes the last node from the concept path, i.e. "rolling it up"
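            // e.g. "\exam\heart rate\" becomes "\exam\" when the text value is "heart rate"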
            return conceptPathFromRow.replaceAll("\\\\[^\\\\]*\\\\$", "\\\\");
        } else {
            return conceptPathFromRow;
        }
    }

    private static String stripWeirdUnicodeChars(@Nonnull String raw) {
        return raw.replaceAll("\\ufffd", "");
    }

    public static String trim(@Nullable String maybeString) {
        return maybeString == null ? "" : maybeString.trim();
    }

    public static @Nullable Date parseDate(CSVRecord record) {
        Date date = null;
        try {
            if (record.size() > 4 && record.get(DATETIME) != null && !record.get(DATETIME).isEmpty()) {
                date = new Date(Long.parseLong(record.get(DATETIME)));
            }
        } catch (NumberFormatException e) {
            return null;
        }

        return date;
    }
}
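For illustration, a minimal sketch of how these helpers behave on a single row. The input line and the sketch class are hypothetical, and CSVFormat.DEFAULT leaves backslashes as literal characters:

package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

import java.io.IOException;
import java.io.StringReader;
import java.util.Date;

public class CSVParserUtilSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical row: patient 42, a concept path with stray spaces, numeric value 60,
        // an empty text value, and a datetime in epoch milliseconds
        String row = "42,\\exam\\ heart rate \\,60,,1739404800000";
        CSVRecord record = CSVFormat.DEFAULT.parse(new StringReader(row)).getRecords().get(0);

        // Path segments are trimmed and re-joined: prints "\exam\heart rate\"
        System.out.println(CSVParserUtil.parseConceptPath(record, false));

        // The fifth column parses as epoch millis: Feb 13, 2025 00:00 UTC
        Date date = CSVParserUtil.parseDate(record);
        System.out.println(date);
    }
}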
etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/seqcsv/CSVProcessor.java
151 additions & 0 deletions

@@ -0,0 +1,151 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.seqcsv;

import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube;
import edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.CSVParserUtil;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.logging.log4j.util.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.*;
import java.nio.file.Files;
import java.util.*;
import java.util.stream.Stream;

public class CSVProcessor {

    private static final Logger log = LoggerFactory.getLogger(CSVProcessor.class);
    private final LowRAMLoadingStore store;
    private final boolean doVarNameRollup;
    private final double maxChunkSizeGigs;

    public CSVProcessor(LowRAMLoadingStore store, boolean doVarNameRollup, double maxChunkSizeGigs) {
        this.store = store;
        this.doVarNameRollup = doVarNameRollup;
        this.maxChunkSizeGigs = maxChunkSizeGigs;
    }

    public IngestStatus process(File csv) {
        long startTime = System.nanoTime();
        log.info("Attempting to ingest file {}", csv.getAbsolutePath());
        try (Reader r = new FileReader(csv); Stream<String> rawLines = Files.lines(csv.toPath())) {
            CSVParser parser = CSVFormat.DEFAULT
                .withSkipHeaderRecord()
                .withFirstRecordAsHeader()
                .parse(new BufferedReader(r));

            // We want to read the file in reasonably sized chunks so that we can handle each chunk
            // naively in memory without going OOM. To do this, we assume that over the course of
            // thousands of lines, each line is more or less the same length, so we provision
            // n chunks, where n = max(1, ceil(file_size / maxChunkSizeGigs))
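            // e.g. with maxChunkSizeGigs = 5, a 12 GiB file is split into ceil(12 / 5) = 3 chunks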
log.info("Gathering stats about file {}", csv.getName()); | ||
int chunks = Math.max(1, (int)Math.ceil((double) csv.length() / (maxChunkSizeGigs*1024*1024*1024))); | ||
final long totalLineCount = rawLines.count(); | ||
final long linesPerChunk = totalLineCount / chunks; | ||
log.info( | ||
"File {} is {} bytes and {} lines. Dividing into {} chunks of {} lines each", | ||
csv.getName(), csv.length(), totalLineCount, chunks, linesPerChunk | ||
); | ||
long chunkLineCount = 0; | ||
long lineCount = 0; | ||
int chunkCount = 0; | ||
Set<String> concepts = new HashSet<>(); | ||
List<CSVRecord> lines = new ArrayList<>(); | ||
|
||
log.info("Creating chunks"); | ||
for (CSVRecord record : parser) { | ||
chunkLineCount++; | ||
lineCount++; | ||
lines.add(record); | ||
if (chunkLineCount > linesPerChunk || lineCount + 1 == totalLineCount ) { | ||
log.info("Finished creating chunk {}", chunkCount); | ||
// sort by concept to prevent cache thrashing when ingesting | ||
// loading each concept in its entirety for a chunk will minimize disk IO and | ||
// let us keep more valuable things in RAM | ||
lines.sort(Comparator.comparing(a -> a.get(1))); | ||
log.info("Finished sorting chunk {}", chunkCount); | ||
Set<String> chunkConcepts = ingest(lines); | ||
concepts.addAll(chunkConcepts); | ||
log.info("Finished ingesting chunk {} with {} unique concepts", chunkCount, chunkConcepts.size()); | ||
lines = new ArrayList<>(); | ||
chunkLineCount = 0; | ||
chunkCount++; | ||
} | ||
} | ||
|
||
return new IngestStatus( | ||
csv.toPath(), | ||
totalLineCount, | ||
concepts.size(), | ||
System.nanoTime() - startTime | ||
); | ||
|
||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
|
||
private Set<String> ingest(List<CSVRecord> sortedRecords) { | ||
Set<String> concepts = new HashSet<>(); | ||
for (CSVRecord record : sortedRecords) { | ||
if (record.size() < 4) { | ||
log.info("Record #{} has too few columns, skipping.", record.getRecordNumber()); | ||
continue; | ||
} | ||
|
||
String conceptPath = CSVParserUtil.parseConceptPath(record, doVarNameRollup); | ||
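            // Dispatch on the numeric column: populated numeric values go to ingestNumeric, everything else to ingestNonNumeric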
            IngestFn ingestFn = Strings.isEmpty(record.get(CSVParserUtil.NUMERIC_VALUE)) ? this::ingestNonNumeric : this::ingestNumeric;
            Date date = CSVParserUtil.parseDate(record);
            int patientId = Integer.parseInt(record.get(CSVParserUtil.PATIENT_NUM));
            if (ingestFn.attemptIngest(record, conceptPath, patientId, date)) {
                concepts.add(conceptPath);
            } else {
                log.warn("Could not ingest record #{}", record.getRecordNumber());
            }
        }
        return concepts;
    }

    @FunctionalInterface
    private interface IngestFn {
        boolean attemptIngest(CSVRecord record, String path, int patientId, @Nullable Date date);
    }

    private boolean ingestNumeric(CSVRecord record, String conceptPath, int patientId, Date date) {
        PhenoCube<Double> concept = store.loadingCache.getIfPresent(conceptPath);
        if (concept == null) {
            concept = new PhenoCube<>(conceptPath, Double.class);
            concept.setColumnWidth(Double.BYTES);
            store.loadingCache.put(conceptPath, concept);
        }
        try {
            String rawNumericValue = CSVParserUtil.trim(record.get(CSVParserUtil.NUMERIC_VALUE));
            double parsedValue = Double.parseDouble(rawNumericValue);
            concept.add(patientId, parsedValue, date);
            return true;
        } catch (NumberFormatException e) {
            log.warn("Could not parse numeric value in line {}", record);
        }
        return false;
    }

    private boolean ingestNonNumeric(CSVRecord record, String conceptPath, int patientId, Date date) {
        PhenoCube<String> concept = store.loadingCache.getIfPresent(conceptPath);
        if (concept == null) {
            concept = new PhenoCube<>(conceptPath, String.class);
            store.loadingCache.put(conceptPath, concept);
        }
        String rawTextValue = CSVParserUtil.trim(record.get(CSVParserUtil.TEXT_VALUE));
        if (rawTextValue.isEmpty()) {
            return false;
        }
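        // widen the column to fit the longest text value seen so far for this concept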
        concept.setColumnWidth(Math.max(rawTextValue.getBytes().length, concept.getColumnWidth()));
        concept.add(patientId, rawTextValue, date);
        return true;
    }
}
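For context, a minimal wiring sketch. The no-argument LowRAMLoadingStore constructor and the input path are assumptions for illustration; only the CSVProcessor constructor and process(File) signatures appear in this diff.

package edu.harvard.hms.dbmi.avillach.hpds.etl.phenotype.seqcsv;

import java.io.File;

public class SequentialLoaderSketch {
    public static void main(String[] args) {
        // Hypothetical store construction; the real setup lives elsewhere in this commit
        LowRAMLoadingStore store = new LowRAMLoadingStore();

        // Roll up variable names, and target chunks of roughly 5 GiB of CSV each
        CSVProcessor processor = new CSVProcessor(store, true, 5.0);

        // Hypothetical input path; the returned status carries line, concept, and timing stats
        IngestStatus status = processor.process(new File("/opt/data/allConcepts.csv"));
        System.out.println(status);
    }
}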