Skip to content

Commit

Permalink
Merge branch 'master' into revertFileSelectionBehaviour
Browse files Browse the repository at this point in the history
# Conflicts:
#	csv-validator-ui/src/main/scala/uk/gov/nationalarchives/csv/validator/ui/CsvValidatorUi.scala
  • Loading branch information
techncl committed Feb 6, 2025
2 parents b1e9695 + c9da827 commit b6b62ec
Show file tree
Hide file tree
Showing 28 changed files with 867 additions and 254 deletions.
66 changes: 47 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,35 +41,63 @@ If you wish to use the CSV Validator from your own Java project, we provide a na
<dependency>
<groupId>uk.gov.nationalarchives</groupId>
<artifactId>csv-validator-java-api</artifactId>
<version>1.3.0</version>
<version>1.4.0</version>
</dependency>
```

The Javadoc can be found on Maven Central, or you can build it locally by executing `mvn javadoc:javadoc`.

Example Java code for using the CSV Validator through the Java API:
```java
Boolean failFast = false;
List<Substitution> pathSubstitutions = new ArrayList<Substitution>();

List<FailMessage> messages = CsvValidator.validate(
"/data/csv/data.csv",
"/data/csv/data-schema.csvs",
failFast,
pathSubstitutions,
true,
false);
// Optional settings; each one is shown with the default that applies when it is not set.
Charset csvEncoding = Charset.forName("UTF-8"); // default is UTF-8
boolean validateCsvEncoding = true;
Charset csvSchemaEncoding = Charset.forName("UTF-8"); // default is UTF-8
boolean failFast = true; // default is false
List<Substitution> pathSubstitutions = new ArrayList<Substitution>(); // default is an empty ArrayList
boolean enforceCaseSensitivePathChecks = true; // default is false
boolean trace = false; // default is false
ProgressCallback progress = null; // default is null
boolean skipFileChecks = true; // default is false
int maxCharsPerCell = 8096; // default is 4096

// add a substitution path
pathSubstitutions.add(new Substitution("file://something", "/home/xxx"));

// create a builder from the paths of the CSV file and the CSV Schema file
CsvValidator.ValidatorBuilder validateWithStringNames = new CsvValidator.ValidatorBuilder(
    "/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data.csv",
    "/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data-schema.csvs"
);

// alternatively, you can pass in Readers for each file
// (java.io.Reader is abstract, so a concrete Reader has to be opened for each file)
Reader csvReader = Files.newBufferedReader(
    Paths.get("/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data.csv"), csvEncoding);
Reader csvSchemaReader = Files.newBufferedReader(
    Paths.get("/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data-schema.csvs"), csvSchemaEncoding);
CsvValidator.ValidatorBuilder validateWithReaders = new CsvValidator.ValidatorBuilder(
    csvReader, csvSchemaReader
);

// every using*() call is optional; runValidation() returns the list of failure messages
List<FailMessage> messages = validateWithStringNames
    .usingCsvEncoding(csvEncoding, validateCsvEncoding) // validateCsvEncoding should only be `true` if using UTF-8 encoding, otherwise it will throw an exception
    .usingCsvSchemaEncoding(csvSchemaEncoding)
    .usingFailFast(failFast)
    .usingPathSubstitutions(pathSubstitutions)
    .usingEnforceCaseSensitivePathChecks(enforceCaseSensitivePathChecks)
    .usingTrace(trace)
    .usingProgress(progress)
    .usingSkipFileChecks(skipFileChecks)
    .usingMaxCharsPerCell(maxCharsPerCell)
    .runValidation();

if(messages.isEmpty()) {
System.out.println("Completed validation OK");
System.out.println("All worked OK");
} else {
for(FailMessage message : messages) {
if(message instanceof WarningMessage) {
System.out.println("[WARN] " + message.getMessage());
} else {
System.out.println("[ERROR] " + message.getMessage());
}
}
for(FailMessage message : messages) {
if(message instanceof WarningMessage) {
System.out.println("Warning: " + message.getMessage());
} else {
System.out.println("Error: " + message.getMessage());
}
}
}
}
```

Expand Down
4 changes: 3 additions & 1 deletion csv-validator-cmd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Designed and tested on both Windows and Linux/Unix/Mac platforms.
Basic Usage
-----------

The following command will show the useage for the application:
The following command will show the usage for the application:

```bash
$ validate
Expand All @@ -32,6 +32,8 @@ Usage: validate [options] <csv-path> <csv-schema-path>
The path to the CSV Schema file to use for validation
--disable-utf8-validation
Disable UTF-8 validation for CSV files
--max-chars-per-cell
Maximum number of chars allowed in a cell (is set to 4096 by default)
--skip-file-checks
Skip integrity, checksum and file existence checks
--show-progress
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ object CsvValidatorCmdApp extends App {
csvSchemaPath: Path = Paths.get("."),
csvSchemaEncoding: Charset = CsvValidator.DEFAULT_ENCODING,
disableUtf8Validation:Boolean = false,
maxCharsPerCell: Int = 4096,
progressCallback: Option[ProgressCallback] = None,
skipFileChecks: Boolean = false)

Expand All @@ -70,6 +71,7 @@ object CsvValidatorCmdApp extends App {
opt[Charset]('x', "csv-encoding").optional().action { (x,c) => c.copy(csvEncoding = x) } text("Defines the charset encoding used in the CSV file")
opt[Charset]('y', "csv-schema-encoding").optional().action { (x,c) => c.copy(csvSchemaEncoding = x) }.text("Defines the charset encoding used in the CSV Schema file")
opt[Unit]("disable-utf8-validation").optional().action {(_, c) => c.copy(disableUtf8Validation = true)}.text("Disable UTF-8 validation for CSV files.")
opt[Int]("max-chars-per-cell").optional().action {(x, c) => c.copy(maxCharsPerCell = x)}.text("Maximum number of chars allowed in a cell (is set to 4096 by default)")
// --skip-file-checks switches on Config.skipFileChecks; progress reporting is controlled only by --show-progress.
opt[Unit]("skip-file-checks").optional().action {(_, c) => c.copy(skipFileChecks = true)}.text("Skip integrity, checksum and file existence checks")
opt[Unit]("show-progress").optional().action {(_, c) => c.copy(progressCallback = Some(commandLineProgressCallback()))}.text("Show progress")
arg[Path]("<csv-path>").validate { x => if(Files.exists(x) && Files.isReadable(x)) success else failure(s"Cannot access CSV file: ${x.toString}") }.action { (x,c) => c.copy(csvPath = x) }.text("The path to the CSV file to validate")
Expand All @@ -86,6 +88,7 @@ object CsvValidatorCmdApp extends App {
config.substitutePaths,
config.caseSensitivePaths,
config.traceParser,
config.maxCharsPerCell,
config.progressCallback,
config.skipFileChecks
)
Expand Down Expand Up @@ -143,11 +146,11 @@ object CsvValidatorCmdApp extends App {
case _ =>
}

def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String): List[String] = Try {
val validator = createValidator(true, Nil, false, false, false)
def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String, maxCharsPerCell: Int): List[String] = Try {
val validator = createValidator(true, Nil, false, false, false, maxCharsPerCell)
val csv = validator.loadCsvFile(csvFile, csvSchemaFile)
csv.headOption.map(_.indexOf("identifier")).map { identifierIdx =>
csv.tail.map(arr => arr(identifierIdx))
csv.headOption.map(_.indexOf(columnName)).map { identifierIdx =>
csv.tail.map(row => row(identifierIdx))
}.getOrElse(Nil)
}.getOrElse(Nil)

Expand All @@ -159,11 +162,12 @@ object CsvValidatorCmdApp extends App {
pathSubstitutionsList: List[SubstitutePath],
enforceCaseSensitivePathChecks: Boolean,
trace: Boolean,
maxCharsPerCell: Int,
progress: Option[ProgressCallback],
skipFileChecks: Boolean,
onRow: ValidatedNel[FailMessage, Any] => Unit = rowCallback
): ExitStatus = {
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks)
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks, maxCharsPerCell)
validator.parseSchema(schemaFile) match {
case Validated.Invalid(errors) => (prettyPrint(errors), SystemExitCodes.InvalidSchema)
case Validated.Valid(schema) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ package uk.gov.nationalarchives.csv.validator

import cats.data.{Chain, Validated, ValidatedNel}
import cats.syntax.all._
import com.univocity.parsers.common.TextParsingException
import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}
import org.apache.commons.io.input.BOMInputStream
import uk.gov.nationalarchives.csv.validator.api.TextFile
Expand All @@ -23,7 +24,7 @@ import java.nio.charset.{Charset, StandardCharsets}
import java.nio.file.{Files, Path}
import scala.annotation.tailrec
import scala.language.{postfixOps, reflectiveCalls}
import scala.util.{Try, Using}
import scala.util.{Failure, Success, Try, Using}

//error reporting classes
sealed trait ErrorType
Expand Down Expand Up @@ -61,12 +62,14 @@ trait MetaDataValidator {
def validate(
csv: JReader,
schema: Schema,
maxCharsPerCell: Int = 4096,
progress: Option[ProgressCallback]
): MetaDataValidation[Any] = {
var results: Chain[List[FailMessage]] = Chain.empty
validateReader(
csv,
schema,
maxCharsPerCell,
progress,
{
case Validated.Invalid(x) => results = results :+ x.toList
Expand All @@ -82,6 +85,7 @@ trait MetaDataValidator {
def validateReader(
csv: JReader,
schema: Schema,
maxCharsPerCell: Int,
progress: Option[ProgressCallback],
rowCallback: MetaDataValidation[Any] => Unit
): Boolean = {
Expand All @@ -101,10 +105,10 @@ trait MetaDataValidator {
None
}

validateKnownRows(csv, schema, pf, rowCallback)
validateKnownRows(csv, schema, maxCharsPerCell, pf, rowCallback)
}

def createCsvParser(schema: Schema): CsvParser = {
def createCsvParser(schema: Schema, maxCharsPerCell: Int): CsvParser = {
val separator: Char = schema.globalDirectives.collectFirst {
case Separator(sep) =>
sep
Expand All @@ -124,6 +128,7 @@ trait MetaDataValidator {
settings.setIgnoreLeadingWhitespaces(false)
settings.setIgnoreTrailingWhitespaces(false)
settings.setLineSeparatorDetectionEnabled(true)
settings.setMaxCharsPerColumn(maxCharsPerCell)
// TODO(AR) should we be friendly and auto-detect line separator, or enforce RFC 4180?
format.setQuoteEscape(CSV_RFC1480_QUOTE_ESCAPE_CHARACTER)
//format.setLineSeparator(CSV_RFC1480_LINE_SEPARATOR) // CRLF
Expand All @@ -136,11 +141,12 @@ trait MetaDataValidator {
def validateKnownRows(
csv: JReader,
schema: Schema,
maxCharsPerCell: Int,
progress: Option[ProgressFor],
rowCallback: MetaDataValidation[Any] => Unit
): Boolean = {

val parser = createCsvParser(schema)
val parser = createCsvParser(schema, maxCharsPerCell)

val result : Try[Boolean] = Using {
parser.beginParsing(csv)
Expand All @@ -153,7 +159,7 @@ trait MetaDataValidator {
// if 'no header' is not set and 'permit empty' is not set but the file contains only one line - this is an error


val rowIt = new RowIterator(reader, progress)
val rowIt = new RowIterator(reader, progress, maxCharsPerCell)

val maybeNoData =
if (schema.globalDirectives.contains(NoHeader())) {
Expand Down Expand Up @@ -345,24 +351,34 @@ trait ProgressCallback {
def update(total: Int, processed: Int): Unit = update((processed.toFloat / total.toFloat) * 100)
}

class RowIterator(parser: CsvParser, progress: Option[ProgressFor]) extends Iterator[Row] {
class RowIterator(parser: CsvParser, progress: Option[ProgressFor], maxCharsPerCell: Int) extends Iterator[Row] {

private var index = 1
private var current = toRow(Option(parser.parseNext()))
private var current = toRow(Try(parser.parseNext()))
private var potentialHeaderRow: Option[Row] = None

@throws(classOf[IOException])
override def next(): Row = {
val row = current match {
case Some(row) =>
case Success(row) =>
if(index == 1 && potentialHeaderRow.isEmpty) potentialHeaderRow = Some(row) // this is here in case the old API is used that doesn't call 'skipHeader'
row
case None => {
throw new IOException("End of file")
}
case Failure(ex: TextParsingException) if(ex.toString.contains("exceeds the maximum number of characters")) =>
val cellLocationMsg =
potentialHeaderRow match {
case Some(headerRow) => s"in the cell located at line: ${ex.getLineIndex}, column: ${headerRow.cells(ex.getColumnIndex).value},"
case None => s"in column ${ex.getColumnIndex + 1} of the header row"
}

val customMessage =
s"The number of characters $cellLocationMsg is larger than the maximum number of characters allowed in a cell ($maxCharsPerCell); increase this limit and re-run."
throw new Exception(customMessage)
case Failure(ex) => throw ex
}

//move to the next
this.index = index + 1
this.current = toRow(Option(parser.parseNext()))
this.current = toRow(Try(parser.parseNext()))

progress map {
p =>
Expand All @@ -377,10 +393,15 @@ class RowIterator(parser: CsvParser, progress: Option[ProgressFor]) extends Iter
@throws(classOf[IOException])
def skipHeader(): Row = {
this.index = index - 1
next()
val row = next()
this.potentialHeaderRow = Some(row)
row
}

override def hasNext: Boolean = current.nonEmpty
/**
 * Reports whether another call to `next()` should be attempted.
 *
 * Returns `false` only when `current` is a `Failure` wrapping a `NullPointerException`;
 * a successfully parsed row, or any other failure, yields `true`.
 *
 * NOTE(review): presumably the `NullPointerException` comes from `toRow` mapping over a
 * `null` returned by `parser.parseNext()` at end of input - confirm.
 */
override def hasNext: Boolean = current match {
  case Failure(_: NullPointerException) => false
  case Success(_) | Failure(_) => true
}

private def toRow(rowData: Option[Array[String]]): Option[Row] = rowData.map(data => Row(data.toList.map(d => Cell(Option(d).getOrElse(""))), index))
private def toRow(rowData: Try[Array[String]]): Try[Row] = rowData.map(data => Row(data.toList.map(d => Cell(Option(d).getOrElse(""))), index))
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ object CsvValidator {
type PathTo = String
type SubstitutePath = (PathFrom, PathTo)

def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean) = {
def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean, maxCharsPerCellLimit: Int) = {
if(failFast) {
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch}
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch; val maxCharsPerCell = maxCharsPerCellLimit}
} else {
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch }
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch; val maxCharsPerCell = maxCharsPerCellLimit }
}
}
}
Expand Down Expand Up @@ -81,8 +81,8 @@ trait CsvValidator extends SchemaParser {
parseSchema(csvSchemaFile) match {
case Validated.Valid(schema) =>
withReader(csvFile) { reader =>
createCsvParser(schema).parseAll(reader)
}.asScala.toList
createCsvParser(schema, this.maxCharsPerCell).parseAll(reader)
}.asScala.toList
case Validated.Invalid(_) => Nil
}
}
Expand All @@ -100,7 +100,7 @@ trait CsvValidator extends SchemaParser {
val csvValidation = withReader(csvFile) {
reader =>
val totalRows = countRows(csvFile, csvSchema)
validateKnownRows(reader, csvSchema, progress.map(p => {ProgressFor(totalRows, p)} ), rowCallback)
validateKnownRows(reader, csvSchema, this.maxCharsPerCell, progress.map(p => {ProgressFor(totalRows, p)} ), rowCallback)
}
encodingValidationNel.isValid && csvValidation
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ with TraceableParsers {

val skipFileChecks: Boolean

val maxCharsPerCell: Int

lazy val versionHeader: PackratParser[String] = "VersionDecl" ::= ("version" ~> versionLiteral )


Expand Down Expand Up @@ -147,6 +149,7 @@ with TraceableParsers {
val ps = pathSubstitutions
val t = trace
val sfc = skipFileChecks
val mcpc = maxCharsPerCell

SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
version match {
Expand All @@ -155,6 +158,7 @@ with TraceableParsers {
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
override val maxCharsPerCell: Int = mcpc
}

parser1_2.parseVersionAware(reader) match {
Expand All @@ -168,6 +172,7 @@ with TraceableParsers {
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
override val maxCharsPerCell: Int = mcpc
}

parser1_1.parseVersionAware(reader) match {
Expand All @@ -181,6 +186,7 @@ with TraceableParsers {
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
override val maxCharsPerCell: Int = mcpc
}

parser1_0.parseVersionAware(reader) match {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
col1,col2
row1Col1,row1Col2LongCellLength
Loading

0 comments on commit b6b62ec

Please sign in to comment.