Merge branch 'master' into revertFileSelectionBehaviour

# Conflicts: # csv-validator-ui/src/main/scala/uk/gov/nationalarchives/csv/validator/ui/CsvValidatorUi.scala
digital-preservation · Feb 6, 2025 · b6b62ec · b6b62ec
2 parents b1e9695 + c9da827
commit b6b62ec
Show file tree

Hide file tree

Showing 28 changed files with 867 additions and 254 deletions.
diff --git a/README.md b/README.md
@@ -41,35 +41,63 @@ If you wish to use the CSV Validator from your own Java project, we provide a na
 <dependency>
 	<groupId>uk.gov.nationalarchives</groupId>
     <artifactId>csv-validator-java-api</artifactId>
-    <version>1.3.0</version>
+    <version>1.4.0</version>
 </dependency>
 ```
 
 The Javadoc can be found in Maven Central, or you can build it locally by executing `mvn javadoc:javadoc`.
 
 Example Java code of using the CSV Validator through the Java API:
 ```java
-Boolean failFast = false;
-List<Substitution> pathSubstitutions = new ArrayList<Substitution>();
-
-List<FailMessage> messages = CsvValidator.validate(
-"/data/csv/data.csv",
-"/data/csv/data-schema.csvs",
-failFast,
-pathSubstitutions,
-true,
-false);
+ Charset csvEncoding = JCharset.forName("UTF-8"); // default is UTF-8
+ boolean validateCsvEncoding = true;
+ Charset csvSchemaEncoding = JCharset.forName("UTF-8"); // default is UTF-8
+ boolean failFast = true; // default is false
+ List<Substitution> pathSubstitutions = new ArrayList<Substitution>(); // default is an empty ArrayList
+ boolean enforceCaseSensitivePathChecks = true; // default is false
+ boolean trace = false; // default is false
+ ProgressCallback progress; // default is null
+ boolean skipFileChecks = true; // default is false
+ int maxCharsPerCell = 8096; // default is 4096
+
+ // add a substitution path
+ pathSubstitutions.add(new Substitution("file://something", "/home/xxx"));
+
+ CsvValidator.ValidatorBuilder validateWithStringNames = new CsvValidator.ValidatorBuilder(
+     "/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data.csv",
+     "/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data-schema.csvs"
+ )
+
+ // alternatively, you can pass in Readers for each file
+ Reader csvReader = new Reader();
+ Reader csvSchemaReader = new Reader();
+ CsvValidator.ValidatorBuilder validateWithReaders = new CsvValidator.ValidatorBuilder(
+     csvReader, csvSchemaReader
+ )
+
+ List<FailMessage> messages = validateWithStringNames
+   .usingCsvEncoding(csvEncoding, validateCsvEncoding) // `validateCsvEncoding` should only be `true` when the encoding is UTF-8; otherwise an exception is thrown
+   .usingCsvSchemaEncoding(csvSchemaEncoding)
+   .usingFailFast(failFast)
+   .usingPathSubstitutions(pathSubstitutions)
+   .usingEnforceCaseSensitivePathChecks(enforceCaseSensitivePathChecks)
+   .usingTrace(trace)
+   .usingProgress(progress)
+   .usingSkipFileChecks(skipFileChecks)
+   .usingMaxCharsPerCell(maxCharsPerCell)
+   .runValidation();
 
  if(messages.isEmpty()) {
-	System.out.println("Completed validation OK");
+   System.out.println("All worked OK");
  } else {
- 	for(FailMessage message : messages) {
- 		if(message instanceof WarningMessage) {
- 			System.out.println("[WARN] " + message.getMessage());
- 		} else {
- 			System.out.println("[ERROR] " + message.getMessage());
- 		}
- 	}
+   for(FailMessage message : messages) {
+     if(message instanceof WarningMessage) {
+       System.out.println("Warning: " + message.getMessage());
+     } else {
+       System.out.println("Error: " + message.getMessage());
+     }
+   }
+ }
 }
 ```
 

diff --git a/csv-validator-cmd/README.md b/csv-validator-cmd/README.md
@@ -9,7 +9,7 @@ Designed and tested on both Windows and Linux/Unix/Mac platforms.
 Basic Usage
 -----------
 
-The following command will show the useage for the application:
+The following command will show the usage for the application:
 
 ```bash
 $ validate
@@ -32,6 +32,8 @@ Usage: validate [options] <csv-path> <csv-schema-path>
         The path to the CSV Schema file to use for validation
   --disable-utf8-validation 
          Disable UTF-8 validation for CSV files
+  --max-chars-per-cell
+        Maximum number of chars allowed in a cell (is set to 4096 by default)
   --skip-file-checks       
         Skip integrity, checksum and file existence checks
   --show-progress

diff --git a/...tor-cmd/src/main/scala/uk/gov/nationalarchives/csv/validator/cmd/CsvValidatorCmdApp.scala b/...tor-cmd/src/main/scala/uk/gov/nationalarchives/csv/validator/cmd/CsvValidatorCmdApp.scala
@@ -51,6 +51,7 @@ object CsvValidatorCmdApp extends App {
                     csvSchemaPath: Path = Paths.get("."),
                     csvSchemaEncoding: Charset = CsvValidator.DEFAULT_ENCODING,
                     disableUtf8Validation:Boolean = false,
+                    maxCharsPerCell: Int = 4096,
                     progressCallback: Option[ProgressCallback] = None,
                     skipFileChecks: Boolean = false)
 
@@ -70,6 +71,7 @@ object CsvValidatorCmdApp extends App {
         opt[Charset]('x', "csv-encoding").optional().action { (x,c) => c.copy(csvEncoding = x) } text("Defines the charset encoding used in the CSV file")
         opt[Charset]('y', "csv-schema-encoding").optional().action { (x,c) => c.copy(csvSchemaEncoding = x) }.text("Defines the charset encoding used in the CSV Schema file")
         opt[Unit]("disable-utf8-validation").optional().action {(_, c) => c.copy(disableUtf8Validation = true)}.text("Disable UTF-8 validation for CSV files.")
+        opt[Int]("max-chars-per-cell").optional().action {(x, c) =>  c.copy(maxCharsPerCell = x)}.text("Maximum number of chars allowed in a cell (is set to 4096 by default)")
         opt[Unit]("skip-file-checks").optional().action {(_, c) => c.copy(progressCallback = Some(commandLineProgressCallback()))}.text("Skip integrity, checksum and file existence checks")
         opt[Unit]("show-progress").optional().action {(_, c) => c.copy(progressCallback = Some(commandLineProgressCallback()))}.text("Show progress")
         arg[Path]("<csv-path>").validate { x => if(Files.exists(x) && Files.isReadable(x)) success else failure(s"Cannot access CSV file: ${x.toString}") }.action { (x,c) => c.copy(csvPath = x) }.text("The path to the CSV file to validate")
@@ -86,6 +88,7 @@ object CsvValidatorCmdApp extends App {
           config.substitutePaths,
           config.caseSensitivePaths,
           config.traceParser,
+          config.maxCharsPerCell,
           config.progressCallback,
           config.skipFileChecks
         )
@@ -143,11 +146,11 @@ object CsvValidatorCmdApp extends App {
     case _ =>
   }
 
-  def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String): List[String] = Try {
-    val validator = createValidator(true, Nil, false, false, false)
+  def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String, maxCharsPerCell: Int): List[String] = Try {
+    val validator = createValidator(true, Nil, false, false, false, maxCharsPerCell)
     val csv = validator.loadCsvFile(csvFile, csvSchemaFile)
-    csv.headOption.map(_.indexOf("identifier")).map { identifierIdx =>
-      csv.tail.map(arr => arr(identifierIdx))
+    csv.headOption.map(_.indexOf(columnName)).map { identifierIdx =>
+      csv.tail.map(row => row(identifierIdx))
     }.getOrElse(Nil)
   }.getOrElse(Nil)
 
@@ -159,11 +162,12 @@ object CsvValidatorCmdApp extends App {
     pathSubstitutionsList: List[SubstitutePath],
     enforceCaseSensitivePathChecks: Boolean,
     trace: Boolean,
+    maxCharsPerCell: Int,
     progress: Option[ProgressCallback],
     skipFileChecks: Boolean,
     onRow: ValidatedNel[FailMessage, Any] => Unit = rowCallback
   ): ExitStatus = {
-    val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks)
+    val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks, maxCharsPerCell)
     validator.parseSchema(schemaFile) match {
       case Validated.Invalid(errors) => (prettyPrint(errors), SystemExitCodes.InvalidSchema)
       case Validated.Valid(schema) =>

diff --git a/...lidator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala b/...lidator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala
@@ -11,6 +11,7 @@ package uk.gov.nationalarchives.csv.validator
 
 import cats.data.{Chain, Validated, ValidatedNel}
 import cats.syntax.all._
+import com.univocity.parsers.common.TextParsingException
 import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}
 import org.apache.commons.io.input.BOMInputStream
 import uk.gov.nationalarchives.csv.validator.api.TextFile
@@ -23,7 +24,7 @@ import java.nio.charset.{Charset, StandardCharsets}
 import java.nio.file.{Files, Path}
 import scala.annotation.tailrec
 import scala.language.{postfixOps, reflectiveCalls}
-import scala.util.{Try, Using}
+import scala.util.{Failure, Success, Try, Using}
 
 //error reporting classes
 sealed trait ErrorType
@@ -61,12 +62,14 @@ trait MetaDataValidator {
   def validate(
     csv: JReader,    
     schema: Schema,
+    maxCharsPerCell: Int = 4096,
     progress: Option[ProgressCallback]
   ): MetaDataValidation[Any] = {
     var results: Chain[List[FailMessage]] = Chain.empty
     validateReader(
       csv,
       schema,
+      maxCharsPerCell,
       progress,
       {
         case Validated.Invalid(x) => results = results :+ x.toList
@@ -82,6 +85,7 @@ trait MetaDataValidator {
   def validateReader(
     csv: JReader,
     schema: Schema,
+    maxCharsPerCell: Int,
     progress: Option[ProgressCallback],
     rowCallback: MetaDataValidation[Any] => Unit
   ): Boolean = {
@@ -101,10 +105,10 @@ trait MetaDataValidator {
       None
     }
 
-    validateKnownRows(csv, schema, pf, rowCallback)
+    validateKnownRows(csv, schema, maxCharsPerCell, pf, rowCallback)
   }
 
-  def createCsvParser(schema: Schema): CsvParser = {
+  def createCsvParser(schema: Schema, maxCharsPerCell: Int): CsvParser = {
     val separator: Char = schema.globalDirectives.collectFirst {
       case Separator(sep) =>
         sep
@@ -124,6 +128,7 @@ trait MetaDataValidator {
     settings.setIgnoreLeadingWhitespaces(false)
     settings.setIgnoreTrailingWhitespaces(false)
     settings.setLineSeparatorDetectionEnabled(true)
+    settings.setMaxCharsPerColumn(maxCharsPerCell)
     // TODO(AR) should we be friendly and auto-detect line separator, or enforce RFC 1480?
     format.setQuoteEscape(CSV_RFC1480_QUOTE_ESCAPE_CHARACTER)
     //format.setLineSeparator(CSV_RFC1480_LINE_SEPARATOR)  // CRLF
@@ -136,11 +141,12 @@ trait MetaDataValidator {
   def validateKnownRows(
     csv: JReader,
     schema: Schema,
+    maxCharsPerCell: Int,
     progress: Option[ProgressFor],
     rowCallback: MetaDataValidation[Any] => Unit
   ): Boolean = {
 
-    val parser = createCsvParser(schema)
+    val parser = createCsvParser(schema, maxCharsPerCell)
 
     val result : Try[Boolean] = Using {
       parser.beginParsing(csv)
@@ -153,7 +159,7 @@ trait MetaDataValidator {
         // if 'no header' is not set and 'permit empty' is not set but the file contains only one line - this is an error
 
 
-        val rowIt = new RowIterator(reader, progress)
+        val rowIt = new RowIterator(reader, progress, maxCharsPerCell)
 
         val maybeNoData =
           if (schema.globalDirectives.contains(NoHeader())) {
@@ -345,24 +351,34 @@ trait ProgressCallback {
   def update(total: Int, processed: Int): Unit = update((processed.toFloat / total.toFloat) * 100)
 }
 
-class RowIterator(parser: CsvParser, progress: Option[ProgressFor]) extends Iterator[Row] {
+class RowIterator(parser: CsvParser, progress: Option[ProgressFor], maxCharsPerCell: Int) extends Iterator[Row] {
 
   private var index = 1
-  private var current = toRow(Option(parser.parseNext()))
+  private var current = toRow(Try(parser.parseNext()))
+  private var potentialHeaderRow: Option[Row] = None
 
   @throws(classOf[IOException])
   override def next(): Row = {
     val row = current match {
-      case Some(row) =>
+      case Success(row) =>
+        if(index == 1 && potentialHeaderRow.isEmpty) potentialHeaderRow = Some(row) // this is here in case the old API is used that doesn't call 'skipHeader'
         row
-      case None => {
-        throw new IOException("End of file")
-      }
+      case Failure(ex: TextParsingException) if(ex.toString.contains("exceeds the maximum number of characters")) =>
+        val cellLocationMsg =
+          potentialHeaderRow match {
+            case Some(headerRow) => s"in the cell located at line: ${ex.getLineIndex}, column: ${headerRow.cells(ex.getColumnIndex).value},"
+            case None => s"in column ${ex.getColumnIndex + 1} of the header row"
+          }
+
+        val customMessage =
+          s"The number of characters $cellLocationMsg is larger than the maximum number of characters allowed in a cell ($maxCharsPerCell); increase this limit and re-run."
+        throw new Exception(customMessage)
+      case Failure(ex) => throw ex
     }
 
     //move to the next
     this.index = index + 1
-    this.current = toRow(Option(parser.parseNext()))
+    this.current = toRow(Try(parser.parseNext()))
 
     progress map {
       p =>
@@ -377,10 +393,15 @@ class RowIterator(parser: CsvParser, progress: Option[ProgressFor]) extends Iter
   @throws(classOf[IOException])
   def skipHeader(): Row = {
     this.index = index - 1
-    next()
+    val row = next()
+    this.potentialHeaderRow = Some(row)
+    row
   }
 
-  override def hasNext: Boolean = current.nonEmpty
+  override def hasNext: Boolean = current match {
+    case Failure(ex: NullPointerException) => false
+    case _ => true
+  }
 
-  private def toRow(rowData: Option[Array[String]]): Option[Row] = rowData.map(data => Row(data.toList.map(d => Cell(Option(d).getOrElse(""))), index))
+  private def toRow(rowData: Try[Array[String]]): Try[Row] = rowData.map(data => Row(data.toList.map(d => Cell(Option(d).getOrElse(""))), index))
 }
diff --git a/...alidator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala b/...alidator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala
@@ -30,11 +30,11 @@ object CsvValidator {
   type PathTo = String
   type SubstitutePath = (PathFrom, PathTo)
 
-  def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean) = {
+  def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean, maxCharsPerCellLimit: Int) = {
     if(failFast) {
-      new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch}
+      new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch; val maxCharsPerCell = maxCharsPerCellLimit}
     } else {
-      new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch }
+      new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch; val maxCharsPerCell = maxCharsPerCellLimit }
     }
   }
 }
@@ -81,8 +81,8 @@ trait CsvValidator extends SchemaParser {
     parseSchema(csvSchemaFile) match {
       case Validated.Valid(schema) =>
         withReader(csvFile) { reader =>
-          createCsvParser(schema).parseAll(reader)
-        }.asScala.toList  
+          createCsvParser(schema, this.maxCharsPerCell).parseAll(reader)
+        }.asScala.toList
       case Validated.Invalid(_) => Nil
     }
   }
@@ -100,7 +100,7 @@ trait CsvValidator extends SchemaParser {
     val csvValidation = withReader(csvFile) {
       reader =>
         val totalRows = countRows(csvFile, csvSchema)
-        validateKnownRows(reader, csvSchema, progress.map(p => {ProgressFor(totalRows, p)} ), rowCallback)
+        validateKnownRows(reader, csvSchema, this.maxCharsPerCell, progress.map(p => {ProgressFor(totalRows, p)} ), rowCallback)
     }
     encodingValidationNel.isValid && csvValidation
   }

diff --git a/...dator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala b/...dator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala
@@ -51,6 +51,8 @@ with TraceableParsers {
 
   val skipFileChecks: Boolean
 
+  val maxCharsPerCell: Int
+
   lazy val versionHeader: PackratParser[String] = "VersionDecl" ::= ("version" ~> versionLiteral )
 
 
@@ -147,6 +149,7 @@ with TraceableParsers {
     val ps =  pathSubstitutions
     val t = trace
     val sfc = skipFileChecks
+    val mcpc = maxCharsPerCell
 
     SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
       version match {
@@ -155,6 +158,7 @@ with TraceableParsers {
             override val pathSubstitutions: List[(String, String)] = ps
             override val trace: Boolean = t
             override val skipFileChecks: Boolean = sfc
+            override val maxCharsPerCell: Int = mcpc
           }
 
           parser1_2.parseVersionAware(reader) match {
@@ -168,6 +172,7 @@ with TraceableParsers {
             override val pathSubstitutions: List[(String, String)] = ps
             override val trace: Boolean = t
             override val skipFileChecks: Boolean = sfc
+            override val maxCharsPerCell: Int = mcpc
           }
 
           parser1_1.parseVersionAware(reader) match {
@@ -181,6 +186,7 @@ with TraceableParsers {
             override val pathSubstitutions: List[(String, String)] = ps
             override val trace: Boolean = t
             override val skipFileChecks: Boolean = sfc
+            override val maxCharsPerCell: Int = mcpc
           }
 
           parser1_0.parseVersionAware(reader) match {

diff --git a/.../test/resources/uk/gov/nationalarchives/csv/validator/api/metaDataWithALongCellLength.csv b/.../test/resources/uk/gov/nationalarchives/csv/validator/api/metaDataWithALongCellLength.csv
@@ -0,0 +1,2 @@
+col1,col2
+row1Col1,row1Col2LongCellLength