Skip to content

Commit 78786d2

Browse files
Merge pull request #140 from rhubner/1.1.x
1.1.x
2 parents 02b9d72 + 38ca173 commit 78786d2

File tree

12 files changed

+67
-92
lines changed

12 files changed

+67
-92
lines changed

csv-validator-cmd/pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<parent>
55
<groupId>uk.gov.nationalarchives</groupId>
66
<artifactId>csv-validator-parent</artifactId>
7-
<version>1.1.4</version>
7+
<version>1.1.5-SNAPSHOT</version>
88
<relativePath>../csv-validator-parent</relativePath>
99
</parent>
1010

@@ -18,7 +18,7 @@
1818
<connection>scm:git:https://github.com/digital-preservation/csv-validator.git</connection>
1919
<developerConnection>scm:git:https://github.com/digital-preservation/csv-validator.git</developerConnection>
2020
<url>scm:git:https://github.com/digital-preservation/csv-validator.git</url>
21-
<tag>1.1.4</tag>
21+
<tag>schema-1.1</tag>
2222
</scm>
2323

2424
<build>

csv-validator-core/pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<parent>
55
<groupId>uk.gov.nationalarchives</groupId>
66
<artifactId>csv-validator-parent</artifactId>
7-
<version>1.1.4</version>
7+
<version>1.1.5-SNAPSHOT</version>
88
<relativePath>../csv-validator-parent</relativePath>
99
</parent>
1010

@@ -18,7 +18,7 @@
1818
<connection>scm:git:https://github.com/digital-preservation/csv-validator.git</connection>
1919
<developerConnection>scm:git:https://github.com/digital-preservation/csv-validator.git</developerConnection>
2020
<url>scm:git:https://github.com/digital-preservation/csv-validator.git</url>
21-
<tag>1.1.4</tag>
21+
<tag>schema-1.1</tag>
2222
</scm>
2323

2424
<build>

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,15 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
2828
} else {
2929
val row = rows.next()
3030
val result = validateRow(row, schema, Some(rows.hasNext))
31-
validateRows(result :: results)
31+
/*
32+
Only store the results if they contain a warning or a failure. This means the validator is not limited by the
33+
available memory when processing large files.
34+
*/
35+
if (containsErrors(result) || containsWarnings(result)) {
36+
validateRows(result :: results)
37+
} else {
38+
validateRows(results)
39+
}
3240
}
3341
}
3442

@@ -61,4 +69,4 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
6169
if(isWarningDirective) toWarnings(ruleResult, row.lineNumber, columnIndex) else toErrors(ruleResult, row.lineNumber, columnIndex)
6270
}}.sequence[MetaDataValidation, Any]
6371
}
64-
}
72+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,22 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
2626

2727
override def validateRows(rows: Iterator[Row], schema: Schema): MetaDataValidation[Any] = {
2828

29-
def containsErrors(e: MetaDataValidation[Any]): Boolean = e.fold(_.list.collectFirst(FailMessage.isError).nonEmpty, _ => false)
30-
3129
@tailrec
3230
def validateRows(results: List[MetaDataValidation[Any]] = List.empty[MetaDataValidation[Any]]) : List[MetaDataValidation[Any]] = {
3331
if(results.headOption.map(containsErrors(_)).getOrElse(false) || !rows.hasNext) {
3432
results.reverse
3533
} else {
3634
val row = rows.next()
3735
val result = validateRow(row, schema, Some(rows.hasNext))
38-
validateRows(result :: results)
36+
/*
37+
Only store the results if they contain a warning or a failure. This means the validator is not limited by the
38+
available memory when processing large files.
39+
*/
40+
if (containsErrors(result) || containsWarnings(result)) {
41+
validateRows(result :: results)
42+
} else {
43+
validateRows(results)
44+
}
3945
}
4046
}
4147

@@ -90,4 +96,4 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
9096
else if(isWarningDirective) validateAllRulesForCell(columnDefinition.rules)
9197
else validateRulesForCell(columnDefinition.rules)
9298
}
93-
}
99+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala

Lines changed: 6 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,15 @@ object FailMessage {
4646
}
4747
}
4848

49+
50+
4951
case class ProgressFor(rowsToValidate: Int, progress: ProgressCallback)
5052

5153
trait MetaDataValidator {
54+
// Helper functions for checking if a result contains a warning or error.
55+
/**
 * Tells whether a validation result carries at least one error.
 *
 * @param e the validation result to inspect
 * @return true when `e` is a failure and at least one of its messages is matched by
 *         `FailMessage.isError`; false when `e` is a success or holds no such message
 */
def containsErrors(e: MetaDataValidation[Any]): Boolean =
  e.fold(
    failures => failures.list.collectFirst(FailMessage.isError).isDefined,
    _ => false
  )
56+
/**
 * Tells whether a validation result carries at least one warning.
 *
 * @param e the validation result to inspect
 * @return true when `e` is a failure and at least one of its messages is matched by
 *         `FailMessage.isWarning`; false when `e` is a success or holds no such message
 */
def containsWarnings(e: MetaDataValidation[Any]): Boolean =
  e.fold(
    failures => failures.list.collectFirst(FailMessage.isWarning).isDefined,
    _ => false
  )
57+
5258
type MetaDataValidation[S] = ValidationNel[FailMessage, S]
5359

5460
def validate(csv: JReader, schema: Schema, progress: Option[ProgressCallback]): MetaDataValidation[Any] = {
@@ -71,66 +77,6 @@ trait MetaDataValidator {
7177
validateKnownRows(csv, schema, pf)
7278
}
7379

74-
/**
75-
* Browse csv File and return all the titleIndex as a list
76-
* @param csv the CSV reader
77-
* @param schema the Schema
78-
* @param columnIndex the index of the column to be return
79-
* @return all the element of the column columnIndex
80-
*/
81-
def getColumn(csv: JReader, schema: Schema, columnIndex: Int): List[String] = {
82-
83-
val separator = schema.globalDirectives.collectFirst {
84-
case Separator(sep) =>
85-
sep
86-
}.getOrElse(CSVParser.DEFAULT_SEPARATOR)
87-
88-
val quote = schema.globalDirectives.collectFirst {
89-
case q: Quoted =>
90-
CSVParser.DEFAULT_QUOTE_CHARACTER
91-
}
92-
93-
//TODO CSVReader does not appear to be RFC 4180 compliant as it does not support escaping a double-quote with a double-quote between double-quotes
94-
//TODO CSVReader does not seem to allow you to enable/disable quoted columns
95-
//we need a better CSV Reader!
96-
(managed(new CSVReader(csv, separator, CSVParser.DEFAULT_QUOTE_CHARACTER, CSVParser.NULL_CHARACTER)) map {
97-
reader =>
98-
// if 'no header' is set but the file is empty and 'permit empty' has not been set - this is an error
99-
// if 'no header' is not set and the file is empty - this is an error
100-
// if 'no header' is not set and 'permit empty' is not set but the file contains only one line - this is an error
101-
102-
val rowIt = new RowIterator(reader, None)
103-
104-
val maybeNoData =
105-
if (schema.globalDirectives.contains(NoHeader())) {
106-
if (!rowIt.hasNext && !schema.globalDirectives.contains(PermitEmpty())) {
107-
Some(FailMessage(ValidationError, "metadata file is empty but this has not been permitted").failureNel[Any])
108-
} else {
109-
None
110-
}
111-
} else {
112-
if(!rowIt.hasNext) {
113-
Some(FailMessage(ValidationError, "metadata file is empty but should contain at least a header").failureNel[Any])
114-
} else {
115-
if(!rowIt.hasNext && !schema.globalDirectives.contains(PermitEmpty())) {
116-
Some(FailMessage(ValidationError, "metadata file has a header but no data and this has not been permitted").failureNel[Any])
117-
} else {
118-
None
119-
}
120-
}
121-
}
122-
123-
maybeNoData match {
124-
case Some(noData) =>
125-
Nil
126-
case None =>
127-
getColumn(rowIt, columnIndex)
128-
129-
}
130-
} opt).getOrElse(Nil)
131-
}
132-
133-
13480
def validateKnownRows(csv: JReader, schema: Schema, progress: Option[ProgressFor]): MetaDataValidation[Any] = {
13581

13682
val separator = schema.globalDirectives.collectFirst {

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/Util.scala

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -218,12 +218,8 @@ object Util {
218218
}
219219

220220
private def contentDir(filepath: String, topLevelFolder: String): String = {
221-
val list = (substitutePath(filepath) split topLevelFolder).toIterator.toList
222-
val pathList = if (list.length > 1) list.dropRight(1) else list
223-
val dir = pathList.tail.foldLeft(pathList.head){ case (acc, elem) =>
224-
acc + topLevelFolder + elem
225-
}
226-
FileSystem.file2PlatformIndependent(dir + topLevelFolder)
221+
val substPath = substitutePath(filepath).split("/").toSeq.reverse.dropWhile(!_.equals(topLevelFolder)).reverse.mkString("/")
222+
FileSystem.file2PlatformIndependent(substPath)
227223
}
228224

229225
def jointPath: String = {

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Rule.scala

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import scala.util.Try
1616
import scala.util.parsing.input.Positional
1717
import scalaz._
1818
import scalaz.Scalaz._
19+
import java.util.regex.{Pattern, Matcher}
1920

2021
abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positional {
2122

@@ -75,7 +76,6 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
7576

7677
def explicitName: Option[String] = explicitColumn.map("$" + _.ref + "/")
7778

78-
7979
def ruleName: String = explicitName.getOrElse("") + name
8080

8181
def columnIdentifierToIndex(schema: Schema, id: ColumnIdentifier): Int = {
@@ -89,7 +89,6 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
8989
}
9090

9191

92-
9392
def toValueError(row: Row, columnIndex:Int ) =
9493
s"""value: ${'"'}${row.cells(columnIndex).value}${'"'}"""
9594

@@ -99,8 +98,30 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
9998

10099
}
101100

101+
/**
102+
* This object is a place to store the precompiled regexes
103+
* @author Jess Flanagan
104+
*/
105+
object RegexCache {
  // Store of compiled patterns, keyed by the regex source string.
  // This object is a singleton, so the map is shared by every caller in the process;
  // a concurrent map is used because a plain mutable.Map is not safe when several
  // threads look up and insert at the same time. concurrent.Map is a mutable.Map,
  // so existing readers of `cache` keep working.
  val cache: scala.collection.concurrent.Map[String, Pattern] =
    scala.collection.concurrent.TrieMap.empty[String, Pattern]

  /**
   * This function returns compiled regexes.
   * First we check to see if it's already in the cache. Otherwise we compile it, add it to the cache and return the
   * compiled version, so a given pattern string is not recompiled for every cell that is checked.
   *
   * @param pattern A regex pattern string.
   *
   * @return A compiled representation of a regular expression. Repeated calls with the same string
   *         return the same cached instance.
   * @throws java.util.regex.PatternSyntaxException if `pattern` is not a valid regular expression
   *                                                (nothing is cached in that case)
   * @author Jess Flanagan
   */
  def getCompiledRegex(pattern: String): Pattern =
    cache.get(pattern) match {
      case Some(compiled) => compiled
      case None =>
        val compiled = Pattern.compile(pattern)
        // putIfAbsent is atomic: if another thread inserted first, reuse its instance
        // so every caller ends up sharing one Pattern per string.
        cache.putIfAbsent(pattern, compiled).getOrElse(compiled)
    }
}
119+
102120
abstract class PatternRule(name: String, pattern: String) extends Rule(name) {
103-
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = cellValue matches pattern
121+
// Uses the cache to retrieve a compiled regex representation for the pattern string.
122+
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = {
123+
RegexCache.getCompiledRegex(pattern).matcher(cellValue).matches()
124+
}
104125
}
105126

106127
trait DateParser {
@@ -114,4 +135,4 @@ abstract class DateRule(name: String, dateRegex: String, dateParser: DateParser)
114135
case _ => false
115136
}
116137
}
117-
}
138+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_0/Rule.scala

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import uk.gov.nationalarchives.csv.validator.Util.{FileSystem, TypedPath}
1717
import uk.gov.nationalarchives.csv.validator.api.CsvValidator._
1818
import uk.gov.nationalarchives.csv.validator.metadata.Row
1919
import uk.gov.nationalarchives.csv.validator.schema._
20-
20+
import java.util.regex.{Pattern, Matcher}
2121
import scala.annotation.tailrec
2222
import scala.collection.mutable
2323
import scala.util.Try
@@ -92,12 +92,10 @@ case class IfRule(condition: Rule, rules: List[Rule], elseRules: Option[List[Rul
9292
}
9393
}
9494

95-
9695
case class RegExpRule(regex: String) extends Rule("regex") {
9796
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = {
98-
9997
val regexp = if (columnDefinition.directives.contains(IgnoreCase())) "(?i)" + regex else regex
100-
cellValue matches regexp
98+
RegexCache.getCompiledRegex(regexp).matcher(cellValue).matches()
10199
}
102100

103101
override def toError = {

csv-validator-java-api/pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<parent>
55
<groupId>uk.gov.nationalarchives</groupId>
66
<artifactId>csv-validator-parent</artifactId>
7-
<version>1.1.4</version>
7+
<version>1.1.5-SNAPSHOT</version>
88
<relativePath>../csv-validator-parent</relativePath>
99
</parent>
1010

@@ -18,7 +18,7 @@
1818
<connection>scm:git:https://github.com/digital-preservation/csv-validator.git</connection>
1919
<developerConnection>scm:git:https://github.com/digital-preservation/csv-validator.git</developerConnection>
2020
<url>scm:git:https://github.com/digital-preservation/csv-validator.git</url>
21-
<tag>1.1.4</tag>
21+
<tag>schema-1.1</tag>
2222
</scm>
2323

2424
<build>

csv-validator-parent/pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
<groupId>uk.gov.nationalarchives</groupId>
1313
<artifactId>csv-validator-parent</artifactId>
14-
<version>1.1.4</version>
14+
<version>1.1.5-SNAPSHOT</version>
1515
<packaging>pom</packaging>
1616

1717
<name>csv-validator-parent</name>
@@ -77,7 +77,7 @@
7777
<connection>scm:git:https://github.com/digital-preservation/csv-validator.git</connection>
7878
<developerConnection>scm:git:https://github.com/digital-preservation/csv-validator.git</developerConnection>
7979
<url>scm:git:https://github.com/digital-preservation/csv-validator.git</url>
80-
<tag>1.1.4</tag>
80+
<tag>schema-1.1</tag>
8181
</scm>
8282

8383
<properties>

csv-validator-ui/pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<parent>
55
<groupId>uk.gov.nationalarchives</groupId>
66
<artifactId>csv-validator-parent</artifactId>
7-
<version>1.1.4</version>
7+
<version>1.1.5-SNAPSHOT</version>
88
<relativePath>../csv-validator-parent</relativePath>
99
</parent>
1010

@@ -18,7 +18,7 @@
1818
<connection>scm:git:https://github.com/digital-preservation/csv-validator.git</connection>
1919
<developerConnection>scm:git:https://github.com/digital-preservation/csv-validator.git</developerConnection>
2020
<url>scm:git:https://github.com/digital-preservation/csv-validator.git</url>
21-
<tag>1.1.4</tag>
21+
<tag>schema-1.1</tag>
2222
</scm>
2323

2424
<build>

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<groupId>uk.gov.nationalarchives</groupId>
77
<artifactId>csv-validator-parent</artifactId>
8-
<version>1.1.4</version>
8+
<version>1.1.5-SNAPSHOT</version>
99
<relativePath>csv-validator-parent</relativePath>
1010
</parent>
1111

@@ -19,7 +19,7 @@
1919
<connection>scm:git:https://github.com/digital-preservation/csv-validator.git</connection>
2020
<developerConnection>scm:git:https://github.com/digital-preservation/csv-validator.git</developerConnection>
2121
<url>scm:git:https://github.com/digital-preservation/csv-validator.git</url>
22-
<tag>1.1.4</tag>
22+
<tag>schema-1.1</tag>
2323
</scm>
2424

2525
<modules>

0 commit comments

Comments
 (0)