Commit b6ff075

Merge pull request #117 from rhubner/url-decode
Url decode
2 parents 4726be4 + aa952fc commit b6ff075

17 files changed (+275, -7 lines)

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala

Lines changed: 2 additions & 2 deletions
@@ -12,8 +12,8 @@ import uk.gov.nationalarchives.csv.validator.schema.{Schema, SchemaParser}
 import scalaz._, Scalaz._
 import scalax.file.Path
 import uk.gov.nationalarchives.csv.validator._
-import java.io.{Reader => JReader, File}
-import java.nio.charset.{Charset => JCharset}
+import _root_.java.io.{Reader => JReader, File}
+import _root_.java.nio.charset.{Charset => JCharset}

 object CsvValidator {

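A side note on the change above: the _root_ prefix anchors an import at the root of the package hierarchy, which matters when a name in an enclosing package shadows a top-level package such as java. A minimal sketch of that situation, using hypothetical package names (the diff itself does not state the motivation for this particular change):

// Hypothetical layout, for illustration only; not part of this commit.
package uk.gov.example {
  package java {                     // a nested package that happens to be named `java`
    object io
  }
  package validator {
    // A plain `import java.io.File` here would resolve `java` against the
    // enclosing package uk.gov.example (finding the nested sibling above)
    // rather than the JDK's top-level `java` package, and fail to compile.
    import _root_.java.io.File       // anchored at the root, always the JDK class

    object Demo {
      val tmp: File = new File(".")
    }
  }
}
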
csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Schema.scala

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ import util.parsing.input.Positional
 case class Schema(globalDirectives: List[GlobalDirective], columnDefinitions: List[ColumnDefinition], version: String = Schema.version)

 object Schema {
-  val version = "1.1"
+  val version = "1.2"
 }

 abstract class GlobalDirective(val name: String) extends Positional

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala

Lines changed: 13 additions & 0 deletions
@@ -10,6 +10,7 @@ package uk.gov.nationalarchives.csv.validator.schema

 import uk.gov.nationalarchives.csv.validator.schema.v1_0.{SchemaParser => SchemaParser1_0}
 import uk.gov.nationalarchives.csv.validator.schema.v1_1.{SchemaParser => SchemaParser1_1, _}
+import uk.gov.nationalarchives.csv.validator.schema.v1_2.{SchemaParser => SchemaParser1_2, _}

 import scala.util.parsing.combinator._
 import scala.language.reflectiveCalls

@@ -147,6 +148,18 @@ with TraceableParsers {

     SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
       version match {
+        case "1.2" =>
+          val parser1_2 = new SchemaParser1_2 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
+            override val pathSubstitutions: List[(String, String)] = ps
+            override val trace: Boolean = t
+          }
+
+          parser1_2.parseVersionAware(reader) match {
+            case parser1_2.Success(s, n) => Success(s, n)
+            case parser1_2.Failure(msg, n) => Failure(msg, n)
+            case parser1_2.Error(msg, n) => Error(msg, n)
+          }
+
         case "1.1" =>
           val parser1_1 = new SchemaParser1_1 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
             override val pathSubstitutions: List[(String, String)] = ps
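The Success/Failure/Error re-wrapping in the new case "1.2" branch (as in the existing branches) is needed because ParseResult and its subclasses are path-dependent types: each Parsers instance has its own Success, Failure and Error, so the delegate parser's result has to be rebuilt in the outer parser's types. A self-contained sketch of the same pattern with toy parsers (names are illustrative, not from the codebase):

import scala.util.parsing.combinator.RegexParsers

object OuterParser extends RegexParsers {

  object InnerParser extends RegexParsers {
    val word: Parser[String] = "[a-z]+".r
  }

  // InnerParser.Success and OuterParser.Success are distinct path-dependent types,
  // so the inner result is pattern-matched and rebuilt in the outer types,
  // exactly the shape of the version dispatch above.
  def delegate(in: String): ParseResult[String] =
    InnerParser.parseAll(InnerParser.word, in) match {
      case InnerParser.Success(s, next)   => Success(s, next)
      case InnerParser.Failure(msg, next) => Failure(msg, next)
      case InnerParser.Error(msg, next)   => Error(msg, next)
    }
}
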
csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/UriDecode.scala

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2013, The National Archives <[email protected]>
+ * http://www.nationalarchives.gov.uk
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+package uk.gov.nationalarchives.csv.validator.schema.v1_2
+
+import uk.gov.nationalarchives.csv.validator.metadata.Row
+import uk.gov.nationalarchives.csv.validator.schema.{Schema, ArgProvider}
+import java.net.{URLDecoder => JURLDecoder}
+
+case class UriDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider {
+
+  val DefaultCharset = "UTF-8"
+
+  override def referenceValue(columnIndex: Int, row: Row, schema: Schema): Option[String] = value.referenceValue(columnIndex, row, schema).map( value => {
+
+    val codepage = charset.flatMap(x => x.referenceValue(columnIndex, row, schema)).getOrElse(DefaultCharset)
+
+    JURLDecoder.decode(value,codepage)
+
+  })
+
+  override def toError: String = "uriDecode(" + value.toError + ")"
+}
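UriDecode delegates the actual decoding to java.net.URLDecoder, defaulting to UTF-8 when no charset argument is supplied. A standalone sketch of that underlying behaviour, reusing the values from the test data added later in this commit (the object name is made up for the example):

import java.net.URLDecoder

object UriDecodeDemo {
  def main(args: Array[String]): Unit = {
    // Default charset, as UriDecode uses when the optional charset is absent
    println(URLDecoder.decode("file:/some/folder/some%20file.txt", "UTF-8"))
    // prints: file:/some/folder/some file.txt

    // Explicit charset: byte 0x9A is 'š' in windows-1252
    println(URLDecoder.decode("text%9Atext.txt", "windows-1252"))
    // prints: textštext.txt

    // %21 decodes to '!', which is why the failing test row below does not match "some file.txt"
    println(URLDecoder.decode("some%21file.txt", "UTF-8"))
    // prints: some!file.txt
  }
}
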
csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/SchemaParser.scala

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2013, The National Archives <[email protected]>
+ * http://www.nationalarchives.gov.uk
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+package uk.gov.nationalarchives.csv.validator.schema.v1_2
+
+import scala.language.reflectiveCalls
+import uk.gov.nationalarchives.csv.validator.schema.{Literal, ArgProvider}
+import uk.gov.nationalarchives.csv.validator.schema.v1_1.{SchemaParser => SchemaParser1_1}
+
+trait SchemaParser extends SchemaParser1_1 {
+
+  /**
+   * [59] StringProvider ::= ColumnRef | StringLiteral
+   */
+  override lazy val stringProvider: PackratParser[ArgProvider] = "StringProvider" ::= noext | concat | urlDecode | columnRef | stringLiteral ^^ {
+    s => Literal(Some(s))
+  }
+
+  lazy val urlDecode: PackratParser[ArgProvider] = "UriDecode" ::= "uriDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ {
+    case value ~ charset => UriDecode(value, charset)
+  }
+
+}
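The new urlDecode rule is added to the stringProvider alternation, so uriDecode(...) is accepted anywhere a column reference or string literal is, and its value and optional charset are themselves string providers. A self-contained sketch of the same grammar shape using plain RegexParsers (simplified stand-ins, not the real SchemaParser types):

import scala.util.parsing.combinator.RegexParsers

object UriDecodeGrammarDemo extends RegexParsers {
  sealed trait Arg
  case class ColumnRef(name: String) extends Arg
  case class Literal(value: String) extends Arg
  case class UriDecode(value: Arg, charset: Option[Arg]) extends Arg

  lazy val columnRef: Parser[Arg]     = "$" ~> """[A-Za-z0-9_]+""".r ^^ ColumnRef
  lazy val stringLiteral: Parser[Arg] = "\"" ~> """[^"]*""".r <~ "\"" ^^ Literal

  // Mirrors the real rule: uriDecode( stringProvider [, stringProvider] )
  lazy val uriDecode: Parser[Arg] =
    "uriDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ {
      case value ~ charset => UriDecode(value, charset)
    }

  lazy val stringProvider: Parser[Arg] = uriDecode | columnRef | stringLiteral

  def main(args: Array[String]): Unit = {
    println(parseAll(stringProvider, "uriDecode($identifier)").get)
    // UriDecode(ColumnRef(identifier),None)
    println(parseAll(stringProvider, "uriDecode($identifier, \"windows-1252\")").get)
    // UriDecode(ColumnRef(identifier),Some(Literal(windows-1252)))
  }
}
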
uriDecode.csvs (new acceptance test resource)

@@ -0,0 +1,4 @@
+version 1.2
+@totalColumns 2 @noHeader
+identifier:
+filename: in(uriDecode($identifier))
uriDecodeFail.csv (new acceptance test resource; %21 decodes to "!", so the decoded identifier does not match the filename column)

@@ -0,0 +1 @@
+file:/some/folder/some%21file.txt,some file.txt
uriDecodePass.csv (new acceptance test resource; %20 decodes to a space, so the decoded identifier matches the filename column)

@@ -0,0 +1 @@
+file:/some/folder/some%20file.txt,some file.txt
uriDecodeWithCharset.csvs (new acceptance test resource)

@@ -0,0 +1,5 @@
+version 1.2
+@totalColumns 3 @noHeader
+identifier:
+filename: in(uriDecode($identifier, $charset))
+charset:
uriDecodeWithCharsetPass.csv (new acceptance test resource; %9A is "š" in windows-1252)

@@ -0,0 +1,2 @@
+file:/some/folder/some%20file.txt,some file.txt,UTF-8
+file:/some/folder/text%9Atext.txt,textštext.txt,windows-1252
tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs (new integrity check test resource)

@@ -0,0 +1,120 @@
+version 1.2
+@totalColumns 33
+/*---------------------------------------------------------------------------------------------------------------
+|This schema is for the validation of technical acquisition metadata |
+|csv files according to the specification given for digitised surrogates in |
+|http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf |
+|This version is for WO 95 digitisation in the period 2014-15 |
+| 20140818 Version 1.0 DHU First release version for this project |
+| 20140910 Version 1.1 DHU Updated date regex to fix issues, allowed items up to 14, disallow fullstops |
+|at end of description as this causes search issues in Discovery. |
+| 20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status |
+|and held_by fields, changed date column to covering_date |
+| 20141110 version 1.3 NW fixed sub_sub_series rule |
+|from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is("")) |
+|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) |
+| 20160511 - RH - Update schema version to test CSV validator backward compatibility |
+---------------------------------------------------------------------------------------------------------------*/
+/*The header of the schema file, ie the statements version 1.0 and @totalColumns 31, indicates that this schema
+is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema),
+and that there are 31 columns in total in the file.*/
+batch_code: length(1,11) regex("^WO95Y14B([0-9]{3}|smp)$")
+//1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined
+//by a logical AND unless another boolean is provided). 2nd part restricts to form similar to WO95Y14B000 (last
+//three digits are running number for batches throughout the project.
+department: is("WO") and (in($file_path) and in($resource_uri))
+//Parentheses control evaluation order of booleans as might be expected
+//Department is fixed value of WO for this project.
+//The grouped "in" statements say that the value found in this field must also be found as part of the fields
+//"file_path" and "resource_uri"
+division: is("13")
+//this field must be precisely 13
+series: is("95") and (in($file_path) and in($resource_uri))
+//Fixed value of 95 for this project
+//The value must also be part of the fields "file_path" and "resource_uri"
+sub_series: is("1")
+//For the 2014-15 project all material to be digitised is in sub_series 1 (France and Flanders)
+sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115")))
+//As described in Appendix E of the ITT, the 1914-15 project is scanning material in sub_sub_series 1-7 and 115,
+//Piece 5500 is also included which is not in any sub_sub_series, so the value is blank for that piece only.
+piece: if($sub_sub_series/is("1"),range(1,85),if($sub_sub_series/is("2"),range(86,153),if($sub_sub_series/is("3"),range(154,267),if($sub_sub_series/is("4"),range(268,358),if($sub_sub_series/is("5"),range(359,430),if($sub_sub_series/is("6"),range(431,517),if($sub_sub_series/is("7"),range(518,571),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is(""),is("5500")))))))))) and (in($file_path) and in($resource_uri))
+//For this project there is a defined relationship between piece ranges as listed in Appendix E
+//This is encapsulated in this rather complex if,then,else statement
+//The value must also be part of the fields "file_path" and "resource_uri"
+item: (range(1,20) and in($file_path)) or is("")
+//Most pieces are subdivided into items, there are not expected to be more than 10 per piece
+//The value must also be part of the fields "file_path" and "resource_uri"
+//In many cases the item level is not used, so this would be left blank.
+//as the sorting/cataloguing process advances this condition may be tightened
+ordinal: range(1,500) and in($file_path) unique($department,$division,$series,$sub_series,$sub_sub_series,$piece,$item,$ordinal)
+//the ordinal is a simple running count of the images within an item (or piece if not itemised).
+//No single item (or piece if not itemised) should contain more than 150 pages but rule changed to 500 to allow for exceptions
+//This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension
+//the combination of fields indicated should be unique within the file
+description: not("") and regex("^.*[^\.]$")
+//description is a fairly free-form field, but must not be empty
+covering_date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$")
+//dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed
+legal_status: is("Public Record")
+held_by: is("The National Archives, Kew")
+file_uuid: uuid4 unique
+//must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case.
+file_path: uri starts("file:///WO_95/") unique fileExists integrityCheck("excludeFolder")
+//fileExists checks that there is actually a file of the given name at the specified location on the file system.
+//In practice, the validator will normally be run with the --path switch
+//(see http://digital-preservation.github.io/csv-validator/)
+//We also require that the path is a valid uri, and begins file:///WO_95/ as this is the top-level folder for each batch
+//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
+//content of this field)
+//must be unique within the file
+file_checksum: unique checksum(file($file_path),"SHA-256")
+//Compare the value given in this field to the checksum calculated for the file found at the location given in
+//the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
+//Use the specified checksum algorithm (must use lowercase hex characters).
+//unique within the file - an identical checksum would imply identical images
+resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/WO/95/") unique
+//Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file
+//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
+//content of this field)
+scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
+//12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is
+//restricted to the scanning company to avoid personally identifying data being held in the file
+scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
+//Like "scan_operator", but this code represents the actual scanner or camera used
+scan_location: regex("[-\w\s,.]+")
+//Address or other description of the location where scanning physically occurred. The regex allows any number
+//of characters, allows general word and whitespace characters plus hyphen, comma and full stop
+image_resolution: positiveInteger (is("300") or is("600"))
+//Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used.
+//Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
+//eg range(298,302) to capture slight variances in resolution.
+image_width: positiveInteger
+//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
+image_height: positiveInteger
+//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
+image_tonal_resolution: is("24-bit colour")
+//must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified.
+image_format: is("x-fmt/392")
+//must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
+//(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
+image_compression: positiveInteger is("6")
+//Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
+//available in the JPEG2000 specification
+image_colour_space: is("sRGB")
+//must be string: sRGB (precisely - case as shown)
+image_split: is("yes") or is("no") or is("composite")
+//must be string: yes; or string: no or string: composite (precisely - case as shown). Used if eg an image of complete double page
+//subsequently split into two separate images of each page individually, or if an oversize document is imaged as a composite of several images
+image_split_ordinal: if($image_split/is("composite"),range(1,9),is(""))
+//describes the ordering of the individual "tiles" when an oversize documents has to be imaged in sections as a composite.
+//9 is expected to be sufficient, but will be reviewed if required
+//if image_split is not composite it must be blank
+image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$"))
+//if "image_split" field is no, must be blank
+//else it must be a uuid4 or comma separated list of up to 9 uuid4s
+//due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype
+image_crop: is("auto") or is("manual") or is("none")
+//must be string: auto; or string: manual or string: none (precisely - case as shown)
+image_deskew: is("yes") or is("no")
+//must be string: yes; or string: no (precisely - case as shown)
+comments: regex("[\w\s,.]+") @optional

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorAcceptanceSpec.scala

Lines changed: 15 additions & 0 deletions
@@ -516,6 +516,21 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
     }
   }

+  "Url decode string provider" should {
+    "decode url string to normal string" in {
+      validate(TextFile(Path.fromString(base) / "uriDecodePass.csv"), parse(base + "/uriDecode.csvs"), None).isSuccess mustEqual true
+    }
+
+    "fail for wrong url" in {
+      validate(TextFile(Path.fromString(base) / "uriDecodeFail.csv"), parse(base + "/uriDecode.csvs"), None).isFailure mustEqual true
+    }
+
+    "decode URL with optional charset parameter" in {
+
+      validate(TextFile(Path.fromString(base) / "uriDecodeWithCharsetPass.csv"), parse(base + "/uriDecodeWithCharset.csvs"), None).isSuccess mustEqual true
+    }
+  }
+
   "Concat string provider" should {
     "should concatenate string provider" in {
       validate(TextFile(Path.fromString(base) / "concatPass.csv"), parse(base + "/concat.csvs"), None).isSuccess mustEqual true

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala

Lines changed: 7 additions & 0 deletions
@@ -108,6 +108,13 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc
       validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000.csvs",validator), None).isSuccess mustEqual true
     }

+    "Validate WO 95 with 1.2 schema version to test backward compatibility" in {
+
+      val substitutionPaths = List(("file:///WO_95",WO95Path))
+      val validator = buildValidator(substitutionPaths)
+      validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs",validator), None).isSuccess mustEqual true
+    }
+
     "succeed with alternative substitution paths - header" in {

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/UtilSpec.scala

Lines changed: 3 additions & 3 deletions
@@ -53,7 +53,7 @@ class UtilSpec extends Specification with TestResources {

     val apiFiles = Util.findAllFiles(true, new File(acceptancePath))

-    apiFiles must haveLength(120)
+    apiFiles must haveLength(125)

     apiFiles must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/acceptance/twoRulesPassMetaData.csv"))

@@ -65,7 +65,7 @@ class UtilSpec extends Specification with TestResources {

     val integrityCheckFiles = Util.findAllFiles(true, new File(base))

-    integrityCheckFiles must haveLength(42)
+    integrityCheckFiles must haveLength(43)

     integrityCheckFiles must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckSchema.csvs"))

@@ -82,7 +82,7 @@ class UtilSpec extends Specification with TestResources {

     val integrityCheckFilesNoFolder = Util.findAllFiles(false, new File(base))

-    integrityCheckFilesNoFolder must haveLength(28)
+    integrityCheckFilesNoFolder must haveLength(29)

     integrityCheckFilesNoFolder must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/integrityCheck/header/content/file1"))

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaSpecBase.scala

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,9 @@ trait SchemaSpecBase extends Specification {
   def buildSchema1_1(globalDirective: GlobalDirective*)(columnDefinition: ColumnDefinition*) =
     Schema(globalDirective.toList, columnDefinition.toList, "1.1")

+  def buildSchema1_2(globalDirective: GlobalDirective*)(columnDefinition: ColumnDefinition*) =
+    Schema(globalDirective.toList, columnDefinition.toList, "1.2")
+
   def namedColumn(name: String) = ColumnDefinition(NamedColumnIdentifier(name))

   def nonEmptyColumn(name: String): ColumnDefinition =

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/schema/v1_1/SchemaParserVersionSpec.scala

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ class SchemaParserVersionSpec extends SchemaSpecBase with TestResources{
       LastName: @IgnoreCase regex ("[a]")"""

     parse(new StringReader(schema)) must beLike {
-      case Failure(messages, _) => messages mustEqual s"Invalid schema version. This version of the csv validator supports only 1.1 and below."
+      case Failure(messages, _) => messages mustEqual s"Invalid schema version. This version of the csv validator supports only 1.2 and below."
     }
   }
