Skip to content

Commit 8058484

Browse files
committed
Sparkling integration updates.
* fix discardDate issue
* update tests for #494
* add test for #493
* add test for #532
* move issue specific tests to their own directory
* add copyright statement to SparklingArchiveRecord
* move webarchive-commons back to 1.1.9
* resolves #532
* resolves #494
* resolves #493
* resolves #492
* resolves #317
* resolves #260
* resolves #182
* resolves #76
* resolves #74
* resolves #73
* resolves #23
* resolves #18
1 parent c0d2228 commit 8058484

File tree

7 files changed

+99
-28
lines changed

7 files changed

+99
-28
lines changed

pom.xml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,14 +107,12 @@
107107
<resource>META-INF/services/org.apache.lucene.codecs.Codec</resource>
108108
</transformer>
109109
</transformers>
110-
111110
<relocations>
112111
<relocation>
113112
<pattern>com.google.common.</pattern>
114113
<shadedPattern>com.google.common.shaded.</shadedPattern>
115114
</relocation>
116115
</relocations>
117-
118116
<!-- This fixes the issue "Invalid signature file digest for Manifest main attributes"
119117
cf. http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html -->
120118
<filters>
@@ -483,7 +481,7 @@
483481
<dependency>
484482
<groupId>org.netpreserve.commons</groupId>
485483
<artifactId>webarchive-commons</artifactId>
486-
<version>1.1.8</version>
484+
<version>1.1.9</version>
487485
<exclusions>
488486
<exclusion>
489487
<groupId>org.apache.hadoop</groupId>

src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
/*
2+
* Copyright © 2017 The Archives Unleashed Project
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
117
package io.archivesunleashed
218

319
import java.io.InputStream

src/main/scala/io/archivesunleashed/package.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -814,8 +814,11 @@ package object archivesunleashed {
814814
*
815815
* @param dates a list of dates to discard
816816
*/
817-
def discardDate(date: String): RDD[ArchiveRecord] = {
818-
rdd.filter(r => r.getCrawlDate != date)
817+
def discardDate(
818+
dates: List[String],
819+
component: DateComponent = DateComponent.YYYYMMDD
820+
): RDD[ArchiveRecord] = {
821+
rdd.filter(r => !dates.contains(ExtractDate(r.getCrawlDate, component)))
819822
}
820823

821824
/** Filters detected URLs.
55.9 KB
Binary file not shown.

src/test/scala/io/archivesunleashed/RecordRDDTest.scala

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
2727
@RunWith(classOf[JUnitRunner])
2828
class RecordRDDTest extends FunSuite with BeforeAndAfter {
2929
private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
30-
private val badPath = Resources.getResource("arc/badexample.arc.gz").getPath
3130
private val master = "local[4]"
3231
private val appName = "example-spark"
3332
private var sc: SparkContext = _
@@ -43,24 +42,6 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
4342
sc = new SparkContext(conf)
4443
}
4544

46-
test("Expect no valid pages RDD") {
47-
val expectedLength = 0
48-
val base = RecordLoader
49-
.loadArchives(badPath, sc)
50-
.keepValidPages()
51-
.take(2)
52-
assert(base.length == expectedLength)
53-
}
54-
55-
test("Expect no images RDD") {
56-
val expectedLength = 0
57-
val base = RecordLoader
58-
.loadArchives(badPath, sc)
59-
.keepValidPages()
60-
.take(2)
61-
assert(base.length == expectedLength)
62-
}
63-
6445
test("Keep date RDD") {
6546
val testDate = "2008"
6647
val base = RecordLoader.loadArchives(arcPath, sc)
@@ -220,10 +201,17 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
220201

221202
test("Discard date RDD") {
222203
val base = RecordLoader.loadArchives(arcPath, sc)
223-
val date = "20080430"
224-
val r = base.filter(x => !(x.getCrawlDate.contains(date))).collect()
225-
val r2 = base.discardDate(date).take(3)
226-
assert(r.deep == Array().deep)
204+
val date = "2007"
205+
val dateComponent = DateComponent.YYYY
206+
val r = base
207+
.filter(x => ExtractDate(x.getCrawlDate, dateComponent) != date)
208+
.map(mp => mp.getUrl)
209+
.take(3)
210+
val r2 = base
211+
.discardDate(List(date), dateComponent)
212+
.map(mp => mp.getUrl)
213+
.take(3)
214+
assert(r2.sameElements(r))
227215
}
228216

229217
test("Discard URLs RDD") {
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Copyright © 2017 The Archives Unleashed Project
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package io.archivesunleashed.app
18+
19+
import com.google.common.io.Resources
20+
import io.archivesunleashed.RecordLoader
21+
import io.archivesunleashed.udfs.{removeHTML, removeHTTPHeader}
22+
import org.apache.spark.sql.SparkSession
23+
import org.apache.spark.{SparkConf, SparkContext}
24+
import org.junit.runner.RunWith
25+
import org.scalatest.junit.JUnitRunner
26+
import org.scalatest.{BeforeAndAfter, FunSuite}
27+
28+
@RunWith(classOf[JUnitRunner])
29+
class Issue493Test extends FunSuite with BeforeAndAfter {
30+
private val arcPath = Resources.getResource("warc/issue-493.warc").getPath
31+
private val master = "local[4]"
32+
private val appName = "example-spark"
33+
private var sc: SparkContext = _
34+
35+
before {
36+
val conf = new SparkConf()
37+
.setMaster(master)
38+
.setAppName(appName)
39+
sc = new SparkContext(conf)
40+
}
41+
42+
test("Test for issue 493 - compressed payload warcs") {
43+
val df = RecordLoader.loadArchives(arcPath, sc).webpages()
44+
45+
// We need this in order to use the $-notation
46+
val spark = SparkSession.builder().master("local").getOrCreate()
47+
// scalastyle:off
48+
import spark.implicits._
49+
// scalastyle:on
50+
51+
val dfResults = df
52+
.select(removeHTML(removeHTTPHeader($"content")))
53+
.head(2)
54+
val RESULTSLENGTH = 2
55+
56+
assert(dfResults.length == RESULTSLENGTH)
57+
assert(dfResults(0).get(0) == "makkaronisch fuer niedlich")
58+
assert(dfResults(1).get(0) == "makkaronisch fuer niedlich die melodie")
59+
}
60+
61+
after {
62+
if (sc != null) {
63+
sc.stop()
64+
}
65+
}
66+
}

src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala renamed to src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala

File renamed without changes.

0 commit comments

Comments
 (0)