Skip to content

Commit 344e5e2

Browse files
jack-gitdevd-hwangjalphonso
authored
simple annotation ingest (#3243) (#3245)
Dan was having GitHub issues, so I pushed this for him; reach out to me or to him if you have questions about this PR. --------- Co-authored-by: d-hwang <[email protected]> Co-authored-by: Joseph Alphonso <[email protected]>
1 parent 3a60df6 commit 344e5e2

File tree

22 files changed

+1168
-9
lines changed

22 files changed

+1168
-9
lines changed

contrib/datawave-quickstart/bin/services/datawave/bootstrap-ingest.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ DW_DATAWAVE_INGEST_TEST_DATA_SKIP=${DW_DATAWAVE_INGEST_TEST_DATA_SKIP:-false}
5858
DW_DATAWAVE_INGEST_TEST_FILE_WIKI=${DW_DATAWAVE_INGEST_TEST_FILE_WIKI:-"${DW_DATAWAVE_SOURCE_DIR}/warehouse/ingest-wikipedia/src/test/resources/input/enwiki-20130305-pages-articles-brief.xml"}
5959
DW_DATAWAVE_INGEST_TEST_FILE_CSV=${DW_DATAWAVE_INGEST_TEST_FILE_CSV:-"${DW_DATAWAVE_SOURCE_DIR}/warehouse/ingest-csv/src/test/resources/input/my.csv"}
6060
DW_DATAWAVE_INGEST_TEST_FILE_JSON=${DW_DATAWAVE_INGEST_TEST_FILE_JSON:-"${DW_DATAWAVE_SOURCE_DIR}/warehouse/ingest-json/src/test/resources/input/tvmaze-api.json"}
61+
DW_DATAWAVE_INGEST_TEST_FILE_ANNOTATION=${DW_DATAWAVE_INGEST_TEST_FILE_ANNOTATION:-"${DW_DATAWAVE_SOURCE_DIR}/warehouse/ingest-annotation/src/test/resources/input/doubleAnnotation.json"}
6162
DW_DATAWAVE_INGEST_MEDIUM_FILE_WIKI=${DW_DATAWAVE_INGEST_MEDIUM_FILE_WIKI:-"${DW_DATAWAVE_SOURCE_DIR}/warehouse/ingest-wikipedia/src/test/resources/input/enwiki-20250519-pages-articles-medium.xml.gz"}
6263
DW_DATAWAVE_INGEST_DE_TEST_FILE_WIKI=${DW_DATAWAVE_INGEST_DE_TEST_FILE_WIKI:-"${DW_DATAWAVE_SOURCE_DIR}/warehouse/ingest-wikipedia/src/test/resources/input/dewiki-20250520-pages-articles-brief.xml"}
6364
DW_DATAWAVE_INGEST_ES_TEST_FILE_WIKI=${DW_DATAWAVE_INGEST_ES_TEST_FILE_WIKI:-"${DW_DATAWAVE_SOURCE_DIR}/warehouse/ingest-wikipedia/src/test/resources/input/eswiki-20250520-pages-articles-brief.xml"}
@@ -227,6 +228,27 @@ function datawaveIngestCsv() {
227228
launchIngestJob "${csvRawFile}"
228229
}
229230

231+
function datawaveIngestAnnotation() {

   # Ingests a raw annotation file via live-ingest.sh.
   #
   # Uses example ingest config: annotation-ingest-config.xml
   #
   # Arguments:
   #   $1 - path to the local raw annotation file (required, must exist)
   #   $2 - optional extra options appended verbatim to the live-ingest.sh
   #        command line
   # Globals (read):
   #   DW_DATAWAVE_INGEST_HDFS_BASEDIR, DW_DATAWAVE_INGEST_HOME,
   #   DW_DATAWAVE_INGEST_NUM_SHARDS
   # Returns:
   #   1 if the raw file argument is missing or is not a regular file;
   #   otherwise the exit status of launchIngestJob

   local annotationRawFile="${1}"
   local extraOpts="${2}"

   # Use explicit if-blocks rather than "test && error && return 1", so that
   # we still bail out even when error() itself returns non-zero
   if [ -z "${annotationRawFile}" ] ; then
      error "Missing raw file argument"
      return 1
   fi

   if [ ! -f "${annotationRawFile}" ] ; then
      error "File not found: ${annotationRawFile}"
      return 1
   fi

   # Declare separately from the assignment so a basename failure is not
   # masked by the exit status of 'local'
   local annotationHdfsFile
   annotationHdfsFile="${DW_DATAWAVE_INGEST_HDFS_BASEDIR}/$( basename -- "${annotationRawFile}" )" || return 1

   # NOTE(review): putFileCommand and jobCommand are not referenced again in
   # this function; presumably launchIngestJob reads them from the caller's
   # scope, so their names must not change -- confirm against launchIngestJob
   local putFileCommand="hdfs dfs -copyFromLocal -f ${annotationRawFile} ${annotationHdfsFile}"

   local inputFormat="datawave.ingest.annotation.mapreduce.input.SimpleAnnotationInputFormat"
   local jobCommand="${DW_DATAWAVE_INGEST_HOME}/bin/ingest/live-ingest.sh ${annotationHdfsFile} ${DW_DATAWAVE_INGEST_NUM_SHARDS} -inputFormat ${inputFormat} -data.name.override=annotation ${extraOpts}"

   launchIngestJob "${annotationRawFile}"
}
251+
230252
function datawaveIngestJson() {
231253

232254
# Uses example ingest config: myjson-ingest-config.xml
@@ -359,6 +381,7 @@ function datawaveIngestTarballName() {
359381
function datawaveIngestExamples() {
360382
# basic examples of each format
361383
datawaveIngestWikipedia ${DW_DATAWAVE_INGEST_TEST_FILE_WIKI}
384+
datawaveIngestAnnotation ${DW_DATAWAVE_INGEST_TEST_FILE_ANNOTATION}
362385
datawaveIngestJson ${DW_DATAWAVE_INGEST_TEST_FILE_JSON}
363386
datawaveIngestCsv ${DW_DATAWAVE_INGEST_TEST_FILE_CSV}
364387

coverage/pom.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,16 @@
6262
<artifactId>datawave-ingest-nyctlc</artifactId>
6363
<version>${project.version}</version>
6464
</dependency>
65+
<dependency>
66+
<groupId>gov.nsa.datawave</groupId>
67+
<artifactId>datawave-annotation-core</artifactId>
68+
<version>${project.version}</version>
69+
</dependency>
70+
<dependency>
71+
<groupId>gov.nsa.datawave</groupId>
72+
<artifactId>datawave-ingest-annotation</artifactId>
73+
<version>${project.version}</version>
74+
</dependency>
6575
<dependency>
6676
<groupId>gov.nsa.datawave</groupId>
6777
<artifactId>datawave-ingest-wikipedia</artifactId>

pom.xml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,9 @@
133133
<version.objenesis>2.1</version.objenesis>
134134
<version.picketbox>5.0.3.Final</version.picketbox>
135135
<version.powermock>2.0.9</version.powermock>
136+
<version.proto-google-common-protos>2.61.0</version.proto-google-common-protos>
136137
<version.protobuf>3.16.3</version.protobuf>
138+
<version.protobuf-java-util>3.16.3</version.protobuf-java-util>
137139
<version.protostuff>1.6.2</version.protostuff>
138140
<version.slf4j>2.0.12</version.slf4j>
139141
<version.spotify-dns>3.1.5</version.spotify-dns>
@@ -225,6 +227,11 @@
225227
<artifactId>caffeine</artifactId>
226228
<version>${version.caffeine}</version>
227229
</dependency>
230+
<dependency>
231+
<groupId>com.google.api.grpc</groupId>
232+
<artifactId>proto-google-common-protos</artifactId>
233+
<version>${version.proto-google-common-protos}</version>
234+
</dependency>
228235
<dependency>
229236
<groupId>com.google.code.findbugs</groupId>
230237
<artifactId>annotations</artifactId>
@@ -246,6 +253,17 @@
246253
<artifactId>protobuf-java</artifactId>
247254
<version>${version.protobuf}</version>
248255
</dependency>
256+
<!-- Managed entry for protobuf-java-util. The version comes from the
     dedicated version.protobuf-java-util property declared in this POM's
     <properties>, so that property is actually used. -->
<dependency>
    <groupId>com.google.protobuf</groupId>
    <artifactId>protobuf-java-util</artifactId>
    <version>${version.protobuf-java-util}</version>
    <exclusions>
        <!-- Wildcard exclusion: no transitive dependencies are pulled in through this artifact -->
        <exclusion>
            <groupId>*</groupId>
            <artifactId>*</artifactId>
        </exclusion>
    </exclusions>
</dependency>
249267
<dependency>
250268
<groupId>com.googlecode.json-simple</groupId>
251269
<artifactId>json-simple</artifactId>

properties/compose.properties

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ LIVE_CHILD_MAP_MAX_MEMORY_MB=1024
4242
BULK_CHILD_REDUCE_MAX_MEMORY_MB=2048
4343
LIVE_CHILD_REDUCE_MAX_MEMORY_MB=1024
4444

45-
BULK_INGEST_DATA_TYPES=shardStats
46-
LIVE_INGEST_DATA_TYPES=wikipedia,mycsv,myjson
45+
BULK_INGEST_DATA_TYPES=shardStats,annotation
46+
LIVE_INGEST_DATA_TYPES=wikipedia,mycsv,myjson,annotation
4747

4848
# Clear out these values if you do not want standard shard ingest.
4949
DEFAULT_SHARD_HANDLER_CLASSES=datawave.ingest.mapreduce.handler.shard.AbstractColumnBasedHandler

properties/dev.properties

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ LIVE_CHILD_MAP_MAX_MEMORY_MB=1024
3939
BULK_CHILD_REDUCE_MAX_MEMORY_MB=2048
4040
LIVE_CHILD_REDUCE_MAX_MEMORY_MB=1024
4141

42-
BULK_INGEST_DATA_TYPES=shardStats,wikipedia,mycsv,myjson
43-
LIVE_INGEST_DATA_TYPES=wikipedia,mycsv,myjson
42+
BULK_INGEST_DATA_TYPES=shardStats,wikipedia,mycsv,myjson,annotation
43+
LIVE_INGEST_DATA_TYPES=wikipedia,mycsv,myjson,annotation
4444

4545
# Clear out these values if you do not want standard shard ingest.
4646
DEFAULT_SHARD_HANDLER_CLASSES=datawave.ingest.mapreduce.handler.shard.AbstractColumnBasedHandler

warehouse/annotation-core/pom.xml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,15 @@
99
<artifactId>datawave-annotation-core</artifactId>
1010
<properties>
1111
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
12-
<version.proto-google-common-protos>2.61.0</version.proto-google-common-protos>
13-
<version.protobuf>3.16.3</version.protobuf>
1412
</properties>
1513
<dependencies>
1614
<dependency>
1715
<groupId>com.google.api.grpc</groupId>
1816
<artifactId>proto-google-common-protos</artifactId>
19-
<version>${version.proto-google-common-protos}</version>
2017
</dependency>
2118
<dependency>
2219
<groupId>com.google.protobuf</groupId>
2320
<artifactId>protobuf-java</artifactId>
24-
<version>${version.protobuf}</version>
2521
<exclusions>
2622
<exclusion>
2723
<groupId>*</groupId>
@@ -32,7 +28,6 @@
3228
<dependency>
3329
<groupId>com.google.protobuf</groupId>
3430
<artifactId>protobuf-java-util</artifactId>
35-
<version>${version.protobuf}</version>
3631
<exclusions>
3732
<exclusion>
3833
<groupId>*</groupId>

warehouse/assemble/datawave/pom.xml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,14 @@
2525
<groupId>com.esotericsoftware.minlog</groupId>
2626
<artifactId>minlog</artifactId>
2727
</dependency>
28+
<dependency>
29+
<groupId>com.google.api.grpc</groupId>
30+
<artifactId>proto-google-common-protos</artifactId>
31+
</dependency>
32+
<dependency>
33+
<groupId>com.google.protobuf</groupId>
34+
<artifactId>protobuf-java-util</artifactId>
35+
</dependency>
2836
<dependency>
2937
<groupId>com.googlecode.json-simple</groupId>
3038
<artifactId>json-simple</artifactId>
@@ -37,6 +45,11 @@
3745
<groupId>gov.nsa.datawave</groupId>
3846
<artifactId>datawave-accumulo-extensions</artifactId>
3947
</dependency>
48+
<dependency>
49+
<groupId>gov.nsa.datawave</groupId>
50+
<artifactId>datawave-annotation-core</artifactId>
51+
<version>${project.version}</version>
52+
</dependency>
4053
<dependency>
4154
<groupId>gov.nsa.datawave</groupId>
4255
<artifactId>datawave-common</artifactId>
@@ -62,6 +75,11 @@
6275
<artifactId>datawave-index-stats</artifactId>
6376
<version>${project.version}</version>
6477
</dependency>
78+
<dependency>
79+
<groupId>gov.nsa.datawave</groupId>
80+
<artifactId>datawave-ingest-annotation</artifactId>
81+
<version>${project.version}</version>
82+
</dependency>
6583
<dependency>
6684
<groupId>gov.nsa.datawave</groupId>
6785
<artifactId>datawave-ingest-configuration</artifactId>
@@ -694,6 +712,8 @@
694712
<include>${project.groupId}:datawave-data-dictionary-core</include>
695713
<include>${project.groupId}:datawave-edge-dictionary-core</include>
696714
<include>${project.groupId}:datawave-edge-model-configuration-core</include>
715+
<include>${project.groupId}:datawave-annotation-core</include>
716+
<include>${project.groupId}:datawave-ingest-annotation</include>
697717
<include>${project.groupId}:datawave-ingest-wikipedia</include>
698718
<include>${project.groupId}:datawave-ingest-nyctlc</include>
699719
<include>${project.groupId}:datawave-ingest-ssdeep</include>
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3+
<modelVersion>4.0.0</modelVersion>
4+
<parent>
5+
<groupId>gov.nsa.datawave</groupId>
6+
<artifactId>datawave-warehouse-parent</artifactId>
7+
<version>7.33.0-SNAPSHOT</version>
8+
</parent>
9+
<artifactId>datawave-ingest-annotation</artifactId>
10+
<packaging>jar</packaging>
11+
<name>${project.artifactId}</name>
12+
<dependencies>
13+
<dependency>
14+
<groupId>gov.nsa.datawave</groupId>
15+
<artifactId>datawave-annotation-core</artifactId>
16+
<version>${project.version}</version>
17+
</dependency>
18+
<dependency>
19+
<groupId>gov.nsa.datawave</groupId>
20+
<artifactId>datawave-core</artifactId>
21+
<version>${project.version}</version>
22+
</dependency>
23+
<dependency>
24+
<groupId>gov.nsa.datawave</groupId>
25+
<artifactId>datawave-ingest-configuration</artifactId>
26+
<version>${project.version}</version>
27+
</dependency>
28+
<!-- datawave-ingest-core is declared once only. The original listed the same
     groupId:artifactId twice (once with a version, once without), which
     Maven reports as a duplicate dependency declaration. -->
<dependency>
    <groupId>gov.nsa.datawave</groupId>
    <artifactId>datawave-ingest-core</artifactId>
    <version>${project.version}</version>
</dependency>
37+
<dependency>
38+
<groupId>gov.nsa.datawave.webservices</groupId>
39+
<artifactId>datawave-ws-common-util</artifactId>
40+
<version>${project.version}</version>
41+
</dependency>
42+
<dependency>
43+
<groupId>org.apache.hadoop</groupId>
44+
<artifactId>hadoop-distcp</artifactId>
45+
</dependency>
46+
<dependency>
47+
<groupId>org.apache.hadoop</groupId>
48+
<artifactId>hadoop-mapreduce-client-common</artifactId>
49+
</dependency>
50+
<dependency>
51+
<groupId>org.apache.hadoop</groupId>
52+
<artifactId>hadoop-mapreduce-client-core</artifactId>
53+
</dependency>
54+
<dependency>
55+
<groupId>org.apache.lucene</groupId>
56+
<artifactId>lucene-analyzers-common</artifactId>
57+
</dependency>
58+
<dependency>
59+
<groupId>org.apache.lucene</groupId>
60+
<artifactId>lucene-core</artifactId>
61+
</dependency>
62+
<dependency>
63+
<groupId>org.easymock</groupId>
64+
<artifactId>easymock</artifactId>
65+
</dependency>
66+
<dependency>
67+
<groupId>org.infinispan</groupId>
68+
<artifactId>infinispan-commons</artifactId>
69+
</dependency>
70+
<dependency>
71+
<groupId>org.powermock</groupId>
72+
<artifactId>powermock-api-easymock</artifactId>
73+
</dependency>
74+
<dependency>
75+
<groupId>org.powermock</groupId>
76+
<artifactId>powermock-module-junit4</artifactId>
77+
</dependency>
78+
<dependency>
79+
<groupId>com.fasterxml.woodstox</groupId>
80+
<artifactId>woodstox-core</artifactId>
81+
<scope>provided</scope>
82+
</dependency>
83+
<dependency>
84+
<groupId>org.apache.accumulo</groupId>
85+
<artifactId>accumulo-start</artifactId>
86+
<scope>provided</scope>
87+
</dependency>
88+
<dependency>
89+
<groupId>org.apache.commons</groupId>
90+
<artifactId>commons-configuration2</artifactId>
91+
<scope>provided</scope>
92+
</dependency>
93+
<dependency>
94+
<groupId>com.sun.mail</groupId>
95+
<artifactId>mailapi</artifactId>
96+
<version>1.4.5</version>
97+
<scope>test</scope>
98+
</dependency>
99+
<dependency>
100+
<groupId>gov.nsa.datawave.webservices</groupId>
101+
<artifactId>datawave-ws-common</artifactId>
102+
<scope>test</scope>
103+
</dependency>
104+
<dependency>
105+
<groupId>javax.mail</groupId>
106+
<artifactId>javax.mail-api</artifactId>
107+
<version>1.5.4</version>
108+
<scope>test</scope>
109+
</dependency>
110+
<dependency>
111+
<groupId>org.apache.accumulo</groupId>
112+
<artifactId>accumulo-minicluster</artifactId>
113+
<version>${version.accumulo}</version>
114+
<scope>test</scope>
115+
</dependency>
116+
<dependency>
117+
<groupId>org.javassist</groupId>
118+
<artifactId>javassist</artifactId>
119+
<scope>test</scope>
120+
</dependency>
121+
</dependencies>
122+
<build>
123+
<plugins>
124+
<plugin>
125+
<groupId>org.apache.maven.plugins</groupId>
126+
<artifactId>maven-jar-plugin</artifactId>
127+
<executions>
128+
<execution>
129+
<goals>
130+
<goal>test-jar</goal>
131+
</goals>
132+
</execution>
133+
</executions>
134+
</plugin>
135+
</plugins>
136+
</build>
137+
</project>

0 commit comments

Comments
 (0)