Skip to content

Commit 81e4203

Browse files
committed
Merge remote-tracking branch 'origin/main' into release/4.0.3
2 parents 4b18e96 + c014ac9 commit 81e4203

File tree

11 files changed

+196
-134
lines changed

11 files changed

+196
-134
lines changed

Dockerfile

Lines changed: 119 additions & 124 deletions
Original file line number | Diff line number | Diff line change
@@ -1,154 +1,149 @@
1-
FROM ubuntu:20.04
1+
FROM python:3.9-bookworm
22
ENV DEBIAN_FRONTEND noninteractive
33

44
LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
55
LABEL org.opencontainers.image.licenses MIT
66
LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
77

88
# Enable non-free archive for `unrar`.
9-
# RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list
10-
RUN apt-get -qq -y update \
11-
&& apt-get -qq -y install build-essential locales ca-certificates \
12-
# git
13-
git \
14-
# python deps (mostly to install their dependencies)
15-
python3-pip python3-dev python3-pil \
16-
# tesseract
17-
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\
18-
# libraries
19-
libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
20-
zlib1g-dev libicu-dev libxml2-dev \
21-
# package tools
22-
unrar p7zip-full \
23-
# audio & video metadata
24-
libmediainfo-dev \
25-
# image processing, djvu
26-
imagemagick-common imagemagick mdbtools djvulibre-bin \
27-
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
28-
libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \
29-
pst-utils \
30-
### tesseract
31-
tesseract-ocr-eng \
32-
tesseract-ocr-swa \
33-
tesseract-ocr-swe \
34-
# tesseract-ocr-tam \
35-
# tesseract-ocr-tel \
36-
tesseract-ocr-fil \
37-
# tesseract-ocr-tha \
38-
tesseract-ocr-tur \
39-
tesseract-ocr-ukr \
40-
# tesseract-ocr-vie \
41-
tesseract-ocr-nld \
42-
tesseract-ocr-nor \
43-
tesseract-ocr-pol \
44-
tesseract-ocr-por \
45-
tesseract-ocr-ron \
46-
tesseract-ocr-rus \
47-
tesseract-ocr-slk \
48-
tesseract-ocr-slv \
49-
tesseract-ocr-spa \
50-
# tesseract-ocr-spa_old \
51-
tesseract-ocr-sqi \
52-
tesseract-ocr-srp \
53-
tesseract-ocr-ind \
54-
tesseract-ocr-isl \
55-
tesseract-ocr-ita \
56-
# tesseract-ocr-ita_old \
57-
# tesseract-ocr-jpn \
58-
tesseract-ocr-kan \
59-
tesseract-ocr-kat \
60-
# tesseract-ocr-kor \
61-
tesseract-ocr-khm \
62-
tesseract-ocr-lav \
63-
tesseract-ocr-lit \
64-
# tesseract-ocr-mal \
65-
tesseract-ocr-mkd \
66-
tesseract-ocr-mya \
67-
tesseract-ocr-mlt \
68-
tesseract-ocr-msa \
69-
tesseract-ocr-est \
70-
# tesseract-ocr-eus \
71-
tesseract-ocr-fin \
72-
tesseract-ocr-fra \
73-
tesseract-ocr-frk \
74-
# tesseract-ocr-frm \
75-
# tesseract-ocr-glg \
76-
# tesseract-ocr-grc \
77-
tesseract-ocr-heb \
78-
tesseract-ocr-hin \
79-
tesseract-ocr-hrv \
80-
tesseract-ocr-hye \
81-
tesseract-ocr-hun \
82-
# tesseract-ocr-ben \
83-
tesseract-ocr-bul \
84-
tesseract-ocr-cat \
85-
tesseract-ocr-ces \
86-
tesseract-ocr-nep \
87-
# tesseract-ocr-chi_sim \
88-
# tesseract-ocr-chi_tra \
89-
# tesseract-ocr-chr \
90-
tesseract-ocr-dan \
91-
tesseract-ocr-deu \
92-
tesseract-ocr-ell \
93-
# tesseract-ocr-enm \
94-
# tesseract-ocr-epo \
95-
# tesseract-ocr-equ \
96-
tesseract-ocr-afr \
97-
tesseract-ocr-ara \
98-
tesseract-ocr-aze \
99-
tesseract-ocr-bel \
100-
tesseract-ocr-uzb \
101-
### pdf convert: libreoffice + a bunch of fonts
102-
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
103-
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
104-
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
105-
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
106-
fonts-tlwg-purisa \
107-
###
108-
&& apt-get -qq -y autoremove \
109-
&& apt-get clean \
110-
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
111-
&& localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
9+
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
10+
&& apt-get -qq -y update \
11+
&& apt-get -qq -y install build-essential locales \
12+
# python deps (mostly to install their dependencies)
13+
python3-dev \
14+
# tesseract
15+
tesseract-ocr libtesseract-dev libleptonica-dev \
16+
# libraries
17+
libldap2-dev libsasl2-dev \
18+
# package tools
19+
unrar p7zip-full \
20+
# audio & video metadata
21+
libmediainfo-dev \
22+
# image processing, djvu
23+
mdbtools djvulibre-bin \
24+
libtiff5-dev \
25+
libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \
26+
pst-utils libgif-dev \
27+
### tesseract
28+
tesseract-ocr-eng \
29+
tesseract-ocr-swa \
30+
tesseract-ocr-swe \
31+
# tesseract-ocr-tam \
32+
# tesseract-ocr-tel \
33+
tesseract-ocr-fil \
34+
# tesseract-ocr-tha \
35+
tesseract-ocr-tur \
36+
tesseract-ocr-ukr \
37+
# tesseract-ocr-vie \
38+
tesseract-ocr-nld \
39+
tesseract-ocr-nor \
40+
tesseract-ocr-pol \
41+
tesseract-ocr-por \
42+
tesseract-ocr-ron \
43+
tesseract-ocr-rus \
44+
tesseract-ocr-slk \
45+
tesseract-ocr-slv \
46+
tesseract-ocr-spa \
47+
# tesseract-ocr-spa_old \
48+
tesseract-ocr-sqi \
49+
tesseract-ocr-srp \
50+
tesseract-ocr-ind \
51+
tesseract-ocr-isl \
52+
tesseract-ocr-ita \
53+
# tesseract-ocr-ita_old \
54+
# tesseract-ocr-jpn \
55+
tesseract-ocr-kan \
56+
tesseract-ocr-kat \
57+
# tesseract-ocr-kor \
58+
tesseract-ocr-khm \
59+
tesseract-ocr-lav \
60+
tesseract-ocr-lit \
61+
# tesseract-ocr-mal \
62+
tesseract-ocr-mkd \
63+
tesseract-ocr-mya \
64+
tesseract-ocr-mlt \
65+
tesseract-ocr-msa \
66+
tesseract-ocr-est \
67+
# tesseract-ocr-eus \
68+
tesseract-ocr-fin \
69+
tesseract-ocr-fra \
70+
tesseract-ocr-frk \
71+
# tesseract-ocr-frm \
72+
# tesseract-ocr-glg \
73+
# tesseract-ocr-grc \
74+
tesseract-ocr-heb \
75+
tesseract-ocr-hin \
76+
tesseract-ocr-hrv \
77+
tesseract-ocr-hye \
78+
tesseract-ocr-hun \
79+
# tesseract-ocr-ben \
80+
tesseract-ocr-bul \
81+
tesseract-ocr-cat \
82+
tesseract-ocr-ces \
83+
tesseract-ocr-nep \
84+
# tesseract-ocr-chi_sim \
85+
# tesseract-ocr-chi_tra \
86+
# tesseract-ocr-chr \
87+
tesseract-ocr-dan \
88+
tesseract-ocr-deu \
89+
tesseract-ocr-ell \
90+
# tesseract-ocr-enm \
91+
# tesseract-ocr-epo \
92+
# tesseract-ocr-equ \
93+
tesseract-ocr-afr \
94+
tesseract-ocr-ara \
95+
tesseract-ocr-aze \
96+
tesseract-ocr-bel \
97+
tesseract-ocr-uzb \
98+
### pdf convert: libreoffice + a bunch of fonts
99+
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
100+
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \
101+
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
102+
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
103+
fonts-tlwg-purisa \
104+
###
105+
&& apt-get -qq -y autoremove \
106+
&& apt-get clean \
107+
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
108+
&& localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
112109

113110
# Set up the locale and make sure the system uses unicode for the file system.
114111
ENV LANG='en_US.UTF-8' \
115-
TZ='UTC' \
116-
OMP_THREAD_LIMIT='1' \
117-
OPENBLAS_NUM_THREADS='1'
112+
TZ='UTC' \
113+
OMP_THREAD_LIMIT='1' \
114+
OPENBLAS_NUM_THREADS='1'
118115

119116
RUN groupadd -g 1000 -r app \
120-
&& useradd -m -u 1000 -s /bin/false -g app app
117+
&& useradd -m -u 1000 -s /bin/false -g app app
121118

122119
# Download the ftm-typepredict model
123120
RUN mkdir /models/ && \
124-
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
121+
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
125122

126123
COPY requirements.txt /tmp/
127-
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
128-
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
129124
RUN pip3 install --no-cache-dir --no-binary "tesserocr" --no-binary "Pillow" -r /tmp/requirements.txt
130125

131126
# Install spaCy models
132127
RUN python3 -m spacy download en_core_web_sm \
133-
&& python3 -m spacy download de_core_news_sm \
134-
&& python3 -m spacy download fr_core_news_sm \
135-
&& python3 -m spacy download es_core_news_sm
128+
&& python3 -m spacy download de_core_news_sm \
129+
&& python3 -m spacy download fr_core_news_sm \
130+
&& python3 -m spacy download es_core_news_sm
136131
RUN python3 -m spacy download ru_core_news_sm \
137-
&& python3 -m spacy download pt_core_news_sm \
138-
&& python3 -m spacy download ro_core_news_sm \
139-
&& python3 -m spacy download mk_core_news_sm
132+
&& python3 -m spacy download pt_core_news_sm \
133+
&& python3 -m spacy download ro_core_news_sm \
134+
&& python3 -m spacy download mk_core_news_sm
140135
RUN python3 -m spacy download el_core_news_sm \
141-
&& python3 -m spacy download pl_core_news_sm \
142-
&& python3 -m spacy download it_core_news_sm \
143-
&& python3 -m spacy download lt_core_news_sm \
144-
&& python3 -m spacy download nl_core_news_sm \
145-
&& python3 -m spacy download nb_core_news_sm \
146-
&& python3 -m spacy download da_core_news_sm
136+
&& python3 -m spacy download pl_core_news_sm \
137+
&& python3 -m spacy download it_core_news_sm \
138+
&& python3 -m spacy download lt_core_news_sm \
139+
&& python3 -m spacy download nl_core_news_sm \
140+
&& python3 -m spacy download nb_core_news_sm \
141+
&& python3 -m spacy download da_core_news_sm
147142
# RUN python3 -m spacy download zh_core_web_sm
148143

149144
COPY . /ingestors
150145
WORKDIR /ingestors
151-
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
146+
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
152147
RUN chown -R app:app /ingestors
153148

154149
ENV ARCHIVE_TYPE=file \

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ format-check:
3636
black --check .
3737

3838
test: services
39-
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
39+
PYTHONDEVMODE=1 PYTHONTRACEMALLOC=1 $(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
4040

4141
restart: build
4242
$(COMPOSE) up --force-recreate --no-deps --detach ingest-file

docker-compose.yml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
version: "3.2"
2-
31
services:
42
postgres:
53
image: postgres:10.0

ingestors/cli.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,12 @@
88
from ftmstore import get_dataset
99
from servicelayer.cache import get_redis
1010
from servicelayer.logs import configure_logging
11-
from servicelayer.taskqueue import Dataset, Task
11+
from servicelayer.taskqueue import (
12+
Dataset,
13+
Task,
14+
get_rabbitmq_channel,
15+
declare_rabbitmq_queue,
16+
)
1217
from servicelayer import settings as sl_settings
1318
from servicelayer.archive.util import ensure_path
1419
from servicelayer import settings as sls
@@ -78,6 +83,7 @@ def _ingest_path(db, dataset, path, languages=[]):
7883
entity.make_id(checksum)
7984
entity.set("fileName", path.name)
8085
log.info("Queue: %r", entity.to_dict())
86+
8187
manager.queue_entity(entity)
8288
if path.is_dir():
8389
DirectoryIngestor.crawl(manager, path)
@@ -116,6 +122,7 @@ def analyze(dataset):
116122
def debug(path, languages=None):
117123
"""Debug the ingest for the given path."""
118124
settings.fts.DATABASE_URI = "sqlite:////tmp/debug.sqlite3"
125+
settings.TESTING = True
119126

120127
# collection ID that is meant for testing purposes only
121128
debug_datatset_id = 100
@@ -126,6 +133,13 @@ def debug(path, languages=None):
126133
database_uri=settings.fts.DATABASE_URI,
127134
)
128135
db.delete()
136+
channel = get_rabbitmq_channel()
137+
qos_mapping = {
138+
settings.STAGE_INGEST: settings.RABBITMQ_QOS_INGEST_QUEUE,
139+
settings.STAGE_ANALYZE: settings.RABBITMQ_QOS_ANALYZE_QUEUE,
140+
}
141+
for queue_name in qos_mapping.keys():
142+
declare_rabbitmq_queue(channel, queue_name, qos_mapping[queue_name])
129143
_ingest_path(db, debug_datatset_id, path, languages=languages)
130144
worker = get_worker()
131145
worker.process(blocking=False)

ingestors/tabular/ods.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,16 @@ def ingest(self, file_path, entity):
6363
table = self.manager.make_entity("Table", parent=entity)
6464
table.make_id(entity.id, name)
6565
table.set("title", name)
66+
# add workbook metadata to individual tables
67+
for metadatum in [
68+
"authoredAt",
69+
"author",
70+
"summary",
71+
"generator",
72+
"date",
73+
"processingAgent",
74+
]:
75+
table.set(metadatum, entity.get(metadatum))
6676
# Emit a partial table fragment with parent reference and name
6777
# early, so that we don't have orphan fragments in case of an error
6878
# in the middle of processing.

ingestors/tabular/xls.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,17 @@ def ingest(self, file_path, entity):
5959
table = self.manager.make_entity("Table", parent=entity)
6060
table.make_id(entity.id, sheet.name)
6161
table.set("title", sheet.name)
62+
# add workbook metadata to individual tables
63+
for metadatum in [
64+
"authoredAt",
65+
"modifiedAt",
66+
"author",
67+
"summary",
68+
"generator",
69+
"language",
70+
"processingAgent",
71+
]:
72+
table.set(metadatum, entity.get(metadatum))
6273
# Emit a partial table fragment with parent reference and name
6374
# early, so that we don't have orphan fragments in case of an error
6475
# in the middle of processing.

ingestors/tabular/xlsx.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,12 +48,22 @@ def ingest(self, file_path, entity):
4848
table = self.manager.make_entity("Table", parent=entity)
4949
table.make_id(entity.id, name)
5050
table.set("title", name)
51+
# add workbook metadata to individual tables
52+
for metadatum in [
53+
"authoredAt",
54+
"modifiedAt",
55+
"author",
56+
"summary",
57+
"generator",
58+
"language",
59+
"processingAgent",
60+
]:
61+
table.set(metadatum, entity.get(metadatum))
5162
# Emit a partial table fragment with parent reference and name
5263
# early, so that we don't have orphan fragments in case of an error
5364
# in the middle of processing.
5465
# See https://github.com/alephdata/ingest-file/issues/171
5566
self.manager.emit_entity(table, fragment="initial")
56-
log.debug("Sheet: %s", name)
5767
self.emit_row_tuples(table, self.generate_rows(sheet))
5868
if table.has("csvHash"):
5969
self.manager.emit_entity(table)

0 commit comments

Comments
 (0)