Commit bff0a4d

Implement #425
In spirit, at least
1 parent 591c84f · commit bff0a4d

154 files changed: +704 additions, -582 deletions (only a subset of the changed files is shown below)

backend/bootstrap.py

Lines changed: 5 additions & 1 deletion
@@ -8,6 +8,7 @@
 
 from common.lib.queue import JobQueue
 from common.lib.database import Database
+from common.lib.module_loader import ModuleCollector
 from backend.lib.manager import WorkerManager
 from common.lib.logger import Logger
 

@@ -66,9 +67,12 @@ def run(as_daemon=True, log_level="INFO"):
     config.with_db(db)
     config.ensure_database()
 
+    # load 4CAT modules and cache the results
+    modules = ModuleCollector(config=config, write_cache=True)
+
     # make it happen
     # this is blocking until the back-end is shut down
-    WorkerManager(logger=log, database=db, queue=queue, as_daemon=as_daemon)
+    WorkerManager(logger=log, database=db, queue=queue, modules=modules, as_daemon=as_daemon)
 
     # clean up pidfile, if running as daemon
     if as_daemon:
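
The change above sets the pattern for the whole commit: build the module collector and its config manager once at startup, then inject them downward, instead of importing a global `config` in each file. A minimal, hypothetical sketch of that flow with stand-in classes (the real ones live in common/lib/module_loader.py, backend/lib/manager.py and backend/lib/worker.py):

# Simplified stand-ins, not the actual 4CAT classes.

class Config:
    """Stand-in for the config manager loaded in bootstrap."""
    def get(self, key, default=None):
        return {"API_PORT": 4444}.get(key, default)

class ModuleCollector:
    """Scans datasources/processors once and carries its config along."""
    def __init__(self, config, write_cache=False):
        self.config = config       # workers later reach this as modules.config
        self.datasources = {}      # module scan omitted in this sketch

class Worker:
    def __init__(self, modules):
        self.config = modules.config   # mirrors the backend/lib/worker.py hunk below

class WorkerManager:
    def __init__(self, modules):
        self.modules = modules     # injected, no longer constructed here

    def spawn(self):
        return Worker(modules=self.modules)

# bootstrap: load once, inject downward
modules = ModuleCollector(config=Config(), write_cache=True)
print(WorkerManager(modules=modules).spawn().config.get("API_PORT"))  # -> 4444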

backend/lib/manager.py

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,6 @@
 import signal
 import time
 
-from common.lib.module_loader import ModuleCollector
 from common.lib.exceptions import JobClaimedException
 
 

@@ -22,19 +21,20 @@ class WorkerManager:
     pool = []
     looping = True
 
-    def __init__(self, queue, database, logger, as_daemon=True):
+    def __init__(self, queue, database, logger, modules, as_daemon=True):
         """
         Initialize manager
 
         :param queue: Job queue
         :param database: Database handler
         :param logger: Logger object
+        :param modules: Modules cache, as built by ModuleCollector()
         :param bool as_daemon: Whether the manager is being run as a daemon
         """
         self.queue = queue
         self.db = database
         self.log = logger
-        self.modules = ModuleCollector(write_config=True)
+        self.modules = modules
 
         if as_daemon:
             signal.signal(signal.SIGTERM, self.abort)

backend/lib/processor.py

Lines changed: 17 additions & 19 deletions
@@ -19,7 +19,7 @@
 from common.lib.helpers import get_software_commit, remove_nuls, send_email
 from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException,
         DataSetException, MapItemException)
-from common.config_manager import config, ConfigWrapper
+from common.config_manager import ConfigWrapper
 from common.lib.user import User
 
 

@@ -37,14 +37,14 @@ class BasicProcessor(FourcatModule, BasicWorker, metaclass=abc.ABCMeta):
     useful is another question).
 
     To determine whether a processor can process a given dataset, you can
-    define a `is_compatible_with(FourcatModule module=None, str user=None) -> bool` class
+    define a `is_compatible_with(FourcatModule module=None, config=None) -> bool` class
     method which takes a dataset as argument and returns a bool that determines
     if this processor is considered compatible with that dataset. For example:
 
     .. code-block:: python
 
         @classmethod
-        def is_compatible_with(cls, module=None, user=None):
+        def is_compatible_with(cls, module=None, config=None):
             return module.type == "linguistic-features"
 
 

@@ -109,11 +109,10 @@ def work(self):
             self.job.finish()
             return
 
-        # set up config reader using the worker's DB connection and the dataset
-        # creator. This ensures that if a value has been overriden for the owner,
-        # the overridden value is used instead.
-        config.with_db(self.db)
-        self.config = ConfigWrapper(config=config, user=User.get_by_name(self.db, self.owner))
+        # set up config reader wrapping the worker's config manager, which is
+        # in turn the one passed to it by the WorkerManager, which is the one
+        # originally loaded in bootstrap
+        self.config = ConfigWrapper(config=self.config, user=User.get_by_name(self.db, self.owner))
 
         if self.dataset.data.get("key_parent", None):
             # search workers never have parents (for now), so we don't need to

@@ -170,7 +169,7 @@ def work(self):
         # get parameters
         # if possible, fill defaults where parameters are not provided
         given_parameters = self.dataset.parameters.copy()
-        all_parameters = self.get_options(self.dataset)
+        all_parameters = self.get_options(self.dataset, config=self.config)
         self.parameters = {
             param: given_parameters.get(param, all_parameters.get(param, {}).get("default"))
             for param in [*all_parameters.keys(), *given_parameters.keys()]

@@ -179,7 +178,7 @@ def work(self):
         # now the parameters have been loaded into memory, clear any sensitive
         # ones. This has a side-effect that a processor may not run again
         # without starting from scratch, but this is the price of progress
-        options = self.get_options(self.dataset.get_parent())
+        options = self.get_options(self.dataset.get_parent(), config=self.config)
         for option, option_settings in options.items():
             if option_settings.get("sensitive"):
                 self.dataset.delete_parameter(option)

@@ -241,7 +240,7 @@ def after_process(self):
             next_parameters = next.get("parameters", {})
             next_type = next.get("type", "")
             try:
-                available_processors = self.dataset.get_available_processors(user=self.dataset.creator)
+                available_processors = self.dataset.get_available_processors(config=self.config)
             except ValueError:
                 self.log.info("Trying to queue next processor, but parent dataset no longer exists, halting")
                 break

@@ -329,7 +328,7 @@ def after_process(self):
 
         self.job.finish()
 
-        if config.get('mail.server') and self.dataset.get_parameters().get("email-complete", False):
+        if self.config.get('mail.server') and self.dataset.get_parameters().get("email-complete", False):
            owner = self.dataset.get_parameters().get("email-complete", False)
            # Check that username is email address
            if re.match(r"[^@]+\@.*?\.[a-zA-Z]+", owner):

@@ -340,8 +339,8 @@ def after_process(self):
                import html2text
 
                self.log.debug("Sending email to %s" % owner)
-               dataset_url = ('https://' if config.get('flask.https') else 'http://') + config.get('flask.server_name') + '/results/' + self.dataset.key
-               sender = config.get('mail.noreply')
+               dataset_url = ('https://' if self.config.get('flask.https') else 'http://') + self.config.get('flask.server_name') + '/results/' + self.dataset.key
+               sender = self.config.get('mail.noreply')
                message = MIMEMultipart("alternative")
                message["From"] = sender
                message["To"] = owner

@@ -778,7 +777,7 @@ def is_filter(cls):
         return hasattr(cls, "category") and cls.category and "filter" in cls.category.lower()
 
     @classmethod
-    def get_options(cls, parent_dataset=None, user=None):
+    def get_options(cls, parent_dataset=None, config=None):
         """
         Get processor options
 

@@ -787,12 +786,11 @@ def get_options(cls, parent_dataset=None, config=None):
         fine-grained options, e.g. in cases where the availability of options
         is partially determined by the parent dataset's parameters.
 
+        :param config:
         :param DataSet parent_dataset: An object representing the dataset that
         the processor would be run on
-        :param User user: Flask user the options will be displayed for, in
-        case they are requested for display in the 4CAT web interface. This can
-        be used to show some options only to privileges users.
-        """
+
+        """
         return cls.options if hasattr(cls, "options") else {}
 
     @classmethod
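
For orientation, ConfigWrapper layers a user context over a config manager, so that get() returns a value overridden for that user before falling back to the wrapped config; because wrappers defer to whatever they wrap, the processor can safely wrap self.config even when that is itself already a wrapper. A rough, hypothetical sketch of this overlay behaviour (the real class in common/config_manager.py resolves overrides through the database rather than a constructor argument):

# Hypothetical sketch; the real ConfigWrapper reads per-user overrides from
# the database instead of taking them as a constructor argument.

class ConfigWrapper:
    def __init__(self, config, user=None, overrides=None):
        self.config = config               # the wrapped config manager
        self.user = user
        self.overrides = overrides or {}   # e.g. {"mail.server": "smtp.example.net"}

    def get(self, key, default=None):
        # a user-specific override wins; otherwise defer to the wrapped config
        if key in self.overrides:
            return self.overrides[key]
        return self.config.get(key, default)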

backend/lib/search.py

Lines changed: 2 additions & 1 deletion
@@ -63,7 +63,8 @@ def process(self):
         query_parameters = self.dataset.get_parameters()
         results_file = self.dataset.get_results_path()
 
-        self.log.info("Querying: %s" % str({k: v for k, v in query_parameters.items() if not self.get_options().get(k, {}).get("sensitive", False)}))
+        self.log.info("Querying: %s" % str({k: v for k, v in query_parameters.items() if not self.get_options(
+            config=self.config).get(k, {}).get("sensitive", False)}))
 
         # Execute the relevant query (string-based, random, countryflag-based)
         try:

backend/lib/worker.py

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ def __init__(self, logger, job, queue=None, manager=None, modules=None):
         self.manager = manager
         self.job = job
         self.init_time = int(time.time())
-        self.config = ConfigDummy()
+        self.config = modules.config
 
         # ModuleCollector cannot be easily imported into a worker because it itself
         # imports all workers, so you get a recursive import that Python (rightly) blocks

backend/workers/api.py

Lines changed: 5 additions & 2 deletions
@@ -15,8 +15,8 @@ class InternalAPI(BasicWorker):
 
     ensure_job = {"remote_id": "localhost"}
 
-    host = config.get('API_HOST')
-    port = config.get('API_PORT')
+    host = None
+    port = None
 
     def work(self):
         """

@@ -27,6 +27,9 @@ def work(self):
 
         :return:
         """
+        self.host = self.config.get('API_HOST')
+        self.port = self.config.get('API_PORT')
+
         if self.port == 0:
             # if configured not to listen, just loop until the backend shuts
             # down we can't return here immediately, since this is a worker,
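
Moving the reads from class attributes into work() matters because a class body executes at import time, before any worker instance, and therefore before self.config, exists. A small illustration of the two evaluation times:

import time

class Example:
    # evaluated once, when the module is imported; no instance state exists yet
    created_at = time.time()

    def work(self):
        # evaluated per call, when self (and e.g. self.config) is available
        self.started_at = time.time()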

backend/workers/check_updates.py

Lines changed: 3 additions & 4 deletions
@@ -2,7 +2,6 @@
 import requests
 import json
 
-from common.config_manager import config
 from common.lib.helpers import add_notification, get_github_version
 from backend.lib.worker import BasicWorker
 from pathlib import Path

@@ -22,11 +21,11 @@ class UpdateChecker(BasicWorker):
     max_workers = 1
 
     # check once every three hours
-    ensure_job = {"remote_id": config.get("4cat.github_url"), "interval": 10800}
+    ensure_job = {"remote_id": self.config.get("4cat.github_url"), "interval": 10800}
 
     def work(self):
-        versionfile = Path(config.get("PATH_ROOT"), "config/.current-version")
-        repo_url = config.get("4cat.github_url")
+        versionfile = Path(self.config.get("PATH_ROOT"), "config/.current-version")
+        repo_url = self.config.get("4cat.github_url")
 
         if not versionfile.exists() or not repo_url:
             # need something to compare against...
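
One caveat on the ensure_job hunk above: self is not defined inside a class body, so a class-level self.config.get(...) raises a NameError as soon as the module is imported. Deferring the lookup until an instance exists, as api.py does elsewhere in this commit, expresses the same intent; a hypothetical rework, not what the commit contains:

# Hypothetical rework: resolve the job's remote_id once a config manager is
# attached to the instance, rather than at class-definition time.

class UpdateChecker:
    ensure_job = {"remote_id": None, "interval": 10800}  # placeholder until runtime

    def __init__(self, config):
        self.config = config
        self.ensure_job = {**UpdateChecker.ensure_job,
                           "remote_id": self.config.get("4cat.github_url")}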

backend/workers/cleanup_tempfiles.py

Lines changed: 1 addition & 2 deletions
@@ -6,7 +6,6 @@
 
 from pathlib import Path
 
-from common.config_manager import config
 from backend.lib.worker import BasicWorker
 from common.lib.dataset import DataSet
 from common.lib.exceptions import WorkerInterruptedException, DataSetException

@@ -34,7 +33,7 @@ def work(self):
         :return:
         """
 
-        result_files = Path(config.get('PATH_DATA')).glob("*")
+        result_files = Path(self.config.get('PATH_DATA')).glob("*")
         for file in result_files:
             if file.stem.startswith("."):
                 # skip hidden files

backend/workers/datasource_metrics.py

Lines changed: 5 additions & 6 deletions
@@ -13,7 +13,6 @@
 from datetime import datetime, time, timezone
 
 from backend.lib.worker import BasicWorker
-from common.config_manager import config
 
 
 class DatasourceMetrics(BasicWorker):

@@ -52,9 +51,9 @@ def general_stats(self):
         this worker instead of on demand.
         """
         metrics = {
-            "size_data": DatasourceMetrics.folder_size(config.get("PATH_DATA")),
-            "size_logs": DatasourceMetrics.folder_size(config.get("PATH_LOGS")),
-            "size_db": self.db.fetchone("SELECT pg_database_size(%s) AS num", (config.get("DB_NAME"),))["num"]
+            "size_data": DatasourceMetrics.folder_size(self.config.get("PATH_DATA")),
+            "size_logs": DatasourceMetrics.folder_size(self.config.get("PATH_LOGS")),
+            "size_db": self.db.fetchone("SELECT pg_database_size(%s) AS num", (self.config.get("DB_NAME"),))["num"]
         }
 
         for metric, value in metrics.items():

@@ -95,7 +94,7 @@ def data_stats(self):
         """)
 
         added_datasources = [row["datasource"] for row in self.db.fetchall("SELECT DISTINCT(datasource) FROM metrics")]
-        enabled_datasources = config.get("datasources.enabled", {})
+        enabled_datasources = self.config.get("datasources.enabled", {})
 
         for datasource_id in self.modules.datasources:
             if datasource_id not in enabled_datasources:

@@ -121,7 +120,7 @@ def data_stats(self):
             elif datasource_id == "8chan":
                 settings_id = "eightchan"
 
-            boards = [b for b in config.get(settings_id + "-search.boards", [])]
+            boards = [b for b in self.config.get(settings_id + "-search.boards", [])]
 
             # If a datasource is static (so not updated) and it
             # is already present in the metrics table, we don't

backend/workers/expire_items.py

Lines changed: 4 additions & 2 deletions
@@ -3,14 +3,14 @@
 """
 import datetime
 import time
-import json
 import re
 
 from backend.lib.worker import BasicWorker
 from common.lib.dataset import DataSet
 from common.lib.exceptions import DataSetNotFoundException, WorkerInterruptedException
 
 from common.lib.user import User
+from common.config_manager import ConfigWrapper
 
 
 class ThingExpirer(BasicWorker):

@@ -58,9 +58,11 @@ def expire_datasets(self):
            if self.interrupted:
                raise WorkerInterruptedException("Interrupted while expiring datasets")
 
+           # the dataset creator's configuration context determines expiration
+           wrapper = ConfigWrapper(self.config, user=dataset["creator"])
            try:
                dataset = DataSet(key=dataset["key"], db=self.db)
-               if dataset.is_expired():
+               if dataset.is_expired(config=wrapper):
                    self.log.info(f"Deleting dataset {dataset.key} (expired)")
                    dataset.delete()
 

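Wrapping the worker's config in the dataset creator's context means per-user retention settings decide when a dataset expires. A hedged sketch of the kind of check is_expired(config=...) presumably performs; the real logic lives in common/lib/dataset.py, and the "expire.timeout" key here is an assumption, not a confirmed setting name:

import time

# Hypothetical sketch; the actual check in common/lib/dataset.py is more
# involved, and "expire.timeout" stands in for whichever setting applies.
def is_expired(created_at, config):
    timeout = config.get("expire.timeout", 0)   # may differ per dataset creator
    return timeout > 0 and created_at + timeout < time.time()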