Skip to content
This repository was archived by the owner on Dec 31, 2025. It is now read-only.
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion ingestors/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,18 @@
from followthemoney.util import make_entity_id
from followthemoney.namespace import Namespace

import schwifty

from ingestors import settings
from ingestors.analysis.aggregate import TagAggregatorFasttext
from ingestors.analysis.aggregate import TagAggregator
from ingestors.analysis.extract import extract_entities
from ingestors.analysis.patterns import extract_patterns
from ingestors.analysis.language import detect_languages
from ingestors.analysis.util import TAG_COMPANY, TAG_PERSON
from ingestors.analysis.util import TAG_COMPANY, TAG_PERSON, TAG_IBAN
from ingestors.analysis.util import text_chunks, ANALYZABLE, DOCUMENT


log = logging.getLogger(__name__)


Expand Down Expand Up @@ -46,6 +49,7 @@ def feed(self, entity):
for prop, tag in extract_patterns(self.entity, text):
self.aggregator_patterns.add(prop, tag)


def flush(self):
writer = self.dataset.bulk()
countries = set()
Expand All @@ -60,13 +64,36 @@ def flush(self):
countries.add(key)

mention_ids = set()

# if there are ibanMentioned, we validate them with schwifty
# valid IBANs are used to create BankAccount FTM entities
# we keep track of how many we created
bank_accounts_created = 0

for key, prop, values in results:
label = values[0]
if prop.type == registry.name:
label = registry.name.pick(values)

if prop == TAG_IBAN:
try:
_ = schwifty.IBAN(label)
except schwifty.exceptions.SchwiftyException:
continue

if not schwifty.IBAN(label, allow_invalid=True).is_valid:
continue

bank_account = model.make_entity("BankAccount")
bank_account.make_id("mention", self.entity.id, prop, key)
bank_account.add("iban", label)
bank_account = self.ns.apply(bank_account)
writer.put(bank_account)
bank_accounts_created += 1

schema = self.MENTIONS.get(prop)
if schema is not None and self.entity.schema.is_a(DOCUMENT):

mention = model.make_entity("Mention")
mention.make_id("mention", self.entity.id, prop, key)
mention_ids.add(mention.id)
Expand All @@ -89,6 +116,8 @@ def flush(self):
self.entity.schema.name,
self.entity.id,
)
if bank_accounts_created:
log.debug(f"Created {bank_accounts_created} BankAccount entities")
writer.put(self.entity)
writer.flush()

Expand Down