feat: [sc-26105] Add first/last name tokenizer to NameAI #606
def test_person_name_tokenizer_simple_names():
    """Verify tokenization of clear person names."""
    with init_person_name_tokenizer([]) as tokenizer:
        from nameai.data import get_resource_path
        import json

        with open(get_resource_path('tests/person_names_quality.json')) as f:
            quality_tests = json.load(f)

        failures = []
        for input_label, expected_tokens in quality_tests['simple_names'].items():
            tokenized_labels = list(tokenizer.tokenize_with_scores(input_label))
            expected_tuple = tuple(expected_tokens)
            found = False
            for tokens, score in tokenized_labels:
                if tokens == expected_tuple:
                    found = True
                    assert score > -float('inf'), f'Expected valid score for {input_label}'
                    break
            if not found:
                failures.append(f'Failed to find expected tokenization for {input_label}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [simple_names] ===')
            for failure in failures:
                print(failure)
            print(f'\nTotal failures: {len(failures)} out of {len(quality_tests)} test cases')
            assert False, 'Some tokenization quality tests failed. See above for details.'

def test_person_name_tokenizer_ambiguous_names():
    """Verify handling of ambiguous inputs that could be names."""
    with init_person_name_tokenizer([]) as tokenizer:
        from nameai.data import get_resource_path
        import json

        with open(get_resource_path('tests/person_names_quality.json')) as f:
            quality_tests = json.load(f)

        failures = []
        for input_label, interpretation2expected_tokens in quality_tests['ambiguous_names'].items():
            tokenized_labels = list(tokenizer.tokenize_with_scores(input_label))
            if interpretation2expected_tokens['person_name'] is not None:
                person_name_tokens = tuple(interpretation2expected_tokens['person_name'])
                found = False
                for tokens, score in tokenized_labels:
                    if tokens == person_name_tokens:
                        found = True
                        assert score > -float('inf'), f'Expected valid score for {input_label}'
                        break
                if not found:
                    failures.append(f'Failed to find person name tokenization for {input_label}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [ambiguous_names] ===')
            for failure in failures:
                print(failure)
            print(f'\nTotal failures: {len(failures)} out of {len(quality_tests)} test cases')
            assert False, 'Some tokenization quality tests failed. See above for details.'

def test_person_name_tokenizer_non_names_low_scores():
    """Verify that non-name inputs get low (< 1e-10) probability scores."""
    with init_person_name_tokenizer([]) as tokenizer:
        from nameai.data import get_resource_path
        import json
        import math

        with open(get_resource_path('tests/person_names_quality.json')) as f:
            quality_tests = json.load(f)

        failures = []
        for input_label in quality_tests['non_names'].keys():
            tokenized_labels = list(tokenizer.tokenize_with_scores(input_label))
            for tokens, log_prob in tokenized_labels:
                if log_prob >= math.log(1e-10):
                    failures.append(f'Expected very low score for non-name {input_label}, got {log_prob}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [non_names] ===')
            for failure in failures:
                print(failure)
            print(f'\nTotal failures: {len(failures)} out of {len(quality_tests)} test cases')
            assert False, 'Some tokenization quality tests failed. See above for details.'
Are these tests simply adding a probability score check compared to those from test_nlp_inspector.py?
In test_tokenizer.py separate tokenizers are tested (AllTokenizer and PersonNamesTokenizer). In test_nlp_inspector.py the tokenizations come from both tokenizers (the merging is done in NLPInspector). So these tests are for different levels of the tokenization pipeline.
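A rough sketch of that distinction, using only names that appear in this PR (init_person_name_tokenizer, tokenize_with_scores); the import location, the example label, and the NLPInspector behaviour described in the comments are assumptions, not confirmed API:

# Level 1 (test_tokenizer.py): each tokenizer is exercised on its own.
# Assumes init_person_name_tokenizer is importable from the surrounding test module.
with init_person_name_tokenizer([]) as tokenizer:
    for tokens, log_score in tokenizer.tokenize_with_scores('johnsmith'):
        # For name-like inputs a split such as ('john', 'smith') should appear
        # with a finite log score.
        print(tokens, log_score)

# Level 2 (test_nlp_inspector.py): NLPInspector merges candidate tokenizations
# from AllTokenizer and PersonNamesTokenizer, so those tests see the combined
# output rather than either tokenizer in isolation.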
It all looks good to me. One thing that bothers me is maintaining two separate implementations of the same functionality. Could we consider substituting this functionality in NameGraph with the implementation from here? @djstrong
Story details: https://app.shortcut.com/ps-web3/story/26105
todo:
- add S3 env vars to .env.example
- make bucket public
- run python -m nameai.download in CI/CD and deployment scripts
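A possible shape for that deployment hook, as a sketch only: the NAMEAI_S3_BUCKET variable name is hypothetical (whatever actually lands in .env.example), and only the python -m nameai.download command comes from this todo:

import os
import subprocess

# Hypothetical env var name standing in for the S3 settings added to .env.example.
assert os.environ.get('NAMEAI_S3_BUCKET'), 'configure S3 env vars before downloading'

# Fetch the NameAI data during deployment, as listed in the todo above.
subprocess.run(['python', '-m', 'nameai.download'], check=True)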