Skip to content

Commit 69d90f0

Browse files
authored
Merge pull request #223 from codeforjapan/fix/search-language-normalization
Fix/search language normalization
2 parents 1cd0a11 + 9cbb2ba commit 69d90f0

File tree

2 files changed

+123
-4
lines changed

2 files changed

+123
-4
lines changed

common/birdxplorer_common/storage.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
from typing import Any, Generator, List, Optional, Tuple, Union
23

34
from psycopg2.extensions import AsIs, register_adapter
@@ -62,6 +63,14 @@ def adapt_pydantic_http_url(url: AnyUrl) -> AsIs:
6263

6364
register_adapter(AnyUrl, adapt_pydantic_http_url)
6465

66+
# A well-formed post ID is either the empty string or 1-19 ASCII digits.
# NOTE(review): presumably mirrors the validation pattern of the post-ID model
# type declared elsewhere in the project - confirm they stay in sync.
_POST_ID_PATTERN = re.compile(r"^([0-9]{1,19}|)$")


def _normalize_post_id(value: Optional[str]) -> str:
    """Return *value* if it is a well-formed post ID, otherwise ``""``.

    Args:
        value: Raw post ID as stored on the record; may be ``None``.

    Returns:
        ``value`` unchanged when it is empty or consists of 1-19 ASCII digits;
        the empty string for ``None`` and for every other input (e.g. ``"-1"``).
    """
    # ``fullmatch`` is used instead of ``match`` because ``$`` in Python also
    # matches just before a trailing newline, so ``match`` would let a value
    # such as "123\n" through unchanged.
    if value is None or _POST_ID_PATTERN.fullmatch(value) is None:
        return ""
    return value
73+
6574

6675
class Base(DeclarativeBase):
6776
type_annotation_map = {
@@ -1329,7 +1338,7 @@ def get_notes(
13291338
yield NoteModel(
13301339
note_id=note_record.note_id,
13311340
note_author_participant_id=note_record.note_author_participant_id,
1332-
post_id=note_record.post_id,
1341+
post_id=_normalize_post_id(note_record.post_id),
13331342
topics=[
13341343
TopicModel(
13351344
topic_id=topic.topic_id,
@@ -1612,7 +1621,7 @@ def search_notes_with_posts(
16121621
note = NoteModel(
16131622
note_id=note_record.note_id,
16141623
note_author_participant_id=note_record.note_author_participant_id,
1615-
post_id=note_record.post_id,
1624+
post_id=_normalize_post_id(note_record.post_id),
16161625
topics=[
16171626
TopicModel(
16181627
topic_id=topic.topic_id,
@@ -1621,7 +1630,11 @@ def search_notes_with_posts(
16211630
)
16221631
for topic in note_record.topics
16231632
],
1624-
language=note_record.language,
1633+
language=(
1634+
LanguageIdentifier.normalize(note_record.language)
1635+
if note_record.language
1636+
else LanguageIdentifier.OTHER
1637+
),
16251638
summary=note_record.summary,
16261639
current_status=note_record.current_status,
16271640
created_at=note_record.created_at,

common/tests/test_search.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
11
from typing import List
22

33
from sqlalchemy.engine import Engine
4+
from sqlalchemy.orm import Session
5+
from sqlalchemy.sql import text
46

57
from birdxplorer_common.models import LanguageIdentifier, Note, Post, TopicId
6-
from birdxplorer_common.storage import NoteRecord, PostRecord, Storage, TopicRecord
8+
from birdxplorer_common.storage import (
9+
NoteRecord,
10+
PostRecord,
11+
Storage,
12+
TopicRecord,
13+
XUserRecord,
14+
)
715

816

917
def test_basic_search(
@@ -178,3 +186,101 @@ def test_count_search_results(
178186
# Verify count matches actual results
179187
results = list(storage.search_notes_with_posts(note_includes_text="summary", language=LanguageIdentifier("en")))
180188
assert len(results) == filtered_count
189+
190+
191+
def test_search_notes_with_non_enum_language(
    engine_for_test: Engine,
    note_samples: List[Note],
    post_samples: List[Post],
    note_records_sample: List[NoteRecord],
    x_user_records_sample: List[XUserRecord],
    post_records_sample: List[PostRecord],
) -> None:
    """A note whose language code is outside the enum (e.g. 'ko') is returned as 'other', not skipped."""
    # Raw SQL is used so the row is written with the language value 'ko' exactly as given.
    insert_stmt = text(
        "INSERT INTO notes (note_id, post_id, summary, language, created_at) "
        "VALUES (:note_id, :post_id, :summary, :language, :created_at)"
    )
    korean_note_row = {
        "note_id": "9999999999999999901",
        "post_id": "2234567890123456781",
        "summary": "Korean language note summary",
        "language": "ko",
        "created_at": 1152921600000,
    }
    with Session(engine_for_test) as db:
        db.execute(insert_stmt, korean_note_row)
        db.commit()

    searcher = Storage(engine=engine_for_test)
    hits = list(searcher.search_notes_with_posts(note_includes_text="Korean language note"))

    # Exactly the inserted note must come back, with its language mapped to 'other'.
    assert len(hits) == 1
    found_note, _ = hits[0]
    assert found_note.language == "other"
222+
223+
224+
def test_search_notes_with_null_language(
    engine_for_test: Engine,
    note_samples: List[Note],
    post_samples: List[Post],
    note_records_sample: List[NoteRecord],
    x_user_records_sample: List[XUserRecord],
    post_records_sample: List[PostRecord],
) -> None:
    """A note stored with a NULL language is returned as 'other', not skipped."""
    # Raw SQL is used so the language column can be written as NULL.
    insert_stmt = text(
        "INSERT INTO notes (note_id, post_id, summary, language, created_at) "
        "VALUES (:note_id, :post_id, :summary, :language, :created_at)"
    )
    null_language_row = {
        "note_id": "9999999999999999902",
        "post_id": "2234567890123456781",
        "summary": "Null language note summary",
        "language": None,
        "created_at": 1152921600000,
    }
    with Session(engine_for_test) as db:
        db.execute(insert_stmt, null_language_row)
        db.commit()

    searcher = Storage(engine=engine_for_test)
    hits = list(searcher.search_notes_with_posts(note_includes_text="Null language note"))

    # Exactly the inserted note must come back, with its language reported as 'other'.
    assert len(hits) == 1
    found_note, _ = hits[0]
    assert found_note.language == "other"
255+
256+
257+
def test_search_notes_with_invalid_post_id(
    engine_for_test: Engine,
    note_samples: List[Note],
    post_samples: List[Post],
    note_records_sample: List[NoteRecord],
    x_user_records_sample: List[XUserRecord],
    post_records_sample: List[PostRecord],
) -> None:
    """A note with an invalid post_id (e.g. '-1') is returned with post_id='', not skipped."""
    # Raw SQL is used so the malformed post_id '-1' is written exactly as given.
    insert_stmt = text(
        "INSERT INTO notes (note_id, post_id, summary, language, created_at) "
        "VALUES (:note_id, :post_id, :summary, :language, :created_at)"
    )
    bad_post_id_row = {
        "note_id": "9999999999999999903",
        "post_id": "-1",
        "summary": "Invalid post id note summary",
        "language": "en",
        "created_at": 1152921600000,
    }
    with Session(engine_for_test) as db:
        db.execute(insert_stmt, bad_post_id_row)
        db.commit()

    searcher = Storage(engine=engine_for_test)
    hits = list(searcher.search_notes_with_posts(note_includes_text="Invalid post id note"))

    # Exactly the inserted note must come back, with the bad post_id blanked out.
    assert len(hits) == 1
    found_note, _ = hits[0]
    assert found_note.post_id == ""

0 commit comments

Comments
 (0)