Skip to content

Commit

Permalink
Merge pull request #1059 from PyThaiNLP/wannaphong/add-lcs
Browse files Browse the repository at this point in the history
Add longest common subsequence algorithm
  • Loading branch information
wannaphong authored Jan 13, 2025
2 parents ef0e01d + 4c5c948 commit c4de24c
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 1 deletion.
5 changes: 5 additions & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,11 @@ Modules

The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner.

.. autofunction:: longest_common_subsequence
:noindex:

The `longest_common_subsequence` function finds the longest common subsequence between two strings.

.. autofunction:: pythainlp.util.morse.morse_encode
:noindex:

Expand Down
4 changes: 3 additions & 1 deletion pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
Expand Down Expand Up @@ -26,6 +26,7 @@
"is_native_thai",
"isthai",
"isthaichar",
"longest_common_subsequence",
"nectec_to_ipa",
"normalize",
"now_reign_year",
Expand Down Expand Up @@ -92,6 +93,7 @@
thai_to_eng,
)
from pythainlp.util.keywords import find_keyword, rank
from pythainlp.util.lcs import longest_common_subsequence
from pythainlp.util.normalize import (
maiyamok,
normalize,
Expand Down
67 changes: 67 additions & 0 deletions pythainlp/util/lcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

def longest_common_subsequence(str1: str, str2: str) -> str:
    """
    Find the longest common subsequence between two strings.

    A subsequence keeps the relative order of the characters but does not
    have to be contiguous. If several common subsequences share the
    maximum length, only one of them is returned.

    The function fills a ``(len(str1) + 1) x (len(str2) + 1)`` table, so
    both time and memory grow with the product of the two lengths.

    :param str str1: The first string.
    :param str str2: The second string.
    :return: The longest common subsequence.
    :rtype: str

    :Example:
    ::

        from pythainlp.util.lcs import longest_common_subsequence

        print(longest_common_subsequence("ABCBDAB", "BDCAB"))
        # output: "BDAB"
    """
    m = len(str1)
    n = len(str2)

    # dp[i][j] is the LCS length of the prefixes str1[:i] and str2[:j].
    # Row 0 and column 0 describe an empty prefix and therefore stay 0,
    # which is why the fill below starts at index 1.
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    # Rebuild the subsequence by walking back from the bottom-right
    # corner. Characters are discovered last-to-first, so they are
    # written into a pre-sized buffer from its end towards its start.
    index = dp[m][n]
    lcs = [""] * index
    i, j = m, n
    while i > 0 and j > 0:
        if str1[i - 1] == str2[j - 1]:
            # A matching character is part of the LCS.
            index -= 1
            lcs[index] = str1[i - 1]
            i -= 1
            j -= 1
        elif dp[i - 1][j] > dp[i][j - 1]:
            # No match: follow the neighbour with the larger LCS length.
            i -= 1
        else:
            # On a tie, move left; this fixes which of the equally long
            # subsequences is returned.
            j -= 1

    return "".join(lcs)
11 changes: 11 additions & 0 deletions tests/core/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
ipa_to_rtgs,
isthai,
isthaichar,
longest_common_subsequence,
nectec_to_ipa,
normalize,
now_reign_year,
Expand Down Expand Up @@ -842,3 +843,13 @@ def test_th_zodiac(self):

# def test_abbreviation_to_full_text(self):
# self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))

def test_longest_common_subsequence(self):
    # Each entry is (first string, second string, expected subsequence).
    # The last four entries cover inputs with no shared characters and
    # inputs where one or both strings are empty.
    cases = (
        ("ABCBDAB", "BDCAB", "BDAB"),
        ("AGGTAB", "GXTXAYB", "GTAB"),
        ("ABCDGH", "AEDFHR", "ADH"),
        ("ABC", "AC", "AC"),
        ("ABC", "DEF", ""),
        ("", "ABC", ""),
        ("ABC", "", ""),
        ("", "", ""),
    )
    for first, second, expected in cases:
        self.assertEqual(longest_common_subsequence(first, second), expected)

0 comments on commit c4de24c

Please sign in to comment.