from typing import List


def remove_repeated_ngrams(string_list: List[str], n: int = 2) -> List[str]:
    """
    Remove repeated n-grams

    A window of ``n`` tokens is slid over ``string_list``. The first time
    an n-gram is seen it is written to the output: only its last token when
    the output already ends with the n-gram's first ``n - 1`` tokens, the
    whole n-gram otherwise. Later occurrences of the same n-gram are
    skipped.

    If the final window is skipped, or the list is shorter than ``n``,
    the trailing tokens not covered by an emitted window are appended
    exactly once, leaving out any token equal to the one currently at the
    end of the output.

    An empty list or ``n <= 0`` returns ``string_list`` itself, unchanged.

    :param List[str] string_list: List of string
    :param int n: n-gram size
    :return: List of string
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.llm import remove_repeated_ngrams

        remove_repeated_ngrams(['เอา', 'เอา', 'แบบ', 'ไหน'], n=1)
        # output: ['เอา', 'แบบ', 'ไหน']
    """
    if not string_list or n <= 0:
        return string_list

    seen_ngrams = set()
    output_list: List[str] = []

    # Number of tokens two consecutive windows share. It is 0 for
    # unigrams, where there is never a prefix to match against.
    overlap = n - 1
    # Start index of the last full window; negative when the list is
    # shorter than n and therefore has no full window at all.
    last_window_start = len(string_list) - n
    # Whether the most recently visited window was written to the output.
    # After the loop this describes the final window.
    final_window_emitted = False

    for start in range(last_window_start + 1):
        ngram = tuple(string_list[start:start + n])

        if ngram in seen_ngrams:
            final_window_emitted = False
            continue
        seen_ngrams.add(ngram)

        if overlap and output_list[-overlap:] == list(ngram[:-1]):
            # Output already ends with this n-gram's prefix: only the
            # last token is new.
            output_list.append(ngram[-1])
        else:
            output_list.extend(ngram)
        final_window_emitted = True

    # Trailing tokens: handled once, and only when the final window did
    # not already put them in the output. Re-running this for every
    # trailing index duplicated the suffix for n >= 3 and for n > len.
    if not final_window_emitted:
        tail_start = max(last_window_start + 1, 0)
        for token in string_list[tail_start:]:
            if not output_list or output_list[-1] != token:
                output_list.append(token)

    return output_list