From aadde38f7d8b5191327dd18a4f1061e2e42b6a2e Mon Sep 17 00:00:00 2001 From: Ryo Igarashi Date: Sat, 30 Mar 2024 14:56:41 +0900 Subject: [PATCH] Add Python binding --- .editorconfig | 25 ++++ .envrc | 2 + .github/workflows/python.yaml | 126 ++++++++++++++++++ .github/workflows/rust.yaml | 10 +- .github/workflows/rust_release.yaml | 9 +- .gitignore | 2 + .vscode/settings.json | 23 ++++ ainu-utils-python/Cargo.toml | 5 +- ainu-utils-python/ainu_utils.pyi | 4 + ainu-utils-python/pyproject.toml | 21 ++- ainu-utils-python/src/lib.rs | 9 +- ainu-utils-python/tests/test_tokenize.py | 6 + ainu-utils/examples/tokenize.rs | 4 +- ainu-utils/src/lib.rs | 2 +- ainu-utils/src/{tokenizer.rs => segmenter.rs} | 48 +++---- flake.nix | 1 + 16 files changed, 250 insertions(+), 47 deletions(-) create mode 100644 .editorconfig create mode 100644 .github/workflows/python.yaml create mode 100644 .vscode/settings.json create mode 100644 ainu-utils-python/ainu_utils.pyi create mode 100644 ainu-utils-python/tests/test_tokenize.py rename ainu-utils/src/{tokenizer.rs => segmenter.rs} (74%) diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..38f8163 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,25 @@ +# EditorConfig helps developers define and maintain consistent coding styles between different editors and IDEs + +# Top-most EditorConfig file +root = true + +# Python files +[*.py] +indent_style = space +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.rs, *.js] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.{yaml,json,toml}] +indent_style = space +indent_size = 2 diff --git a/.envrc b/.envrc index 3550a30..8e34683 100644 --- a/.envrc +++ b/.envrc @@ -1 +1,3 @@ use flake + +source .venv/bin/activate diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml new file mode 100644 index 
0000000..bef7171 --- /dev/null +++ b/.github/workflows/python.yaml @@ -0,0 +1,126 @@ +# This file is autogenerated by maturin v1.5.0 +# To update, run +# +# maturin generate-ci github -m ./ainu-utils-python/Cargo.toml -o ./.github/workflows/python.yaml --platform linux macos --pytest +# +name: CI + +on: + push: + branches: + - main + - master + tags: + - '*' + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ubuntu-latest + strategy: + matrix: + target: [x86_64, x86, aarch64, armv7, s390x, ppc64le] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist --find-interpreter --manifest-path ./ainu-utils-python/Cargo.toml + sccache: 'true' + manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.target }} + path: dist + - name: pytest + if: ${{ startsWith(matrix.target, 'x86_64') }} + shell: bash + run: | + set -e + pip install ainu-utils --find-links dist --force-reinstall + pip install pytest + cd ./ainu-utils-python && pytest + - name: pytest + if: ${{ !startsWith(matrix.target, 'x86') && matrix.target != 'ppc64' }} + uses: uraimo/run-on-arch-action@v2.5.0 + with: + arch: ${{ matrix.target }} + distro: ubuntu22.04 + githubToken: ${{ github.token }} + install: | + apt-get update + apt-get install -y --no-install-recommends python3 python3-pip + pip3 install -U pip pytest + run: | + set -e + pip3 install ainu-utils --find-links dist --force-reinstall + cd ./ainu-utils-python && pytest + + macos: + runs-on: macos-latest + strategy: + matrix: + target: [x86_64, aarch64] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist 
--find-interpreter --manifest-path ./ainu-utils-python/Cargo.toml + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.target }} + path: dist + - name: pytest + if: ${{ !startsWith(matrix.target, 'aarch64') }} + shell: bash + run: | + set -e + pip install ainu-utils --find-links dist --force-reinstall + pip install pytest + cd ./ainu-utils-python && pytest + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist --manifest-path ./ainu-utils-python/Cargo.toml + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: "startsWith(github.ref, 'refs/tags/')" + needs: [linux, macos, sdist] + steps: + - uses: actions/download-artifact@v4 + - name: Publish to PyPI + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.github/workflows/rust.yaml b/.github/workflows/rust.yaml index 1dfc280..ff88e85 100644 --- a/.github/workflows/rust.yaml +++ b/.github/workflows/rust.yaml @@ -8,16 +8,18 @@ on: pull_request: branches: - "*" + paths: + - "ainu-utils/**" + +defaults: + run: + working-directory: ainu-utils # https://github.com/taiki-e/cargo-llvm-cov/tree/main?tab=readme-ov-file#continuous-integration jobs: test: runs-on: ubuntu-latest - defaults: - run: - working-directory: ainu-utils - steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/rust_release.yaml b/.github/workflows/rust_release.yaml index 22bc755..47a9d53 100644 --- a/.github/workflows/rust_release.yaml +++ b/.github/workflows/rust_release.yaml @@ -8,16 +8,15 @@ on: push: tags: - v* - workflow_dispatch: + +defaults: + run: + working-directory: ainu-utils jobs: publish: runs-on: 
ubuntu-latest - defaults: - run: - working-directory: ainu-utils - steps: - name: Checkout repository uses: actions/checkout@v4 diff --git a/.gitignore b/.gitignore index 6985cf1..22bfb5b 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ Cargo.lock # MSVC Windows builds of rustc generate these, which store debugging information *.pdb + +.venv diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..bf61016 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,23 @@ +{ + "cSpell.enabled": true, + + "files.exclude": { + "**/.venv": true, + "**/.mypy_cache": true, + "**/.ruff_cache": true, + "**/.pytest_cache": true, + "**/*.egg-info": true, + "**/__pycache__": true + }, + + // https://github.com/astral-sh/ruff-vscode?tab=readme-ov-file#configuring-vs-code + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff", + "editor.codeActionsOnSave": { + "source.fixAll": "always" + } + }, + + // https://github.com/microsoft/vscode-mypy/issues/157 + "mypy-type-checker.reportingScope": "workspace" +} diff --git a/ainu-utils-python/Cargo.toml b/ainu-utils-python/Cargo.toml index ab89119..21a67df 100644 --- a/ainu-utils-python/Cargo.toml +++ b/ainu-utils-python/Cargo.toml @@ -4,9 +4,10 @@ version = "0.1.0" edition = "2021" [lib] -name = "ainu_utils_python" +name = "ainu_utils" crate-type = ["cdylib"] [dependencies] -pyo3 = "0.19.0" +pyo3 = "0.20.0" ainu-utils = { path = "../ainu-utils" } + diff --git a/ainu-utils-python/ainu_utils.pyi b/ainu-utils-python/ainu_utils.pyi new file mode 100644 index 0000000..b5fcdb9 --- /dev/null +++ b/ainu-utils-python/ainu_utils.pyi @@ -0,0 +1,4 @@ +# https://www.maturin.rs/project_layout#adding-python-type-information +def segment(text: str) -> list[str]: ... 
+ +test_number: int diff --git a/ainu-utils-python/pyproject.toml b/ainu-utils-python/pyproject.toml index b1ed7d2..4c2ecb4 100644 --- a/ainu-utils-python/pyproject.toml +++ b/ainu-utils-python/pyproject.toml @@ -1,11 +1,22 @@ -[project] -name = "ainu-utils-python" -requires-python = ">=3.8" -dynamic = ["version"] - [build-system] requires = ["maturin>=1.4,<2.0"] build-backend = "maturin" [tool.maturin] features = ["pyo3/extension-module"] + +[project] +name = "ainu-utils" +description = "A collection of utilities for the Ainu language" +requires-python = ">=3.8" +version = "0.1.0" +license = "MIT" + +[project.optional-dependencies] +test = [ + "pytest==8.1.1", +] +dev = [ + "ruff==0.3.3", + "mypy==1.9.0", +] diff --git a/ainu-utils-python/src/lib.rs b/ainu-utils-python/src/lib.rs index 05a5153..d84a3ee 100644 --- a/ainu-utils-python/src/lib.rs +++ b/ainu-utils-python/src/lib.rs @@ -1,14 +1,15 @@ -extern crate ainu_utils as ainu; +extern crate ainu_utils as ainu_utils_rust; use pyo3::prelude::*; #[pyfunction] -fn tokenize(text: &str) -> Vec { - ainu::tokenizer::tokenize(text) +fn segment(text: &str) -> Vec { + ainu_utils_rust::segmenter::segment(text) } #[pymodule] fn ainu_utils(_py: Python<'_>, m: &PyModule) -> PyResult<()> { - m.add_function(wrap_pyfunction!(tokenize, m)?)?; + m.add_function(wrap_pyfunction!(segment, m)?)?; + m.add("test_number", 123)?; Ok(()) } diff --git a/ainu-utils-python/tests/test_tokenize.py b/ainu-utils-python/tests/test_tokenize.py new file mode 100644 index 0000000..c1dee79 --- /dev/null +++ b/ainu-utils-python/tests/test_tokenize.py @@ -0,0 +1,6 @@ +import ainu_utils + + +def test_tokenize(): + result = ainu_utils.segment("irankarapte. 
e=iwanke ya?") + assert result == ["irankarapte", ".", "e=", "iwanke", "ya", "?"] diff --git a/ainu-utils/examples/tokenize.rs b/ainu-utils/examples/tokenize.rs index 5cf9cc9..f85b870 100644 --- a/ainu-utils/examples/tokenize.rs +++ b/ainu-utils/examples/tokenize.rs @@ -1,11 +1,11 @@ -use ainu_utils::tokenizer::tokenize; +use ainu_utils::segmenter::segment; use std::env; fn main() { let args: Vec = env::args().collect(); let text = &args[1]; - let tokens = tokenize(text); + let tokens = segment(text); println!("{:?}", tokens); } diff --git a/ainu-utils/src/lib.rs b/ainu-utils/src/lib.rs index 37773ed..90bcb48 100644 --- a/ainu-utils/src/lib.rs +++ b/ainu-utils/src/lib.rs @@ -1,2 +1,2 @@ pub mod number; -pub mod tokenizer; +pub mod segmenter; diff --git a/ainu-utils/src/tokenizer.rs b/ainu-utils/src/segmenter.rs similarity index 74% rename from ainu-utils/src/tokenizer.rs rename to ainu-utils/src/segmenter.rs index b6f3f14..6e1680b 100644 --- a/ainu-utils/src/tokenizer.rs +++ b/ainu-utils/src/segmenter.rs @@ -15,45 +15,45 @@ static SUFFIX_REGEX: Lazy = Lazy::new(|| Regex::new(&format!(r"(?\w+)(?{})$", SUFFIXES.join("|"))).unwrap()); fn parse_affix(token: String) -> Vec { - let mut tokens = Vec::new(); + let mut words = Vec::new(); if let Some(caps) = PREFIX_REGEX.captures(&token) { - tokens.push(caps["prefix"].to_string()); - tokens.push(caps["word"].to_string()); + words.push(caps["prefix"].to_string()); + words.push(caps["word"].to_string()); } else if let Some(caps) = SUFFIX_REGEX.captures(&token) { - tokens.push(caps["word"].to_string()); - tokens.push(caps["suffix"].to_string()); + words.push(caps["word"].to_string()); + words.push(caps["suffix"].to_string()); } else { - tokens.push(token); + words.push(token); } - tokens + words } -pub fn tokenize(text: &str) -> Vec { - let mut tokens = Vec::new(); - let mut token = String::new(); +pub fn segment(text: &str) -> Vec { + let mut words = Vec::new(); + let mut word = String::new(); for c in text.chars() { if 
c.is_alphabetic() || c == '=' { - token.push(c); + word.push(c); } else { - if !token.is_empty() { - tokens.extend(parse_affix(token)); - token = String::new(); + if !word.is_empty() { + words.extend(parse_affix(word)); + word = String::new(); } if !c.is_whitespace() { - tokens.push(c.to_string()); + words.push(c.to_string()); } } } - if !token.is_empty() { - tokens.extend(parse_affix(token)); + if !word.is_empty() { + words.extend(parse_affix(word)); } - tokens + words } #[cfg(test)] @@ -61,9 +61,9 @@ mod tests { use super::*; #[test] - fn test_tokenize() { + fn test_segment() { let text = "irankarapte! eyami yak a=ye aeywankep ku=kar wa k=an."; - let tokens = tokenize(text); + let tokens = segment(text); assert_eq!( tokens, @@ -86,9 +86,9 @@ mod tests { } #[test] - fn test_tokenize_suffix() { + fn test_segment_suffix() { let text = "soyenpa=an wa sinot=an ro!"; - let tokens = tokenize(text); + let tokens = segment(text); assert_eq!( tokens, @@ -99,7 +99,7 @@ mod tests { #[test] fn test_sentence_does_not_end_with_period() { let text = "a=nukar hike i=yaykohaytare i=yaypokaste wa iki pe"; - let tokens = tokenize(text); + let tokens = segment(text); assert_eq!( tokens, @@ -121,7 +121,7 @@ mod tests { #[test] fn test_sentence_ending_with_a_fixed_word() { let text = "neno a=ye itak pirka a=ye itak i=koynu wa ... i=konu wa i=kore"; - let tokens = tokenize(text); + let tokens = segment(text); assert_eq!( tokens, diff --git a/flake.nix b/flake.nix index 2fc6e0d..daad725 100644 --- a/flake.nix +++ b/flake.nix @@ -16,6 +16,7 @@ openssl pkg-config python311 + maturin rustup wasm-pack ];