Skip to content

Commit

Permalink
Merge pull request #7 from life4/no-std
Browse files Browse the repository at this point in the history
no_std support
  • Loading branch information
orsinium authored Sep 11, 2024
2 parents bc80088 + afdf1e8 commit 813bb70
Show file tree
Hide file tree
Showing 31 changed files with 479 additions and 535 deletions.
776 changes: 337 additions & 439 deletions Cargo.lock

Large diffs are not rendered by default.

21 changes: 10 additions & 11 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,30 +1,29 @@
[package]
name = "textdistance"
version = "1.0.2"
version = "1.1.0"
edition = "2021"
authors = ["Gram <[email protected]>"]
description = "Lots of algorithms to compare how similar two sequences are"
repository = "https://github.com/life4/textdistance.rs"
license = "MIT"
keywords = [
"jaro",
"hamming",
"levenshtein",
"similarity",
"distance",
]
keywords = ["jaro", "hamming", "levenshtein", "similarity", "distance"]
categories = [
"algorithms",
"science",
"no-std",
"text-processing",
"command-line-interface",
]

[features]
default = ["std"]
std = []

[dev-dependencies]
assert2 = "0.3.10"
criterion = "0.4.0"
assert2 = "0.3.15"
criterion = "0.5.1"
proptest = "1.1.0"
rstest = "0.17.0"
rstest = "0.22.0"
unicode-segmentation = "1.10.1"

[[bench]]
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Features:
+ 📚 Contains 20+ algorithms for all purposes.
+ 🔬 Includes state-of-the-art algorithms like `EntropyNCD` and `Sift4`.
+ 🪶 Zero-dependency.
+ 🐜 `#![no_std]` support (embedded systems).
+ 🔨 Works with any iterators, including bytes, code points, Unicode grapheme clusters, words, and numbers.
+ ❤️ Friendly and consistent API for all algorithms.
+ 📏 Optional normalization of the result on the 0.0-1.0 interval.
Expand Down Expand Up @@ -66,6 +67,12 @@ Normalization for other metrics:
cargo add textdistance
```

Or, if you're going to use it in a [no_std](https://docs.rust-embedded.org/book/intro/no-std.html) project, disable the default `std` feature:

```shell
cargo add --no-default-features textdistance
```

## Usage

The `textdistance::str` module provides shortcut functions for each algorithm for calculating the distance/similarity between two strings:
Expand Down
1 change: 1 addition & 0 deletions Taskfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ tasks:
CLICOLOR_FORCE: "yes"
cmds:
- cargo nextest run --no-fail-fast {{.CLI_ARGS}}
- cargo build --no-default-features

doctest:
cmds:
Expand Down
2 changes: 1 addition & 1 deletion benches/str_benchmarks.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use core::time::Duration;
use criterion::BenchmarkId;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use std::fs;
use std::time::Duration;
use textdistance::{nstr, str};

fn read_licenses() -> Vec<(String, String)> {
Expand Down
3 changes: 2 additions & 1 deletion src/algorithm.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use super::Result;
use std::hash::Hash;
use alloc::vec::Vec;
use core::hash::Hash;

/// A base trait for all distance/similarity algorithms.
///
Expand Down
4 changes: 2 additions & 2 deletions src/algorithms/bag.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//! Bag distance
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};

Expand All @@ -13,7 +13,7 @@ impl Algorithm<usize> for Bag {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<usize>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/cosine.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Cosine similarity
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};

Expand All @@ -15,7 +16,7 @@ impl Algorithm<f64> for Cosine {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
5 changes: 4 additions & 1 deletion src/algorithms/damerau_levenshtein.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
//! Damerau-Levenshtein distance
#![cfg(feature = "std")]
use crate::{Algorithm, Result};
use alloc::vec;
use alloc::vec::Vec;
use core::hash::Hash;
use std::collections::HashMap;
use std::hash::Hash;

/// [Damerau-Levenshtein distance] is an edit distance between two sequences.
///
Expand Down
5 changes: 3 additions & 2 deletions src/algorithms/entropy_ncd.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
//! Entropy-based Normalized Compression Distance
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};
use std::hash::Hash;
use core::hash::Hash;

/// Entropy-based [Normalized Compression Distance].
///
Expand Down Expand Up @@ -46,7 +47,7 @@ impl Algorithm<f64> for EntropyNCD {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/jaccard.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Jaccard index
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};

Expand All @@ -17,7 +18,7 @@ impl Algorithm<f64> for Jaccard {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
1 change: 1 addition & 0 deletions src/algorithms/jaro.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Jaro similarity
use crate::{Algorithm, Result};
use alloc::vec;

/// [Jaro similarity] is calculated based on the number of transpositions to turn one string into the other.
///
Expand Down
4 changes: 2 additions & 2 deletions src/algorithms/jaro_winkler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ impl JaroWinkler {
fn winklerize<C, E>(&self, jaro: f64, s1: C, s2: C) -> f64
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
debug_assert!(self.prefix_weight * self.max_prefix as f64 <= 1.0);
let mut prefix_len = 0;
Expand All @@ -55,7 +55,7 @@ impl JaroWinkler {
impl Algorithm<f64> for JaroWinkler {
fn for_vec<E>(&self, s1: &[E], s2: &[E]) -> Result<f64>
where
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let jaro = self.jaro.for_vec(s1, s2).nval();
Result {
Expand Down
2 changes: 2 additions & 0 deletions src/algorithms/lcsseq.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Longest common subsequence
use crate::{Algorithm, Result};
use alloc::vec;
use alloc::vec::Vec;

/// The length of the [Longest common subsequence].
///
Expand Down
1 change: 1 addition & 0 deletions src/algorithms/lcsstr.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Longest common substring
use crate::{Algorithm, Result};
use alloc::vec;

/// The length of the [Longest common substring].
///
Expand Down
1 change: 1 addition & 0 deletions src/algorithms/levenshtein.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Levenshtein distance
use crate::{Algorithm, Result};
use alloc::vec::Vec;

/// [Levenshtein distance] is an edit distance between two sequences.
///
Expand Down
2 changes: 1 addition & 1 deletion src/algorithms/lig3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use super::hamming::Hamming;
use super::levenshtein::Levenshtein;
use crate::{Algorithm, Result};
use std::hash::Hash;
use core::hash::Hash;

/// [LIG3 similarity] is a normalization of [`Hamming`] by [`Levenshtein`].
///
Expand Down
2 changes: 1 addition & 1 deletion src/algorithms/mlipns.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! MLIPNS similarity
use super::hamming::Hamming;
use crate::{Algorithm, Result};
use std::hash::Hash;
use core::hash::Hash;

/// [MLIPNS similarity] is a normalization for [`Hamming`] that returns either 0 or 1.
///
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/overlap.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Overlap coefficient
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};

Expand All @@ -12,7 +13,7 @@ impl Algorithm<f64> for Overlap {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
5 changes: 3 additions & 2 deletions src/algorithms/ratcliff_obershelp.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Gestalt pattern matching
use crate::{Algorithm, Result};
use alloc::vec;
use alloc::vec::Vec;

/// [Ratcliff/Obershelp similarity] is [`LCSStr`] that recursively finds matches
/// on both sides of the longest substring.
Expand All @@ -23,8 +25,7 @@ impl Algorithm<usize> for RatcliffObershelp {
stack.push(((0, l1), (0, l2)));
let mut result = 0;

while !stack.is_empty() {
let top = stack.pop().unwrap();
while let Some(top) = stack.pop() {
let ((part1_start, part1_len), (part2_start, part2_len)) = top;
let s1_part = s1[part1_start..(part1_start + part1_len)].iter();
let s2_part: Vec<&E> = s2[part2_start..(part2_start + part2_len)].iter().collect();
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/roberts.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Roberts similarity
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};

Expand All @@ -14,7 +15,7 @@ impl Algorithm<f64> for Roberts {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/sift4_common.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Sift4 distance
use crate::{Algorithm, Result};
use alloc::vec::Vec;

/// [Sift4 distance] is an edit algorithm designed to be "fast and relatively accurate".
///
Expand Down Expand Up @@ -44,7 +45,7 @@ impl Algorithm<usize> for Sift4Common {
let mut lcss = 0; // largest common subsequence
let mut local_cs = 0; // local common substring
let mut trans = 0; // number of transpositions ('ab' vs 'ba')
let mut offset_arr: Vec<Offset> = vec![]; // offset pair array, for computing the transpositions
let mut offset_arr: Vec<Offset> = Vec::new(); // offset pair array, for computing the transpositions
while (c1 < l1) && (c2 < l2) {
if s1[c1] == s2[c2] {
local_cs += 1;
Expand Down
2 changes: 2 additions & 0 deletions src/algorithms/smith_waterman.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Smith-Waterman sequence alignment
use crate::{Algorithm, Result};
use alloc::vec;
use alloc::vec::Vec;

/// [Smith-Waterman similarity] is edit-based and designed for nucleic acid (and protein) sequences.
///
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/sorensen_dice.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Sørensen-Dice coefficient
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};

Expand All @@ -12,7 +13,7 @@ impl Algorithm<f64> for SorensenDice {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/tversky.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Tversky index
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};

Expand Down Expand Up @@ -30,7 +31,7 @@ impl Algorithm<f64> for Tversky {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
Expand Down
2 changes: 1 addition & 1 deletion src/algorithms/yujian_bo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ impl Algorithm<f64> for YujianBo {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + std::hash::Hash,
E: Eq + core::hash::Hash,
{
let lev = self.levenshtein.for_iter(s1, s2);
let dc: usize = self.levenshtein.del_cost;
Expand Down
7 changes: 4 additions & 3 deletions src/counter.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#![cfg(feature = "std")]
use core::hash::Hash;
use std::collections::HashMap;
use std::hash::Hash;

/// Multiset container inspired by Python's `collections.Counter`.
pub struct Counter<K> {
Expand Down Expand Up @@ -66,7 +67,7 @@ where
result.insert(key, *lhs_count + rhs_count);
}
for (key, rhs_count) in &rhs.map {
if self.map.get(key).is_none() {
if !self.map.contains_key(key) {
result.insert(key, *rhs_count);
}
}
Expand All @@ -92,7 +93,7 @@ where
result += lhs_count.max(rhs_count);
}
for (key, rhs_count) in &rhs.map {
if self.map.get(key).is_none() {
if !self.map.contains_key(key) {
result += rhs_count;
}
}
Expand Down
Loading

0 comments on commit 813bb70

Please sign in to comment.