Skip to content

Commit 75205d5

Browse files
committed
add notebook
and dialyxir
1 parent ad3c8c3 commit 75205d5

File tree

11 files changed

+7702
-7567
lines changed

11 files changed

+7702
-7567
lines changed

.tool-versions

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
elixir 1.12.3-otp-24
2-
24.1.2
1+
elixir 1.15.5-otp-26
2+
erlang 26.0.2

config/config.exs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# This file is responsible for configuring your application
22
# and its dependencies with the aid of the Config module.
3-
use Mix.Config
3+
import Config
44

55
# This configuration is loaded before any dependency and is restricted
66
# to this project. If another project depends on this project, this

lib/akin.ex

+12-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ defmodule Akin do
33
Akin
44
=======
55
6-
Functions for comparing two strings for similarity using a collection of string comparison algorithms for Elixir. Algorithms can be called independently or in total to return a map of metrics.
6+
Functions for comparing two strings for similarity using a collection of string comparison algorithms for Elixir.
7+
Algorithms can be called independently or in total to return a map of metrics.
78
89
## Options
910
@@ -32,7 +33,7 @@ defmodule Akin do
3233
alias Akin.Corpus
3334
alias Akin.Names
3435

35-
@spec compare(binary() | %Corpus{}, binary() | %Corpus{}, keyword()) :: float()
36+
@spec compare(binary() | %Corpus{}, binary() | %Corpus{}, keyword()) :: map()
3637
@doc """
3738
Compare two strings. Return map of algorithm metrics.
3839
@@ -69,6 +70,14 @@ defmodule Akin do
6970
@doc """
7071
Compare a string against a list of strings. Matches are determined by algorithm metrics equal to or higher than the
7172
`match_at` option. Return a list of strings that are a likely match.
73+
74+
Future Plans
75+
* _if_ the name part is an initial, give the `initials` score its weight, otherwise reduce it
76+
* if the `initials` score is significantly higher than the average of the others, reduce the `initials` score to the average of the others
77+
* add options
78+
* "use_average", "top_three", and/or "average_of_top_three"
79+
* "group" to split results into strong matches and weak matches
80+
* "details" to include the scores in the result list
7281
"""
7382
def match_names(left, rights, opts \\ default_opts())
7483

@@ -90,7 +99,7 @@ defmodule Akin do
9099
end)
91100
end
92101

93-
@spec match_names_metrics(binary(), list(), keyword()) :: float()
102+
@spec match_names_metrics(binary(), list(), keyword()) :: list()
94103
@doc """
95104
Compare a string against a list of strings. Matches are determined by algorithm metrics equal to or higher than the
96105
`match_at` option. Return a list of strings that are a likely match and their algorithm metrics.

lib/akin/algorithms/helpers/initials_comparison.ex

-4
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,6 @@ defmodule Akin.Helpers.InitialsComparison do
6161
Enum.map(lists, fn list -> String.at(list, 0) end)
6262
end
6363

64-
defp initials(list) when is_list(list) do
65-
Enum.map(list, fn l -> String.at(l, 0) end)
66-
end
67-
6864
defp initials(_), do: []
6965

7066
defp actual_initials(list) do

lib/akin/algorithms/levenshtein.ex

+6-5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ defmodule Akin.Levenshtein do
55
@behaviour Akin.Task
66
alias Akin.Corpus
77

8+
@spec compare(list(), list()) :: float()
89
@spec compare(%Corpus{}, %Corpus{}) :: float()
910
@spec compare(%Corpus{}, %Corpus{}, Keyword.t()) :: float()
1011
@doc """
@@ -21,16 +22,16 @@ defmodule Akin.Levenshtein do
2122

2223
def compare([], string), do: length(string)
2324

24-
def compare(left, right) when is_binary(left) and is_binary(right) do
25-
distance = compare(String.graphemes(left), String.graphemes(right))
26-
1.0 - distance / Enum.max([String.length(left), String.length(right)])
27-
end
28-
2925
def compare(left, right)
3026
when is_list(left) and is_list(right) do
3127
rec_lev(left, right, :lists.seq(0, length(right)), 1)
3228
end
3329

30+
def compare(left, right) when is_binary(left) and is_binary(right) do
31+
distance = compare(String.graphemes(left), String.graphemes(right))
32+
1.0 - distance / Enum.max([String.length(left), String.length(right)])
33+
end
34+
3435
defp rec_lev([src_head | src_tail], right, distlist, step) do
3536
rec_lev(src_tail, right, lev_dist(right, distlist, src_head, [step], step), step + 1)
3637
end

lib/akin/algorithms/phonetic/word.ex

+3-4
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,8 @@ defmodule Word do
4646
end
4747

4848
def normalize(input) do
49-
String.normalize(input, :nfd)
50-
|> String.graphemes()
51-
|> Enum.reduce([], fn l, acc -> if Unicode.category(l) == "Mn", do: acc, else: [l | acc] end)
52-
|> Enum.reverse()
49+
input
50+
|> String.normalize(:nfd)
51+
|> Unicode.unaccent()
5352
end
5453
end

lib/akin/algorithms/substring_set.ex

+7-5
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ defmodule Akin.SubstringSet do
3939
) do
4040
case Strategy.determine(l_string, r_string) do
4141
:standard ->
42-
similarity(l_set, r_set) |> score(opts(opts, :level))
42+
similarity(l_set, r_set)
43+
|> score(opts(opts, :level))
4344

4445
{:substring, scale} ->
4546
score =
@@ -53,10 +54,11 @@ defmodule Akin.SubstringSet do
5354
end
5455
end
5556

56-
defp score(scores, "weak"), do: Enum.max(scores)
57-
58-
defp score(scores, _) do
59-
Enum.sum(scores) / (Enum.count(scores) - 1)
57+
defp score(scores, level) do
58+
case level do
59+
"weak" -> Enum.max(scores)
60+
_ -> Enum.sum(scores) / (Enum.count(scores) - 1)
61+
end
6062
end
6163

6264
defp similarity(left, right) do

lib/akin/util.ex

+1-51
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,6 @@ defmodule Akin.Util do
9393
|> :unicode.characters_to_nfd_binary()
9494
end
9595

96-
defp replace_accents({:error, string, _b}), do: string
97-
9896
defp replace_accents(string) do
9997
string = String.replace(string, ~r/\W/u, "")
10098

@@ -253,54 +251,6 @@ defmodule Akin.Util do
253251
|> Enum.sort()
254252
end
255253

256-
257-
# defp list_algorithms(metric, nil, _) do
258-
# Enum.reduce(@typed_algorithms, [], fn {name, m, _}, acc ->
259-
# if metric == m, do: [name | acc], else: = acc
260-
# end)
261-
# |> Enum.sort()
262-
# end
263-
264-
# defp list_algorithms(nil, unit, _) do
265-
# Enum.reduce(@typed_algorithms, [], fn {name, _, u}, acc ->
266-
# if metric == m, do: [name | acc], else: = acc
267-
# end)
268-
# |> Enum.sort()
269-
# end
270-
271-
# defp list_algorithms(nil, nil, []), do: @algorithms
272-
273-
# defp list_algorithms(metric, unit, []) do
274-
# Enum.filter(@typed_algorithms, fn {_, m, u} ->
275-
# unit == u && metric == m
276-
# end)
277-
# # ["bag_distance", "levenshtein", "jaro_winkler", "jaccard", "tversky", "sorensen_dice"]
278-
# end
279-
280-
# defp list_algorithms(_, _, algorithms) when is_list(algorithms) do
281-
# Enum.filter(algorithms, fn a -> a in @algorithms end)
282-
# end
283-
284-
# defp list_algorithms("string", "partial", []) do
285-
# ["substring_set", "substring_sort", "overlap", "ngram"]
286-
# end
287-
288-
# defp list_algorithms("string", _, []) do
289-
# list_algorithms("string", "whole", []) ++ list_algorithms("string", "partial", [])
290-
# end
291-
292-
# defp list_algorithms("phonetic", "whole", []) do
293-
# ["metaphone", "double_metaphone"]
294-
# end
295-
296-
# defp list_algorithms("phonetic", "partial", []) do
297-
# ["substring_double_metaphone"]
298-
# end
299-
300-
# defp list_algorithms("phonetic", _, []) do
301-
# list_algorithms("phonetic", "whole", []) ++ list_algorithms("phonetic", "partial", [])
302-
# end
303-
304254
@spec ngram_tokenize(any, any) :: list
305255
@doc """
306256
Tokenizes the input into N-grams (http://en.wikipedia.org/wiki/N-gram).
@@ -323,7 +273,7 @@ defmodule Akin.Util do
323273

324274
def ngram_tokenize(_), do: []
325275

326-
@spec opts(keyword(), atom()) :: integer() | boolean()
276+
@spec opts(keyword(), atom()) :: any()
327277
@doc """
328278
Take the value for the key from the options. If not present, use the default value from the default
329279
options list.

mix.exs

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ defmodule Akin.Mixfile do
3030
defp deps do
3131
[
3232
{:credo, "~> 1.0", only: :dev},
33+
{:dialyxir, "~> 1.3", only: [:dev], runtime: false},
3334
{:earmark, "~> 1.3", only: :dev},
3435
{:excoveralls, "~> 0.10", only: :test},
3536
{:ex_doc, "~> 0.19", only: :dev},

notebooks/examples.livemd

+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
# Akin Examples
2+
3+
## Akin
4+
5+
Akin is a collection of string comparison algorithms for Elixir. Algorithms can be called independently or combined to return a map of metrics. This library was built to facilitate the disambiguation of names but can be used to compare any two binaries.
6+
7+
## Algorithms
8+
9+
Utilities are provided to return all available algorithms.
10+
11+
```elixir
12+
Akin.Util.list_algorithms()
13+
```
14+
15+
**Note**: Hamming Distance is excluded as it only compares strings of equal length. To use the Hamming Distance algorithm, call it directly (see: [Independent Algorithms](#independent-algorithms)).
16+
17+
## Combined Algorithms
18+
19+
### Metrics
20+
21+
Results from all algorithms are returned as a map of metrics.
22+
23+
<!-- livebook:{"break_markdown":true} -->
24+
25+
#### Compare Strings
26+
27+
Experiment by changing the value of the strings.
28+
29+
```elixir
30+
a = "weird"
31+
b = "wierd"
32+
33+
Akin.compare(a, b)
34+
```
35+
36+
### Options
37+
38+
Comparison accepts options in a Keyword list.
39+
40+
1. `algorithms`: algorithms to use in comparison. Accepts the name or a keyword list. Default is algorithms/0.
41+
1. `metric` - algorithm metric. Default is both
42+
* "string": uses string algorithms
43+
* "phonetic": uses phonetic algorithms
44+
2. `unit` - algorithm unit. Default is both.
45+
* "whole": uses algorithms best suited for whole string comparison (distance)
46+
* "partial": uses algorithms best suited for partial string comparison (substring)
47+
2. `level` - level for double phonetic matching. Default is "normal".
48+
* "strict": both encodings for each string must match
49+
* "strong": the primary encoding for each string must match
50+
* "normal": the primary encoding of one string must match either encoding of other string (default)
51+
* "weak": either primary or secondary encoding of one string must match one encoding of other string
52+
3. `match_at`: an algorithm score equal to or above this value is considered a match. Default is 0.9
53+
4. `ngram_size`: number of contiguous letters to split strings into. Default is 2.
54+
5. `short_length`: qualifies as "short" to receive a shortness boost. Used by Name Metric. Default is 8.
55+
6. `stem`: boolean representing whether to compare the stemmed version of the strings; uses Stemmer. Default `false`
56+
57+
```elixir
58+
opts = [algorithms: ["bag_distance", "jaccard", "jaro_winkler"]]
59+
Akin.compare(a, b, opts)
60+
```
61+
62+
```elixir
63+
opts = [algorithms: [metric: "phonetic", unit: "whole"]]
64+
Akin.compare(a, b, opts)
65+
```
66+
67+
```elixir
68+
Akin.compare(a, b, algorithms: [metric: "string", unit: "whole"], ngram_size: 1)
69+
```
70+
71+
#### n-gram Size
72+
73+
The default ngram size for the algorithms is 2. You can change it by setting
74+
a value in opts.
75+
76+
```elixir
77+
opts = [algorithms: ["sorensen_dice"]]
78+
Akin.compare(a, b, opts)
79+
```
80+
81+
```elixir
82+
opts = [algorithms: ["sorensen_dice"], ngram_size: 1]
83+
Akin.compare(a, b, opts)
84+
```
85+
86+
#### Match Level
87+
88+
The default match strictness is "normal". You can change it by setting
89+
a value in opts. Currently it only affects the outcomes of the `substring_set` and
90+
`double_metaphone` algorithms
91+
92+
```elixir
93+
left = "Alice in Wonderland"
94+
right = "Alice's Adventures in Wonderland"
95+
96+
Akin.compare(left, right, algorithms: ["substring_set"])
97+
```
98+
99+
```elixir
100+
Akin.compare(left, right, algorithms: ["substring_set"], level: "weak")
101+
```
102+
103+
```elixir
104+
left = "which way"
105+
right = "whitch way"
106+
107+
Akin.compare(left, right, algorithms: ["double_metaphone"], level: "weak")
108+
```
109+
110+
```elixir
111+
Akin.compare(left, right, algorithms: ["double_metaphone"], level: "strict")
112+
```
113+
114+
#### Stems
115+
116+
Compare the stemmed version of two strings.
117+
118+
```elixir
119+
not_gerund = "write"
120+
gerund = "writing"
121+
122+
Akin.compare(not_gerund, gerund, algorithms: ["bag_distance", "double_metaphone"])
123+
```
124+
125+
```elixir
126+
Akin.compare(not_gerund, gerund, algorithms: ["bag_distance", "double_metaphone"], stem: true)
127+
```
128+
129+
### Preprocessing
130+
131+
Before being compared, strings are converted to downcase and unicode standard, whitespace is standardized, nontext (like punctuation & emojis) is replaced, and accents are converted. The string is then composed into a struct representing the corpus of data used by the comparison algorithms.
132+
133+
```elixir
134+
name = "Alice Liddell"
135+
136+
Akin.Util.compose(name)
137+
```
138+
139+
### Accents
140+
141+
```elixir
142+
name_a = "Hubert Łępicki"
143+
144+
Akin.Util.compose(name_a)
145+
```
146+
147+
```elixir
148+
name_b = "Hubert Lepicki"
149+
150+
Akin.compare(name_a, name_b)
151+
```
152+
153+
### Phonemes
154+
155+
```elixir
156+
Akin.phonemes(name)
157+
```
158+
159+
```elixir
160+
Akin.phonemes("wonderland")
161+
```
162+
163+
## Independent Algorithms
164+
165+
Each algorithm can be called directly. Module names are camelcased versions of the snakecased algorithm names returned by `list_algorithms/0`.
166+
167+
```elixir
168+
a = Akin.Util.compose("weird")
169+
b = Akin.Util.compose("wierd")
170+
Akin.BagDistance.compare(a, b)
171+
```
172+
173+
Hamming Distance is excluded from `list_algorithms/0` and the combined algorithm metrics as it only compares strings of equal length. To use the Hamming Distance algorithm, call it directly.
174+
175+
```elixir
176+
Akin.Hamming.compare("weird", "wierd")
177+
```

0 commit comments

Comments
 (0)