Skip to content

Commit 86aceff

Browse files
committed
add untracked file
1 parent 48c9960 commit 86aceff

File tree

14 files changed

+3949
-0
lines changed

14 files changed

+3949
-0
lines changed

common/misc/diffseq.h

Lines changed: 532 additions & 0 deletions
Large diffs are not rendered by default.

common/misc/fstrcmp.c

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
/* Functions to make fuzzy comparisons between strings
2+
Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2019 Free
3+
Software Foundation, Inc.
4+
5+
This program is free software: you can redistribute it and/or modify
6+
it under the terms of the GNU General Public License as published by
7+
the Free Software Foundation; either version 3 of the License, or
8+
(at your option) any later version.
9+
10+
This program is distributed in the hope that it will be useful,
11+
but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
GNU General Public License for more details.
14+
15+
You should have received a copy of the GNU General Public License
16+
along with this program. If not, see <https://www.gnu.org/licenses/>. */
17+
18+
19+
#include <config.h>
20+
21+
/* Specification. */
22+
#include "fstrcmp.h"
23+
24+
#include <string.h>
25+
#include <stdbool.h>
26+
#include <stddef.h>
27+
#include <stdio.h>
28+
#include <stdint.h>
29+
#include <stdlib.h>
30+
#include <limits.h>
31+
32+
#include "glthread/lock.h"
33+
#include "glthread/tls.h"
34+
#include "minmax.h"
35+
#include "xalloc.h"
36+
37+
38+
#define ELEMENT char
39+
#define EQUAL(x,y) ((x) == (y))
40+
#define OFFSET ptrdiff_t
41+
#define EXTRA_CONTEXT_FIELDS \
42+
/* The number of edits beyond which the computation can be aborted. */ \
43+
ptrdiff_t edit_count_limit; \
44+
/* The number of edits (= number of elements inserted, plus the number of \
45+
elements deleted), temporarily minus edit_count_limit. */ \
46+
ptrdiff_t edit_count;
47+
#define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++
48+
#define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++
49+
#define EARLY_ABORT(ctxt) ctxt->edit_count > 0
50+
/* We don't need USE_HEURISTIC, since it is unlikely in typical uses of
51+
fstrcmp(). */
52+
#include "diffseq.h"
53+
54+
55+
/* Because fstrcmp is typically called multiple times, attempt to minimize
56+
the number of memory allocations performed. Thus, let a call reuse the
57+
memory already allocated by the previous call, if it is sufficient.
58+
To make it multithread-safe, without need for a lock that protects the
59+
already allocated memory, store the allocated memory per thread. Free
60+
it only when the thread exits. */
61+
62+
static gl_tls_key_t buffer_key; /* TLS key for a 'ptrdiff_t *' */
63+
static gl_tls_key_t bufmax_key; /* TLS key for a 'uintptr_t' */
64+
65+
static void
66+
keys_init (void)
67+
{
68+
gl_tls_key_init (buffer_key, free);
69+
gl_tls_key_init (bufmax_key, NULL);
70+
/* The per-thread initial values are NULL and 0, respectively. */
71+
}
72+
73+
/* Ensure that keys_init is called once only. */
74+
gl_once_define(static, keys_init_once)
75+
76+
77+
/* In the code below, branch probabilities were measured by Ralf Wildenhues,
78+
by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many
79+
values of LL. The probability indicates that the condition evaluates
80+
to true; whether that leads to a branch or a non-branch in the code,
81+
depends on the compiler's reordering of basic blocks. */
82+
83+
84+
double
85+
fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
86+
{
87+
struct context ctxt;
88+
size_t xvec_length = strlen (string1);
89+
size_t yvec_length = strlen (string2);
90+
size_t length_sum = xvec_length + yvec_length;
91+
ptrdiff_t i;
92+
93+
ptrdiff_t fdiag_len;
94+
ptrdiff_t *buffer;
95+
uintptr_t bufmax;
96+
97+
/* short-circuit obvious comparisons */
98+
if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */
99+
return length_sum == 0;
100+
101+
if (! (xvec_length <= length_sum
102+
&& length_sum <= MIN (UINTPTR_MAX, PTRDIFF_MAX) - 3))
103+
xalloc_die ();
104+
105+
if (lower_bound > 0)
106+
{
107+
/* Compute a quick upper bound.
108+
Each edit is an insertion or deletion of an element, hence modifies
109+
the length of the sequence by at most 1.
110+
Therefore, when starting from a sequence X and ending at a sequence Y,
111+
with N edits, | yvec_length - xvec_length | <= N. (Proof by
112+
induction over N.)
113+
So, at the end, we will have
114+
edit_count >= | xvec_length - yvec_length |.
115+
and hence
116+
result
117+
= (xvec_length + yvec_length - edit_count)
118+
/ (xvec_length + yvec_length)
119+
<= (xvec_length + yvec_length - | yvec_length - xvec_length |)
120+
/ (xvec_length + yvec_length)
121+
= 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
122+
*/
123+
ptrdiff_t length_min = MIN (xvec_length, yvec_length);
124+
volatile double upper_bound = 2.0 * length_min / length_sum;
125+
126+
if (upper_bound < lower_bound) /* Prob: 74% */
127+
/* Return an arbitrary value < LOWER_BOUND. */
128+
return 0.0;
129+
130+
#if CHAR_BIT <= 8
131+
/* When X and Y are both small, avoid the overhead of setting up an
132+
array of size 256. */
133+
if (length_sum >= 20) /* Prob: 99% */
134+
{
135+
/* Compute a less quick upper bound.
136+
Each edit is an insertion or deletion of a character, hence
137+
modifies the occurrence count of a character by 1 and leaves the
138+
other occurrence counts unchanged.
139+
Therefore, when starting from a sequence X and ending at a
140+
sequence Y, and denoting the occurrence count of C in X with
141+
OCC (X, C), with N edits,
142+
sum_C | OCC (X, C) - OCC (Y, C) | <= N.
143+
(Proof by induction over N.)
144+
So, at the end, we will have
145+
edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |,
146+
and hence
147+
result
148+
= (xvec_length + yvec_length - edit_count)
149+
/ (xvec_length + yvec_length)
150+
<= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |)
151+
/ (xvec_length + yvec_length).
152+
*/
153+
ptrdiff_t occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */
154+
ptrdiff_t sum;
155+
double dsum;
156+
157+
/* Determine the occurrence counts in X. */
158+
memset (occ_diff, 0, sizeof (occ_diff));
159+
for (i = xvec_length - 1; i >= 0; i--)
160+
occ_diff[(unsigned char) string1[i]]++;
161+
/* Subtract the occurrence counts in Y. */
162+
for (i = yvec_length - 1; i >= 0; i--)
163+
occ_diff[(unsigned char) string2[i]]--;
164+
/* Sum up the absolute values. */
165+
sum = 0;
166+
for (i = 0; i <= UCHAR_MAX; i++)
167+
{
168+
ptrdiff_t d = occ_diff[i];
169+
sum += (d >= 0 ? d : -d);
170+
}
171+
172+
dsum = sum;
173+
upper_bound = 1.0 - dsum / length_sum;
174+
175+
if (upper_bound < lower_bound) /* Prob: 66% */
176+
/* Return an arbitrary value < LOWER_BOUND. */
177+
return 0.0;
178+
}
179+
#endif
180+
}
181+
182+
/* set the info for each string. */
183+
ctxt.xvec = string1;
184+
ctxt.yvec = string2;
185+
186+
/* Set TOO_EXPENSIVE to be approximate square root of input size,
187+
bounded below by 4096. */
188+
ctxt.too_expensive = 1;
189+
for (i = xvec_length + yvec_length; i != 0; i >>= 2)
190+
ctxt.too_expensive <<= 1;
191+
if (ctxt.too_expensive < 4096)
192+
ctxt.too_expensive = 4096;
193+
194+
/* Allocate memory for fdiag and bdiag from a thread-local pool. */
195+
fdiag_len = length_sum + 3;
196+
gl_once (keys_init_once, keys_init);
197+
buffer = gl_tls_get (buffer_key);
198+
bufmax = (uintptr_t) gl_tls_get (bufmax_key);
199+
if (fdiag_len > bufmax)
200+
{
201+
/* Need more memory. */
202+
bufmax = 2 * bufmax;
203+
if (fdiag_len > bufmax)
204+
bufmax = fdiag_len;
205+
/* Calling xrealloc would be a waste: buffer's contents does not need
206+
to be preserved. */
207+
free (buffer);
208+
buffer = xnmalloc (bufmax, 2 * sizeof *buffer);
209+
gl_tls_set (buffer_key, buffer);
210+
gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
211+
}
212+
ctxt.fdiag = buffer + yvec_length + 1;
213+
ctxt.bdiag = ctxt.fdiag + fdiag_len;
214+
215+
/* The edit_count is only ever increased. The computation can be aborted
216+
when
217+
(xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length)
218+
< lower_bound,
219+
or equivalently
220+
edit_count > (xvec_length + yvec_length) * (1 - lower_bound)
221+
or equivalently
222+
edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)).
223+
We need to add an epsilon inside the floor(...) argument, to neutralize
224+
rounding errors. */
225+
ctxt.edit_count_limit =
226+
(lower_bound < 1.0
227+
? (ptrdiff_t) (length_sum * (1.0 - lower_bound + 0.000001))
228+
: 0);
229+
230+
/* Now do the main comparison algorithm */
231+
ctxt.edit_count = - ctxt.edit_count_limit;
232+
if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */
233+
/* The edit_count passed the limit. Hence the result would be
234+
< lower_bound. We can return any value < lower_bound instead. */
235+
return 0.0;
236+
ctxt.edit_count += ctxt.edit_count_limit;
237+
238+
/* The result is
239+
((number of chars in common) / (average length of the strings)).
240+
The numerator is
241+
= xvec_length - (number of calls to NOTE_DELETE)
242+
= yvec_length - (number of calls to NOTE_INSERT)
243+
= 1/2 * (xvec_length + yvec_length - (number of edits)).
244+
This is admittedly biased towards finding that the strings are
245+
similar, however it does produce meaningful results. */
246+
return ((double) (xvec_length + yvec_length - ctxt.edit_count)
247+
/ (xvec_length + yvec_length));
248+
}

common/misc/fstrcmp.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/* Fuzzy string comparison.
2+
Copyright (C) 1995, 2000, 2002-2003, 2006, 2008-2019 Free Software
3+
Foundation, Inc.
4+
5+
This file was written by Peter Miller <[email protected]>
6+
7+
This program is free software: you can redistribute it and/or modify
8+
it under the terms of the GNU General Public License as published by
9+
the Free Software Foundation; either version 3 of the License, or
10+
(at your option) any later version.
11+
12+
This program is distributed in the hope that it will be useful,
13+
but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
GNU General Public License for more details.
16+
17+
You should have received a copy of the GNU General Public License
18+
along with this program. If not, see <https://www.gnu.org/licenses/>. */
19+
20+
#ifndef _FSTRCMP_H
21+
#define _FSTRCMP_H
22+
23+
#ifdef __cplusplus
24+
extern "C" {
25+
#endif
26+
27+
/* Fuzzy compare of S1 and S2. Return a measure for the similarity of S1
28+
and S1. The higher the result, the more similar the strings are.
29+
The result is bounded between 0 (meaning very dissimilar strings) and
30+
1 (meaning identical strings). */
31+
extern double fstrcmp (const char *s1, const char *s2);
32+
33+
/* Like fstrcmp (S1, S2), except that if the result is < LOWER_BOUND, an
34+
arbitrary other value < LOWER_BOUND can be returned. */
35+
extern double fstrcmp_bounded (const char *s1, const char *s2,
36+
double lower_bound);
37+
38+
/* A shortcut for fstrcmp. Avoids a function call. */
39+
#define fstrcmp(s1,s2) fstrcmp_bounded (s1, s2, 0.0)
40+
41+
#ifdef __cplusplus
42+
}
43+
#endif
44+
45+
#endif

0 commit comments

Comments
 (0)