
Commit b081de0

Author: Harald Scheidl (committed)
renamed labels to chars to be more consistent with other repos
1 parent 62ae2b0 · commit b081de0

11 files changed: +88 −88 lines changed

README.md

+10 −10
@@ -23,18 +23,18 @@ import numpy as np
 from ctc_decoder import best_path, beam_search
 
 mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])
-labels = 'ab'
+chars = 'ab'
 
-print(f'Best path: "{best_path(mat, labels)}"')
-print(f'Beam search: "{beam_search(mat, labels)}"')
+print(f'Best path: "{best_path(mat, chars)}"')
+print(f'Beam search: "{beam_search(mat, chars)}"')
 ````
 
 The output `mat` (numpy array, softmax already applied) of the CTC-trained neural network is expected to have shape TxC
 and is passed as the first argument to the decoders.
 T is the number of time-steps, and C the number of characters (the CTC-blank is the last element).
-The labels predicted by the neural network are passed as the `labels` string to the decoder.
+The characters that can be predicted by the neural network are passed as the `chars` string to the decoder.
 Decoders return the decoded string.
-This should output:
+Running the code outputs:
 
 ````
 Best path: ""
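
A minimal sketch of why the two decoders disagree on this matrix, with the alignment probabilities worked out by hand:

````python
import numpy as np
from itertools import groupby

mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])  # TxC, CTC-blank is the last column
chars = 'ab'
blank_idx = len(chars)

# best path: greedy argmax per time-step, then collapse repeats and drop blanks
best = np.argmax(mat, axis=1)  # -> [2, 2], i.e. blank, blank
print(''.join(chars[k] for k, _ in groupby(best) if k != blank_idx))  # -> ''

# beam search instead sums over all alignments that collapse to a labeling:
# P('a') = P('aa') + P('a-') + P('-a') = 0.16 + 0.24 + 0.24 = 0.64
# P('')  = P('--') = 0.36
# so beam search returns 'a' even though each single best path is all-blank
````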
@@ -48,17 +48,17 @@ please have a look at the scripts in the `tests/` folder.
 
 ### Language model and BK-tree
 
-Beam search can use a character-level language model.
+Beam search can optionally integrate a character-level language model.
 Text statistics (bigrams) are used by beam search to improve reading accuracy.
 
 ````python
 from ctc_decoder import beam_search, LanguageModel
 
 # create language model instance from a (large) text
-lm = LanguageModel('this is some text', labels)
+lm = LanguageModel('this is some text', chars)
 
 # and use it in the beam search decoder
-res = beam_search(mat, labels, lm=lm)
+res = beam_search(mat, chars, lm=lm)
 ````
 
 The lexicon search decoder computes a first approximation with best path decoding.
@@ -73,7 +73,7 @@ from ctc_decoder import lexicon_search, BKTree
 bk_tree = BKTree(['words', 'from', 'a', 'dictionary'])
 
 # and use the tree in the lexicon search
-res = lexicon_search(mat, labels, bk_tree, tolerance=2)
+res = lexicon_search(mat, chars, bk_tree, tolerance=2)
 ````
 
 ### Usage with deep learning frameworks
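
The `tolerance` argument is an edit-distance bound: `bk_tree.query` (see `ctc_decoder/lexicon_search.py` below) returns every dictionary word within that Levenshtein distance of the best-path approximation. A small illustrative sketch; `edit_distance` and the query string `'wrds'` are hypothetical, not the repo's implementation:

````python
def edit_distance(a: str, b: str) -> int:
    """Plain dynamic-programming Levenshtein distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

words = ['words', 'from', 'a', 'dictionary']
approx = 'wrds'  # e.g. a best-path result with one character missing
print([w for w in words if edit_distance(approx, w) <= 2])  # -> ['words']
````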
@@ -100,7 +100,7 @@ Other decoders, from my experience not really suited for practical purposes,
 but might be used for experiments or research:
 * `prefix_search`: prefix search decoder
 * `token_passing`: token passing algorithm
-* Best path decoder implementation using OpenCL (see `extras/` folder)
+* Best path decoder implementation in OpenCL (see `extras/` folder)
 
 [This paper](./doc/comparison.pdf) gives suggestions when to use best path decoding, beam search decoding and token passing.
 

ctc_decoder/beam_search.py

+9 −9
@@ -36,11 +36,11 @@ def sort(self):
         return [x.labeling for x in sorted_beams]
 
 
-def apply_lm(parent_beam, child_beam, labels, lm):
+def apply_lm(parent_beam, child_beam, chars, lm):
     """Calculate LM score of child beam by taking score from parent beam and bigram probability of last two chars."""
     if lm and not child_beam.lm_applied:
-        c1 = labels[parent_beam.labeling[-1] if parent_beam.labeling else labels.index(' ')]  # first char
-        c2 = labels[child_beam.labeling[-1]]  # second char
+        c1 = chars[parent_beam.labeling[-1] if parent_beam.labeling else chars.index(' ')]  # first char
+        c2 = chars[child_beam.labeling[-1]]  # second char
         lm_factor = 0.01  # influence of language model
         bigram_prob = lm.get_char_bigram(c1, c2) ** lm_factor
         child_beam.pr_text = parent_beam.pr_text * bigram_prob  # probability of char sequence
@@ -53,22 +53,22 @@ def add_beam(beam_state, labeling):
     beam_state.entries[labeling] = BeamEntry()
 
 
-def beam_search(mat: np.ndarray, labels: str, beam_width: int = 25, lm: Optional[LanguageModel] = None) -> str:
+def beam_search(mat: np.ndarray, chars: str, beam_width: int = 25, lm: Optional[LanguageModel] = None) -> str:
     """Beam search decoder.
 
     See the paper of Hwang et al. and the paper of Graves et al.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
         beam_width: Number of beams kept per iteration.
         lm: Character level language model if specified.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, max_C = mat.shape
 
     # initialise beam state
@@ -129,7 +129,7 @@ def beam_search(mat: np.ndarray, labels: str, beam_width: int = 25, lm: Optional
                     curr.entries[new_labeling].pr_total += pr_non_blank
 
                     # apply LM
-                    apply_lm(curr.entries[labeling], curr.entries[new_labeling], labels, lm)
+                    apply_lm(curr.entries[labeling], curr.entries[new_labeling], chars, lm)
 
         # set new beam state
         last = curr
@@ -140,6 +140,6 @@ def beam_search(mat: np.ndarray, labels: str, beam_width: int = 25, lm: Optional
     # sort by probability
     best_labeling = last.sort()[0]  # get most probable labeling
 
-    # map labels to chars
-    res = ''.join([labels[l] for l in best_labeling])
+    # map label string to char string
+    res = ''.join([chars[l] for l in best_labeling])
     return res
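
A note on the `lm_factor = 0.01` in `apply_lm`: raising the bigram probability to a small power compresses it towards 1, so the language model only nudges `pr_text` instead of dominating the optical score. A quick numeric sketch with illustrative values:

````python
lm_factor = 0.01  # influence of language model, as in apply_lm above
for bigram_prob in (0.5, 0.1, 0.01):
    print(f'{bigram_prob} ** {lm_factor} = {bigram_prob ** lm_factor:.4f}')
# 0.5 ** 0.01 = 0.9931
# 0.1 ** 0.01 = 0.9772
# 0.01 ** 0.01 = 0.9550
````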

ctc_decoder/best_path.py

+4 −4
@@ -3,15 +3,15 @@
 import numpy as np
 
 
-def best_path(mat: np.ndarray, labels: str) -> str:
+def best_path(mat: np.ndarray, chars: str) -> str:
     """Best path (greedy) decoder.
 
     Take best-scoring character per time-step, then remove repeated characters and CTC blank characters.
     See dissertation of Graves, p63.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The decoded text.
@@ -21,7 +21,7 @@ def best_path(mat: np.ndarray, labels: str) -> str:
     best_path_indices = np.argmax(mat, axis=1)
 
     # collapse best path (using itertools.groupby), map to chars, join char list to string
-    blank_idx = len(labels)
-    best_chars_collapsed = [labels[k] for k, _ in groupby(best_path_indices) if k != blank_idx]
+    blank_idx = len(chars)
+    best_chars_collapsed = [chars[k] for k, _ in groupby(best_path_indices) if k != blank_idx]
     res = ''.join(best_chars_collapsed)
     return res
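
The collapse step carries all the semantics of best path decoding; a minimal sketch with `chars = 'ab'`, so the blank index is 2:

````python
from itertools import groupby

chars = 'ab'
blank_idx = len(chars)

def collapse(indices):
    """Merge repeats, then drop blanks, as in best_path above."""
    return ''.join(chars[k] for k, _ in groupby(indices) if k != blank_idx)

print(collapse([0, 0, 2, 1]))  # -> 'ab' (repeated 'a' merges)
print(collapse([0, 2, 0]))     # -> 'aa' (blank keeps the two a's apart)
print(collapse([2, 2, 2]))     # -> ''   (all blanks decode to empty text)
````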

ctc_decoder/common.py

+4 −4
@@ -1,13 +1,13 @@
 def extend_by_blanks(seq, b):
-    "extends a label seq. by adding blanks at the beginning, end and in between each label"
+    """Extend a label seq. by adding blanks at the beginning, end and in between each label."""
     res = [b]
     for s in seq:
         res.append(s)
         res.append(b)
     return res
 
 
-def word_to_label_seq(w, labels):
-    "map a word to a sequence of labels (indices)"
-    res = [labels.index(c) for c in w]
+def word_to_label_seq(w, chars):
+    """Map a word (string of characters) to a sequence of labels (indices)."""
+    res = [chars.index(c) for c in w]
     return res
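
A small sketch of what these two helpers produce for the word 'for', assuming `chars` is the lowercase alphabet so that the blank index is 26 (matching the `_f_o_r_` pattern referenced in `token_passing.py` below):

````python
chars = 'abcdefghijklmnopqrstuvwxyz'
blank = len(chars)  # 26

seq = [chars.index(c) for c in 'for']  # word_to_label_seq -> [5, 14, 17]

extended = [blank]                     # extend_by_blanks
for s in seq:
    extended.append(s)
    extended.append(blank)
print(extended)  # -> [26, 5, 26, 14, 26, 17, 26], the _f_o_r_ pattern
````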

ctc_decoder/language_model.py

+4 −4
@@ -1,16 +1,16 @@
 class LanguageModel:
     "Simple character-level language model."
 
-    def __init__(self, txt: str, labels: str) -> None:
+    def __init__(self, txt: str, chars: str) -> None:
         """Create language model from text corpus."""
         txt = ' ' + txt + ' '  # ensure first/last characters appear next to whitespace
-        self._init_char_bigrams(txt, labels)
+        self._init_char_bigrams(txt, chars)
 
-    def _init_char_bigrams(self, txt, labels):
+    def _init_char_bigrams(self, txt: str, chars: str) -> None:
         """Initialize table of character bigrams."""
 
         # init bigrams with 0 values
-        self.bigram = {c: {d: 0 for d in labels} for c in labels}
+        self.bigram = {c: {d: 0 for d in chars} for c in chars}
 
         # go through text and add each char bigram
         for i in range(len(txt) - 1):
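
For intuition, a minimal sketch of the counting this method performs; the dict layout mirrors `self.bigram` above and the whitespace padding matches `__init__`. How the raw counts become the probabilities returned by `get_char_bigram` is not shown in this diff, so only the counting is sketched:

````python
txt = ' this is some text '  # corpus padded with whitespace, as in __init__
chars = 'abcdefghijklmnopqrstuvwxyz '

bigram = {c: {d: 0 for d in chars} for c in chars}  # init with 0 values
for c1, c2 in zip(txt, txt[1:]):                    # each adjacent char pair
    bigram[c1][c2] += 1

print(bigram['t']['h'])  # -> 1 ('th' occurs once in the corpus)
print(bigram['i']['s'])  # -> 2 ('is' occurs twice)
````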

ctc_decoder/lexicon_search.py

+4 −4
@@ -5,7 +5,7 @@
 from ctc_decoder.loss import probability
 
 
-def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int) -> str:
+def lexicon_search(mat: np.ndarray, chars: str, bk_tree: BKTree, tolerance: int) -> str:
     """Lexicon search decoder.
 
     The algorithm computes a first approximation using best path decoding. Similar words are queried using the BK tree.
@@ -14,7 +14,7 @@ def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
         bk_tree: Instance of BKTree which is used to query similar words.
         tolerance: Words to be considered, which are within specified edit distance.
 
@@ -23,7 +23,7 @@ def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int
     """
 
     # use best path decoding to get an approximation
-    approx = best_path(mat, labels)
+    approx = best_path(mat, chars)
 
     # get similar words from dictionary within given tolerance
     words = bk_tree.query(approx, tolerance)
@@ -33,6 +33,6 @@ def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int
         return ''
 
     # else compute probabilities of all similar words and return best scoring one
-    word_probs = [(w, probability(mat, w, labels)) for w in words]
+    word_probs = [(w, probability(mat, w, chars)) for w in words]
     word_probs.sort(key=lambda x: x[1], reverse=True)
     return word_probs[0][0]

ctc_decoder/loss.py

+7 −7
@@ -51,23 +51,23 @@ def empty_cache(max_T, labeling_with_blanks):
     return [[None for _ in range(len(labeling_with_blanks))] for _ in range(max_T)]
 
 
-def probability(mat: np.ndarray, gt: str, labels: str) -> float:
+def probability(mat: np.ndarray, gt: str, chars: str) -> float:
     """Compute probability of ground truth text gt given neural network output mat.
 
     See the CTC Forward-Backward Algorithm in Graves paper.
 
     Args:
         mat: Output of neural network of shape TxC.
         gt: Ground truth text.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The probability of the text given the neural network output.
     """
 
     max_T, _ = mat.shape  # size of input matrix
-    blank = len(labels)  # index of blank label
-    labeling_with_blanks = common.extend_by_blanks(common.word_to_label_seq(gt, labels), blank)
+    blank = len(chars)  # index of blank label
+    labeling_with_blanks = common.extend_by_blanks(common.word_to_label_seq(gt, chars), blank)
     cache = empty_cache(max_T, labeling_with_blanks)
 
     p1 = recursive_probability(max_T - 1, len(labeling_with_blanks) - 1, mat, labeling_with_blanks, blank, cache)
@@ -76,21 +76,21 @@ def probability(mat: np.ndarray, gt: str, labels: str) -> float:
     return p
 
 
-def loss(mat: np.ndarray, gt: str, labels: str) -> float:
+def loss(mat: np.ndarray, gt: str, chars: str) -> float:
     """Compute loss of ground truth text gt given neural network output mat.
 
     See the CTC Forward-Backward Algorithm in Graves paper.
 
     Args:
         mat: Output of neural network of shape TxC.
         gt: Ground truth text.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The probability of the text given the neural network output.
     """
 
     try:
-        return -math.log(probability(mat, gt, labels))
+        return -math.log(probability(mat, gt, chars))
     except ValueError:
         return float('inf')
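
Tying this back to the README example: for `gt = 'a'` and the 2x3 matrix from above, the forward-backward sum runs over the three alignments that collapse to 'a', and the loss is the negative log of that sum (a probability of 0 would raise `ValueError` in `math.log`, hence the `float('inf')` fallback). A hand computation:

````python
import math

# alignments of 'a' over two time-steps: 'aa', 'a-', '-a'
p = 0.4 * 0.4 + 0.4 * 0.6 + 0.6 * 0.4
print(p)             # -> 0.64
print(-math.log(p))  # -> 0.446..., the CTC loss of 'a' for this matrix
````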

ctc_decoder/prefix_search.py

+10 −10
@@ -1,20 +1,20 @@
 import numpy as np
 
 
-def prefix_search(mat: np.ndarray, labels: str) -> str:
+def prefix_search(mat: np.ndarray, chars: str) -> str:
     """Prefix search decoding.
 
     See dissertation of Graves, p63-66.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, max_C = mat.shape
 
     # g_n and g_b: gamma in paper
@@ -41,17 +41,17 @@ def prefix_search(mat: np.ndarray, labels: str) -> str:
     while prob_ext[y_star] > prob[l_star]:
         prob_remaining = prob_ext[y_star]
 
-        # for all labels
+        # for all chars
         for k in range(max_C - 1):
-            y = y_star + labels[k]
+            y = y_star + chars[k]
             g_n[0][y] = mat[0, k] if len(y_star) == 0 else 0
             g_b[0][y] = 0
             prefix_prob = g_n[0][y]
 
             # for all time steps
             for t in range(1, max_T):
                 new_label_prob = g_b[t - 1][y_star] + (
-                    0 if y_star != '' and y_star[-1] == labels[k] else g_n[t - 1][y_star])
+                    0 if y_star != '' and y_star[-1] == chars[k] else g_n[t - 1][y_star])
                 g_n[t][y] = mat[t, k] * (new_label_prob + g_n[t - 1][y])
                 g_b[t][y] = mat[t, blank_idx] * (g_b[t - 1][y] + g_n[t - 1][y])
                 prefix_prob += mat[t, k] * new_label_prob
@@ -87,20 +87,20 @@ def prefix_search(mat: np.ndarray, labels: str) -> str:
     return l_star
 
 
-def prefix_search_heuristic_split(mat: np.ndarray, labels: str) -> str:
+def prefix_search_heuristic_split(mat: np.ndarray, chars: str) -> str:
     """Prefix search decoding with heuristic to speed up the algorithm.
 
     Speed up prefix computation by splitting sequence into subsequences as described by Graves (p66).
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, _ = mat.shape
 
     # split sequence into 3 subsequences, splitting points should be roughly placed at 1/3 and 2/3
@@ -124,6 +124,6 @@ def prefix_search_heuristic_split(mat: np.ndarray, labels: str) -> str:
     for i in range(len(ranges) - 1):
         beg = ranges[i]
         end = ranges[i + 1]
-        res += prefix_search(mat[beg: end, :], labels)
+        res += prefix_search(mat[beg: end, :], chars)
 
     return res
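
For completeness, a hedged usage sketch: on the tiny README matrix, prefix search should agree with beam search, since it expands the most promising labelings first and P('a') = 0.64 beats the 0.36 of the empty labeling. This assumes `prefix_search` is importable from the package top level like the decoders shown in the README:

````python
import numpy as np
from ctc_decoder import prefix_search

mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])
print(prefix_search(mat, 'ab'))  # expected: 'a'
````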

ctc_decoder/token_passing.py

+4 −4
@@ -55,29 +55,29 @@ def log(val):
         return float('-inf')
 
 
-def token_passing(mat: np.ndarray, labels: str, words: List[str]) -> str:
+def token_passing(mat: np.ndarray, chars: str, words: List[str]) -> str:
     """Token passing algorithm.
 
     See dissertation of Graves, p67-69.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
         words: List of words that can be recognized.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, _ = mat.shape
 
     # special s index for beginning and end of word
     beg = 0
     end = -1
 
     # map characters to labels for each word
-    label_words = [common.word_to_label_seq(w, labels) for w in words]
+    label_words = [common.word_to_label_seq(w, chars) for w in words]
 
     # w' in paper: word with blanks in front, back and between labels: for -> _f_o_r_
     prime_words = [common.extend_by_blanks(w, blank_idx) for w in label_words]
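
A note on the `log` helper shown above (it maps probability 0 to `float('-inf')`): token passing scores whole word sequences in the log domain because products of many per-frame probabilities underflow. A quick numeric sketch:

````python
import math

p = 1e-3                   # a typical small per-frame probability
print(p ** 200)            # -> 0.0, the product of 200 such factors underflows
print(200 * math.log(p))   # -> -1381.55..., the log-domain sum stays representable
````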
