
Commit b081de0

Author: Harald Scheidl (committed)
renamed labels to chars to be more consistent with other repos
1 parent 62ae2b0 · commit b081de0

11 files changed: +88 −88 lines changed

README.md

+10 −10
@@ -23,18 +23,18 @@ import numpy as np
 from ctc_decoder import best_path, beam_search
 
 mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])
-labels = 'ab'
+chars = 'ab'
 
-print(f'Best path: "{best_path(mat, labels)}"')
-print(f'Beam search: "{beam_search(mat, labels)}"')
+print(f'Best path: "{best_path(mat, chars)}"')
+print(f'Beam search: "{beam_search(mat, chars)}"')
 ````
 
 The output `mat` (numpy array, softmax already applied) of the CTC-trained neural network is expected to have shape TxC
 and is passed as the first argument to the decoders.
 T is the number of time-steps, and C the number of characters (the CTC-blank is the last element).
-The labels predicted by the neural network are passed as the `labels` string to the decoder.
+The characters that can be predicted by the neural network are passed as the `chars` string to the decoder.
 Decoders return the decoded string.
-This should output:
+Running the code outputs:
 
 ````
 Best path: ""
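
A minimal sketch of why the two decoders disagree on this matrix, with the alignment probabilities worked out by hand:

````python
import numpy as np
from itertools import groupby

mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])  # TxC, CTC-blank is the last column
chars = 'ab'
blank_idx = len(chars)

# best path: greedy argmax per time-step, then collapse repeats and drop blanks
best = np.argmax(mat, axis=1)  # -> [2, 2], i.e. blank, blank
print(''.join(chars[k] for k, _ in groupby(best) if k != blank_idx))  # -> ''

# beam search instead sums over all alignments that collapse to a labeling:
# P('a') = P('aa') + P('a-') + P('-a') = 0.16 + 0.24 + 0.24 = 0.64
# P('')  = P('--') = 0.36
# so beam search returns 'a' even though each single best path is all-blank
````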
@@ -48,17 +48,17 @@ please have a look at the scripts in the `tests/` folder.
 
 ### Language model and BK-tree
 
-Beam search can use a character-level language model.
+Beam search can optionally integrate a character-level language model.
 Text statistics (bigrams) are used by beam search to improve reading accuracy.
 
 ````python
 from ctc_decoder import beam_search, LanguageModel
 
 # create language model instance from a (large) text
-lm = LanguageModel('this is some text', labels)
+lm = LanguageModel('this is some text', chars)
 
 # and use it in the beam search decoder
-res = beam_search(mat, labels, lm=lm)
+res = beam_search(mat, chars, lm=lm)
 ````
 
 The lexicon search decoder computes a first approximation with best path decoding.
@@ -73,7 +73,7 @@ from ctc_decoder import lexicon_search, BKTree
 bk_tree = BKTree(['words', 'from', 'a', 'dictionary'])
 
 # and use the tree in the lexicon search
-res = lexicon_search(mat, labels, bk_tree, tolerance=2)
+res = lexicon_search(mat, chars, bk_tree, tolerance=2)
 ````
 
 ### Usage with deep learning frameworks
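
The `tolerance` argument is an edit-distance bound: `bk_tree.query` (see `ctc_decoder/lexicon_search.py` below) returns every dictionary word within that Levenshtein distance of the best-path approximation. A small illustrative sketch; `edit_distance` and the query string `'wrds'` are hypothetical, not the repo's implementation:

````python
def edit_distance(a: str, b: str) -> int:
    """Plain dynamic-programming Levenshtein distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

words = ['words', 'from', 'a', 'dictionary']
approx = 'wrds'  # e.g. a best-path result with one character missing
print([w for w in words if edit_distance(approx, w) <= 2])  # -> ['words']
````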
@@ -100,7 +100,7 @@ Other decoders, from my experience not really suited for practical purposes,
 but might be used for experiments or research:
 * `prefix_search`: prefix search decoder
 * `token_passing`: token passing algorithm
-* Best path decoder implementation using OpenCL (see `extras/` folder)
+* Best path decoder implementation in OpenCL (see `extras/` folder)
 
 [This paper](./doc/comparison.pdf) gives suggestions when to use best path decoding, beam search decoding and token passing.
 

ctc_decoder/beam_search.py

+9 −9
@@ -36,11 +36,11 @@ def sort(self):
         return [x.labeling for x in sorted_beams]
 
 
-def apply_lm(parent_beam, child_beam, labels, lm):
+def apply_lm(parent_beam, child_beam, chars, lm):
     """Calculate LM score of child beam by taking score from parent beam and bigram probability of last two chars."""
     if lm and not child_beam.lm_applied:
-        c1 = labels[parent_beam.labeling[-1] if parent_beam.labeling else labels.index(' ')]  # first char
-        c2 = labels[child_beam.labeling[-1]]  # second char
+        c1 = chars[parent_beam.labeling[-1] if parent_beam.labeling else chars.index(' ')]  # first char
+        c2 = chars[child_beam.labeling[-1]]  # second char
         lm_factor = 0.01  # influence of language model
         bigram_prob = lm.get_char_bigram(c1, c2) ** lm_factor
         child_beam.pr_text = parent_beam.pr_text * bigram_prob  # probability of char sequence
@@ -53,22 +53,22 @@ def add_beam(beam_state, labeling):
     beam_state.entries[labeling] = BeamEntry()
 
 
-def beam_search(mat: np.ndarray, labels: str, beam_width: int = 25, lm: Optional[LanguageModel] = None) -> str:
+def beam_search(mat: np.ndarray, chars: str, beam_width: int = 25, lm: Optional[LanguageModel] = None) -> str:
     """Beam search decoder.
 
     See the paper of Hwang et al. and the paper of Graves et al.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
         beam_width: Number of beams kept per iteration.
         lm: Character level language model if specified.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, max_C = mat.shape
 
     # initialise beam state
@@ -129,7 +129,7 @@ def beam_search(mat: np.ndarray, labels: str, beam_width: int = 25, lm: Optional
                     curr.entries[new_labeling].pr_total += pr_non_blank
 
                     # apply LM
-                    apply_lm(curr.entries[labeling], curr.entries[new_labeling], labels, lm)
+                    apply_lm(curr.entries[labeling], curr.entries[new_labeling], chars, lm)
 
         # set new beam state
         last = curr
@@ -140,6 +140,6 @@ def beam_search(mat: np.ndarray, labels: str, beam_width: int = 25, lm: Optional
     # sort by probability
     best_labeling = last.sort()[0]  # get most probable labeling
 
-    # map labels to chars
-    res = ''.join([labels[l] for l in best_labeling])
+    # map label string to char string
+    res = ''.join([chars[l] for l in best_labeling])
     return res
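
A note on the `lm_factor = 0.01` in `apply_lm`: raising the bigram probability to a small power compresses it towards 1, so the language model only nudges `pr_text` instead of dominating the optical score. A quick numeric sketch with illustrative values:

````python
lm_factor = 0.01  # influence of language model, as in apply_lm above
for bigram_prob in (0.5, 0.1, 0.01):
    print(f'{bigram_prob} ** {lm_factor} = {bigram_prob ** lm_factor:.4f}')
# 0.5 ** 0.01 = 0.9931
# 0.1 ** 0.01 = 0.9772
# 0.01 ** 0.01 = 0.9550
````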

ctc_decoder/best_path.py

+4 −4
@@ -3,15 +3,15 @@
 import numpy as np
 
 
-def best_path(mat: np.ndarray, labels: str) -> str:
+def best_path(mat: np.ndarray, chars: str) -> str:
     """Best path (greedy) decoder.
 
     Take best-scoring character per time-step, then remove repeated characters and CTC blank characters.
     See dissertation of Graves, p63.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The decoded text.
@@ -21,7 +21,7 @@ def best_path(mat: np.ndarray, labels: str) -> str:
     best_path_indices = np.argmax(mat, axis=1)
 
     # collapse best path (using itertools.groupby), map to chars, join char list to string
-    blank_idx = len(labels)
-    best_chars_collapsed = [labels[k] for k, _ in groupby(best_path_indices) if k != blank_idx]
+    blank_idx = len(chars)
+    best_chars_collapsed = [chars[k] for k, _ in groupby(best_path_indices) if k != blank_idx]
     res = ''.join(best_chars_collapsed)
     return res
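
The collapse step carries all the semantics of best path decoding; a minimal sketch with `chars = 'ab'`, so the blank index is 2:

````python
from itertools import groupby

chars = 'ab'
blank_idx = len(chars)

def collapse(indices):
    """Merge repeats, then drop blanks, as in best_path above."""
    return ''.join(chars[k] for k, _ in groupby(indices) if k != blank_idx)

print(collapse([0, 0, 2, 1]))  # -> 'ab' (repeated 'a' merges)
print(collapse([0, 2, 0]))     # -> 'aa' (blank keeps the two a's apart)
print(collapse([2, 2, 2]))     # -> ''   (all blanks decode to empty text)
````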

ctc_decoder/common.py

+4 −4
@@ -1,13 +1,13 @@
 def extend_by_blanks(seq, b):
-    "extends a label seq. by adding blanks at the beginning, end and in between each label"
+    """Extend a label seq. by adding blanks at the beginning, end and in between each label."""
     res = [b]
     for s in seq:
         res.append(s)
         res.append(b)
     return res
 
 
-def word_to_label_seq(w, labels):
-    "map a word to a sequence of labels (indices)"
-    res = [labels.index(c) for c in w]
+def word_to_label_seq(w, chars):
+    """Map a word (string of characters) to a sequence of labels (indices)."""
+    res = [chars.index(c) for c in w]
     return res
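
A small sketch of what these two helpers produce for the word 'for', assuming `chars` is the lowercase alphabet so that the blank index is 26 (matching the `_f_o_r_` pattern referenced in `token_passing.py` below):

````python
chars = 'abcdefghijklmnopqrstuvwxyz'
blank = len(chars)  # 26

seq = [chars.index(c) for c in 'for']  # word_to_label_seq -> [5, 14, 17]

extended = [blank]                     # extend_by_blanks
for s in seq:
    extended.append(s)
    extended.append(blank)
print(extended)  # -> [26, 5, 26, 14, 26, 17, 26], the _f_o_r_ pattern
````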

ctc_decoder/language_model.py

+4 −4
@@ -1,16 +1,16 @@
 class LanguageModel:
     "Simple character-level language model."
 
-    def __init__(self, txt: str, labels: str) -> None:
+    def __init__(self, txt: str, chars: str) -> None:
         """Create language model from text corpus."""
         txt = ' ' + txt + ' '  # ensure first/last characters appear next to whitespace
-        self._init_char_bigrams(txt, labels)
+        self._init_char_bigrams(txt, chars)
 
-    def _init_char_bigrams(self, txt, labels):
+    def _init_char_bigrams(self, txt: str, chars: str) -> None:
         """Initialize table of character bigrams."""
 
         # init bigrams with 0 values
-        self.bigram = {c: {d: 0 for d in labels} for c in labels}
+        self.bigram = {c: {d: 0 for d in chars} for c in chars}
 
         # go through text and add each char bigram
         for i in range(len(txt) - 1):
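
For intuition, a minimal sketch of the counting this method performs; the dict layout mirrors `self.bigram` above and the whitespace padding matches `__init__`. How the raw counts become the probabilities returned by `get_char_bigram` is not shown in this diff, so only the counting is sketched:

````python
txt = ' this is some text '  # corpus padded with whitespace, as in __init__
chars = 'abcdefghijklmnopqrstuvwxyz '

bigram = {c: {d: 0 for d in chars} for c in chars}  # init with 0 values
for c1, c2 in zip(txt, txt[1:]):                    # each adjacent char pair
    bigram[c1][c2] += 1

print(bigram['t']['h'])  # -> 1 ('th' occurs once in the corpus)
print(bigram['i']['s'])  # -> 2 ('is' occurs twice)
````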

ctc_decoder/lexicon_search.py

+4 −4
@@ -5,7 +5,7 @@
 from ctc_decoder.loss import probability
 
 
-def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int) -> str:
+def lexicon_search(mat: np.ndarray, chars: str, bk_tree: BKTree, tolerance: int) -> str:
     """Lexicon search decoder.
 
     The algorithm computes a first approximation using best path decoding. Similar words are queried using the BK tree.
@@ -14,7 +14,7 @@ def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
         bk_tree: Instance of BKTree which is used to query similar words.
         tolerance: Words to be considered, which are within specified edit distance.
 
@@ -23,7 +23,7 @@ def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int
     """
 
     # use best path decoding to get an approximation
-    approx = best_path(mat, labels)
+    approx = best_path(mat, chars)
 
     # get similar words from dictionary within given tolerance
     words = bk_tree.query(approx, tolerance)
@@ -33,6 +33,6 @@ def lexicon_search(mat: np.ndarray, labels: str, bk_tree: BKTree, tolerance: int
         return ''
 
     # else compute probabilities of all similar words and return best scoring one
-    word_probs = [(w, probability(mat, w, labels)) for w in words]
+    word_probs = [(w, probability(mat, w, chars)) for w in words]
     word_probs.sort(key=lambda x: x[1], reverse=True)
     return word_probs[0][0]

ctc_decoder/loss.py

+7 −7
@@ -51,23 +51,23 @@ def empty_cache(max_T, labeling_with_blanks):
     return [[None for _ in range(len(labeling_with_blanks))] for _ in range(max_T)]
 
 
-def probability(mat: np.ndarray, gt: str, labels: str) -> float:
+def probability(mat: np.ndarray, gt: str, chars: str) -> float:
     """Compute probability of ground truth text gt given neural network output mat.
 
     See the CTC Forward-Backward Algorithm in Graves paper.
 
     Args:
         mat: Output of neural network of shape TxC.
         gt: Ground truth text.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The probability of the text given the neural network output.
     """
 
     max_T, _ = mat.shape  # size of input matrix
-    blank = len(labels)  # index of blank label
-    labeling_with_blanks = common.extend_by_blanks(common.word_to_label_seq(gt, labels), blank)
+    blank = len(chars)  # index of blank label
+    labeling_with_blanks = common.extend_by_blanks(common.word_to_label_seq(gt, chars), blank)
     cache = empty_cache(max_T, labeling_with_blanks)
 
     p1 = recursive_probability(max_T - 1, len(labeling_with_blanks) - 1, mat, labeling_with_blanks, blank, cache)
@@ -76,21 +76,21 @@ def probability(mat: np.ndarray, gt: str, labels: str) -> float:
     return p
 
 
-def loss(mat: np.ndarray, gt: str, labels: str) -> float:
+def loss(mat: np.ndarray, gt: str, chars: str) -> float:
     """Compute loss of ground truth text gt given neural network output mat.
 
     See the CTC Forward-Backward Algorithm in Graves paper.
 
     Args:
         mat: Output of neural network of shape TxC.
         gt: Ground truth text.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The probability of the text given the neural network output.
     """
 
     try:
-        return -math.log(probability(mat, gt, labels))
+        return -math.log(probability(mat, gt, chars))
     except ValueError:
         return float('inf')
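
Tying this back to the README example: for `gt = 'a'` and the 2x3 matrix from above, the forward-backward sum runs over the three alignments that collapse to 'a', and the loss is the negative log of that sum (a probability of 0 would raise `ValueError` in `math.log`, hence the `float('inf')` fallback). A hand computation:

````python
import math

# alignments of 'a' over two time-steps: 'aa', 'a-', '-a'
p = 0.4 * 0.4 + 0.4 * 0.6 + 0.6 * 0.4
print(p)             # -> 0.64
print(-math.log(p))  # -> 0.446..., the CTC loss of 'a' for this matrix
````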

ctc_decoder/prefix_search.py

+10 −10
@@ -1,20 +1,20 @@
 import numpy as np
 
 
-def prefix_search(mat: np.ndarray, labels: str) -> str:
+def prefix_search(mat: np.ndarray, chars: str) -> str:
     """Prefix search decoding.
 
     See dissertation of Graves, p63-66.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, max_C = mat.shape
 
     # g_n and g_b: gamma in paper
@@ -41,17 +41,17 @@ def prefix_search(mat: np.ndarray, labels: str) -> str:
     while prob_ext[y_star] > prob[l_star]:
         prob_remaining = prob_ext[y_star]
 
-        # for all labels
+        # for all chars
         for k in range(max_C - 1):
-            y = y_star + labels[k]
+            y = y_star + chars[k]
             g_n[0][y] = mat[0, k] if len(y_star) == 0 else 0
             g_b[0][y] = 0
             prefix_prob = g_n[0][y]
 
             # for all time steps
             for t in range(1, max_T):
                 new_label_prob = g_b[t - 1][y_star] + (
-                    0 if y_star != '' and y_star[-1] == labels[k] else g_n[t - 1][y_star])
+                    0 if y_star != '' and y_star[-1] == chars[k] else g_n[t - 1][y_star])
                 g_n[t][y] = mat[t, k] * (new_label_prob + g_n[t - 1][y])
                 g_b[t][y] = mat[t, blank_idx] * (g_b[t - 1][y] + g_n[t - 1][y])
                 prefix_prob += mat[t, k] * new_label_prob
@@ -87,20 +87,20 @@ def prefix_search(mat: np.ndarray, labels: str) -> str:
     return l_star
 
 
-def prefix_search_heuristic_split(mat: np.ndarray, labels: str) -> str:
+def prefix_search_heuristic_split(mat: np.ndarray, chars: str) -> str:
     """Prefix search decoding with heuristic to speed up the algorithm.
 
     Speed up prefix computation by splitting sequence into subsequences as described by Graves (p66).
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, _ = mat.shape
 
     # split sequence into 3 subsequences, splitting points should be roughly placed at 1/3 and 2/3
@@ -124,6 +124,6 @@ def prefix_search_heuristic_split(mat: np.ndarray, labels: str) -> str:
     for i in range(len(ranges) - 1):
         beg = ranges[i]
         end = ranges[i + 1]
-        res += prefix_search(mat[beg: end, :], labels)
+        res += prefix_search(mat[beg: end, :], chars)
 
     return res
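
For completeness, a hedged usage sketch: on the tiny README matrix, prefix search should agree with beam search, since it expands the most promising labelings first and P('a') = 0.64 beats the 0.36 of the empty labeling. This assumes `prefix_search` is importable from the package top level like the decoders shown in the README:

````python
import numpy as np
from ctc_decoder import prefix_search

mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])
print(prefix_search(mat, 'ab'))  # expected: 'a'
````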

ctc_decoder/token_passing.py

+4 −4
@@ -55,29 +55,29 @@ def log(val):
         return float('-inf')
 
 
-def token_passing(mat: np.ndarray, labels: str, words: List[str]) -> str:
+def token_passing(mat: np.ndarray, chars: str, words: List[str]) -> str:
     """Token passing algorithm.
 
     See dissertation of Graves, p67-69.
 
     Args:
         mat: Output of neural network of shape TxC.
-        labels: The set of characters the neural network can recognize, excluding the CTC-blank.
+        chars: The set of characters the neural network can recognize, excluding the CTC-blank.
         words: List of words that can be recognized.
 
     Returns:
         The decoded text.
     """
 
-    blank_idx = len(labels)
+    blank_idx = len(chars)
     max_T, _ = mat.shape
 
     # special s index for beginning and end of word
     beg = 0
     end = -1
 
     # map characters to labels for each word
-    label_words = [common.word_to_label_seq(w, labels) for w in words]
+    label_words = [common.word_to_label_seq(w, chars) for w in words]
 
     # w' in paper: word with blanks in front, back and between labels: for -> _f_o_r_
     prime_words = [common.extend_by_blanks(w, blank_idx) for w in label_words]
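
A note on the `log` helper shown above (it maps probability 0 to `float('-inf')`): token passing scores whole word sequences in the log domain because products of many per-frame probabilities underflow. A quick numeric sketch:

````python
import math

p = 1e-3                   # a typical small per-frame probability
print(p ** 200)            # -> 0.0, the product of 200 such factors underflows
print(200 * math.log(p))   # -> -1381.55..., the log-domain sum stays representable
````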
