several changes and improvements

marwyg · marwyg · commit cb1a4e321c25 · 2021-10-13T12:46:47.000+02:00
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -19,4 +19,4 @@ jobs:
         run: pip install flake8
 
       - name: Run flake8
-        run: flake8 vosk_cli
+        run: flake8 scripts
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 build/
 dist/
-vosk_cli.egg-info/
+scripts.egg-info/
+.idea/
diff --git a/LICENSE b/LICENSE
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +1 @@
-recursive-include vosk_cli/language_packs *
+recursive-include scripts/language_packs *
diff --git a/scripts/transcribe.py b/scripts/transcribe.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+from vosk import Model, KaldiRecognizer, SetLogLevel
+from webvtt import WebVTT, Caption
+from argparse import ArgumentParser
+import os
+import subprocess
+import json
+
+language_dir_path = "/usr/share/vosk/language/"  # model dirs, one per language
+default_language = "eng"  # fallback when the requested language has no model
+SetLogLevel(-1)  # presumably silences Vosk's own log output -- TODO confirm
+
+MAX_CHARS_PER_LINE = 35  # soft limit used when wrapping caption lines
+MAX_LINES_IN_PARAGRAPH = 2  # lines grouped into one WebVTT caption
+
+
+def time_string(seconds):
+    """Return *seconds* as 'H:MM:SS.mmm', rounded to whole milliseconds."""
+    # round first: formatting the raw remainder turns 59.9996 into ":60.000"
+    minutes, millis = divmod(int(round(seconds * 1000)), 60000)
+    hours, minutes = divmod(minutes, 60)
+    return '%i:%02i:%06.3f' % (hours, minutes, millis / 1000)
+
+
+def write_captions_paragraph(vtt, paragraph):
+    """Append one caption built from *paragraph* to ``vtt.captions``.
+
+    *paragraph* is a list of lines, each a list of word dicts with 'word',
+    'start' and 'end' keys; the cue spans first word start to last word end.
+    """
+    first_word, last_word = paragraph[0][0], paragraph[-1][-1]
+    text = '\n'.join(' '.join(w['word'] for w in words) for words in paragraph)
+    vtt.captions.append(Caption(time_string(first_word['start']),
+                                time_string(last_word['end']), text))
+
+
+def write_webvtt_captions(rec_results):
+    """Build a WebVTT object from a list of Vosk result JSON strings.
+
+    Words are wrapped into lines of at most MAX_CHARS_PER_LINE characters
+    (a single longer word still gets a line of its own) and the lines are
+    grouped into captions of at most MAX_LINES_IN_PARAGRAPH lines. Results
+    without a 'result' word list are skipped. Returns the filled WebVTT
+    object.
+    """
+    vtt = WebVTT()
+    line = []
+    paragraph = []
+    char_count = 0  # length of `line` plus one trailing space per word
+    for rec_result in rec_results:
+        for entry in json.loads(rec_result).get('result') or []:
+            word_len = len(entry['word'])
+            if line and char_count + word_len > MAX_CHARS_PER_LINE:
+                # the line is full: move it to the paragraph, flushing the
+                # paragraph first when it already holds the maximum of lines
+                if len(paragraph) == MAX_LINES_IN_PARAGRAPH:
+                    write_captions_paragraph(vtt, paragraph)
+                    paragraph = []
+                paragraph.append(line)
+                line = []
+                char_count = 0
+            line.append(entry)
+            # count the separating space for every word, including the first
+            # word of a wrapped line; otherwise such lines could exceed the
+            # limit by one character
+            char_count += word_len + 1
+
+    # write the remaining words into the captions file
+    if paragraph:
+        if len(paragraph) < MAX_LINES_IN_PARAGRAPH:
+            paragraph.append(line)  # the last line still fits this caption
+            line = []
+        write_captions_paragraph(vtt, paragraph)
+    if line:
+        write_captions_paragraph(vtt, [line])
+    return vtt
+
+
+def transcribe(inputFile, outputFile, language):
+    """Transcribe the audio track of a media file into a WebVTT file.
+
+    ffmpeg decodes inputFile to 16 kHz mono 16-bit PCM, which is recognised
+    with the Vosk model in language_dir_path + language; the captions are
+    saved to outputFile. A missing model directory falls back to the
+    default_language model; if that is missing too, exit with status 1.
+    """
+    default_language_dir_path = language_dir_path + default_language
+    chosen_model = language_dir_path + language
+
+    # checks if there is a model directory with a language code as the name
+    if not os.path.exists(chosen_model):
+        print('Did not find language model directory "%s".' % chosen_model)
+        print('Using default language model directory "%s".' % default_language_dir_path)
+        if not os.path.exists(default_language_dir_path):
+            print('Did not found default model directory "%s".' % default_language_dir_path)
+            raise SystemExit(1)  # exit() builtin is only provided by `site`
+        chosen_model = default_language_dir_path
+
+    print('Start transcribing...')
+    sample_rate = 16000
+    rec = KaldiRecognizer(Model(chosen_model), sample_rate)
+    rec.SetWords(True)
+
+    command = ['ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i', inputFile,
+               '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-']
+    rec_results = []
+    # the context manager closes the pipe and reaps ffmpeg, even on errors
+    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
+        while True:
+            data = process.stdout.read(4000)
+            if len(data) == 0:
+                break
+            if rec.AcceptWaveform(data):
+                rec_results.append(rec.Result())
+
+    rec_results.append(rec.FinalResult())
+    vtt = write_webvtt_captions(rec_results)
+
+    print('Finished transcribing. Saving WebVTT file...')
+    vtt.save(outputFile)
+    print('WebVTT saved.')
+
+
+def main():
+    """Command line entry point.
+
+    Parses the required -i, -o and -l options and passes them to transcribe().
+    """
+    cli = ArgumentParser(
+        description='Creates a WebVTT file out of a media file with an '
+                    'audio track.')
+    cli.add_argument('-i', type=str, dest='inputFile', required=True,
+                     help='Path to the media file to transcribed.')
+    cli.add_argument('-o', type=str, dest='outputFile', required=True,
+                     help='The path to the output file.')
+    cli.add_argument('-l', type=str, dest='language', required=True,
+                     help='The language code. It determines which model '
+                          'will be used to transcribe the media file.')
+    options = cli.parse_args()
+    transcribe(options.inputFile, options.outputFile, options.language)
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,3 @@
+[bdist_rpm]
+build_requires = python3-devel python3-pip python3-setuptools
+requires = python3-webvtt-py python3-vosk vosk-cli-lang-eng ffmpeg
diff --git a/setup.py b/setup.py
@@ -2,23 +2,18 @@
 
 description_text = 'This project serves as an Vosk interface for Opencast. It allows to generate subtitles (WebVVT files) from video and audio sources via Vosk.'
 
-setup(name='vosk_cli',
+setup(name='vosk-cli',
       version='0.1',
       description=description_text,
       long_description=description_text,
       url='https://github.com/elan-ev/vosk-cli',
       author='Martin Wygas',
       author_email='mwygas@uos.de',
       license='Apache-2.0',
-      packages=['vosk_cli'],
-      license_files = ('LICENSE'),
-      install_requires=[
-		    'vosk>=0.3.30',
-		    'webvtt-py>=0.4.6'
-      ],
-      include_package_data = True,
+      packages=['scripts'],
+      license_files=('LICENSE'),
+      include_package_data=True,
       zip_safe=False,
-      entry_points = {
-        'console_scripts': ['vosk-cli=vosk_cli.transcribe:main'],
-    }
-)
+      entry_points={
+          'console_scripts': ['vosk-cli=scripts.transcribe:main'],
+      })
diff --git a/vosk_cli/transcribe.py b/vosk_cli/transcribe.py

-Original file line number
+Diff line change
@@ @@ -1,3 +1,4 @@ @@
 build/
 dist/
 -vosk_cli.egg-info/
 +scripts.egg-info/
 +.idea/
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-recursive-include vosk_cli/language_packs *`
	`1`	`+recursive-include scripts/language_packs *`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[bdist_rpm]`
	`2`	`+build_requires = python3-devel python3-pip python3-setuptools`
	`3`	`+requires = python3-webvtt-py python3-vosk vosk-cli-lang-eng ffmpeg`