Skip to content

Commit cb1a4e3

Browse files
committed
several changes and improvements
1 parent 006ac65 commit cb1a4e3

File tree

8 files changed

+155
-118
lines changed

8 files changed

+155
-118
lines changed

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@ jobs:
1919
run: pip install flake8
2020

2121
- name: Run flake8
22-
run: flake8 vosk_cli
22+
run: flake8 scripts

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
build/
22
dist/
3-
vosk_cli.egg-info/
3+
scripts.egg-info/
4+
.idea/

LICENSE

100755100644
File mode changed.

MANIFEST.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
recursive-include vosk_cli/language_packs *
1+
recursive-include scripts/language_packs *

scripts/transcribe.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python3
2+
3+
from vosk import Model, KaldiRecognizer, SetLogLevel
4+
from webvtt import WebVTT, Caption
5+
from argparse import ArgumentParser
6+
import os
7+
import subprocess
8+
import json
9+
10+
# Base directory of the installed language models; transcribe() appends a
# language code to this prefix to build the model directory path.
language_dir_path = "/usr/share/vosk/language/"
11+
# Language code transcribe() falls back to when no model directory exists
# for the requested language.
default_language = "eng"
12+
# NOTE(review): runs at import time; presumably a negative level silences
# Vosk's informational log output -- confirm against the Vosk API docs.
SetLogLevel(-1)
13+
14+
# Character budget (words plus separating spaces) that write_webvtt_captions()
# uses to decide when to start a new caption line.
MAX_CHARS_PER_LINE = 35
15+
# Number of lines write_webvtt_captions() collects into one caption before
# writing it out.
MAX_LINES_IN_PARAGRAPH = 2
16+
17+
18+
def time_string(seconds):
    """Format a time offset in seconds as an ``H:MM:SS.mmm`` timestamp.

    The value is rounded to whole milliseconds *before* it is split into
    hours, minutes and seconds. Rounding only while formatting the seconds
    field would let a value such as 59.9996 render as ``0:00:60.000``
    instead of carrying over to ``0:01:00.000``.

    :param seconds: non-negative offset in seconds (int or float).
    :return: timestamp string with unpadded hours, two-digit minutes,
             two-digit seconds and three-digit milliseconds.
    """
    total_millis = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%i:%02i:%02i.%03i' % (hours, minutes, secs, millis)
24+
25+
26+
def write_captions_paragraph(vtt, paragraph):
    """Append one caption built from ``paragraph`` to ``vtt.captions``.

    ``paragraph`` is a non-empty list of lines, each line being a non-empty
    list of word entries (dicts read through the keys 'word', 'start' and
    'end'). The caption spans from the 'start' of the very first word to
    the 'end' of the very last word; its text has one row per line with the
    words of a line separated by single spaces.

    :param vtt: object exposing a ``captions`` list to append to.
    :param paragraph: list of lines (lists of word entries).
    """
    begin = time_string(paragraph[0][0]['start'])
    finish = time_string(paragraph[-1][-1]['end'])
    # One text row per line, rows separated by newlines (no trailing one).
    text = '\n'.join(
        ' '.join([item['word'] for item in row]) for row in paragraph
    )
    vtt.captions.append(Caption(begin, finish, text))
36+
37+
38+
def write_webvtt_captions(rec_results):
    """Build a WebVTT object from a sequence of recognizer results.

    Each element of ``rec_results`` is a JSON string. Its optional 'result'
    key holds a list of word entries (dicts with at least 'word', 'start'
    and 'end'); results without words are skipped.

    The words are laid out in two steps:

    1. Greedy line breaking: a word is added to the current line unless the
       line would then exceed ``MAX_CHARS_PER_LINE`` characters (counting
       one separating space after every word already on the line). A word
       that is longer than the limit still gets a line of its own.
    2. Consecutive lines are grouped into captions of at most
       ``MAX_LINES_IN_PARAGRAPH`` lines and written in order.

    Every line is charged for its separating spaces the same way, so no
    line can overshoot the limit depending on where a caption boundary
    happens to fall.

    :param rec_results: iterable of JSON strings produced by the recognizer.
    :return: the populated ``WebVTT`` object.
    """
    lines = []
    line = []
    char_count = 0
    for rec_result in rec_results:
        # 'result' is missing or empty when nothing was recognized.
        for entry in json.loads(rec_result).get('result') or []:
            word_length = len(entry['word'])
            if line and char_count + word_length > MAX_CHARS_PER_LINE:
                lines.append(line)
                line = []
                char_count = 0
            line.append(entry)
            char_count += word_length + 1  # add 1 because of whitespace
    if line:
        lines.append(line)  # remaining words form the last line

    vtt = WebVTT()
    for first in range(0, len(lines), MAX_LINES_IN_PARAGRAPH):
        paragraph = lines[first:first + MAX_LINES_IN_PARAGRAPH]
        write_captions_paragraph(vtt, paragraph)
    return vtt
78+
79+
80+
def transcribe(inputFile, outputFile, language):
    """Transcribe the audio track of a media file and save it as WebVTT.

    The model directory is ``language_dir_path + language``; when it does
    not exist the directory of ``default_language`` is used instead. If
    that is missing too, the process exits with status 1.

    The audio is decoded by an ``ffmpeg`` subprocess that is asked for
    mono, signed 16-bit little-endian raw samples at ``sample_rate`` on its
    stdout. The stream is fed to the recognizer in chunks and the collected
    results are turned into captions by ``write_webvtt_captions``.

    :param inputFile: path to the media file to transcribe.
    :param outputFile: path the WebVTT file is written to.
    :param language: language code selecting the model directory.
    :raises SystemExit: when neither the requested nor the default model
        directory exists.
    """
    default_language_dir_path = language_dir_path + default_language
    chosen_language_dir_path = language_dir_path + language

    # checks if there is a model directory with a language code as the name
    if os.path.exists(chosen_language_dir_path):
        chosen_model = chosen_language_dir_path
    else:
        print('Did not find language model directory "%s".' % chosen_language_dir_path)
        print('Using default language model directory "%s".' % default_language_dir_path)
        if not os.path.exists(default_language_dir_path):
            print('Did not find default model directory "%s".' % default_language_dir_path)
            # The ``exit`` builtin is injected by the ``site`` module and is
            # not guaranteed to exist; raise SystemExit directly instead.
            raise SystemExit(1)
        chosen_model = default_language_dir_path

    print('Start transcribing...')
    sample_rate = 16000
    model = Model(chosen_model)
    rec = KaldiRecognizer(model, sample_rate)
    rec.SetWords(True)

    command = ['ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i', inputFile,
               '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-']

    rec_results = []
    # The context manager closes the pipe and waits for ffmpeg to exit, so
    # no file descriptor or zombie process is left behind.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        while True:
            data = process.stdout.read(4000)
            if not data:
                break
            if rec.AcceptWaveform(data):
                rec_results.append(rec.Result())

    # ffmpeg runs with '-loglevel quiet', so a failure would otherwise go
    # completely unnoticed and just yield an empty or truncated transcript.
    if process.returncode != 0:
        print('Warning: ffmpeg exited with code %i; the transcript may be '
              'empty or incomplete.' % process.returncode)

    rec_results.append(rec.FinalResult())
    vtt = write_webvtt_captions(rec_results)

    # save webvtt
    print('Finished transcribing. Saving WebVTT file...')
    vtt.save(outputFile)
    print('WebVTT saved.')
123+
124+
125+
def main():
    """Command line entry point: parse the arguments and run transcribe().

    Expects three required options: ``-i`` (input media file), ``-o``
    (output WebVTT path) and ``-l`` (language code of the model to use).
    """
    parser = ArgumentParser(description='Creates a WebVTT file out of a '
                            'media file with an audio track.')
    parser.add_argument('-i', type=str, dest='inputFile', required=True,
                        help='Path to the media file to be transcribed.')
    parser.add_argument('-o', type=str, dest='outputFile', required=True,
                        help='The path to the output file.')
    parser.add_argument('-l', type=str, dest='language', required=True,
                        help='The language code. It determines which model '
                        'will be used to transcribe the media file.')
    args = parser.parse_args()

    transcribe(args.inputFile, args.outputFile, args.language)


# The file carries a shebang, so make direct execution actually do something;
# importing the module (e.g. via the console_scripts entry point) is unaffected.
if __name__ == '__main__':
    main()

setup.cfg

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[bdist_rpm]
2+
build_requires = python3-devel python3-pip python3-setuptools
3+
requires = python3-webvtt-py python3-vosk vosk-cli-lang-eng ffmpeg

setup.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,18 @@
22

33
description_text = 'This project serves as an Vosk interface for Opencast. It allows to generate subtitles (WebVVT files) from video and audio sources via Vosk.'
44

5-
setup(name='vosk_cli',
5+
setup(name='vosk-cli',
66
version='0.1',
77
description=description_text,
88
long_description=description_text,
99
url='https://github.com/elan-ev/vosk-cli',
1010
author='Martin Wygas',
1111
author_email='[email protected]',
1212
license='Apache-2.0',
13-
packages=['vosk_cli'],
14-
license_files = ('LICENSE'),
15-
install_requires=[
16-
'vosk>=0.3.30',
17-
'webvtt-py>=0.4.6'
18-
],
19-
include_package_data = True,
13+
packages=['scripts'],
14+
license_files=('LICENSE'),
15+
include_package_data=True,
2016
zip_safe=False,
21-
entry_points = {
22-
'console_scripts': ['vosk-cli=vosk_cli.transcribe:main'],
23-
}
24-
)
17+
entry_points={
18+
'console_scripts': ['vosk-cli=scripts.transcribe:main'],
19+
})

vosk_cli/transcribe.py

Lines changed: 0 additions & 103 deletions
This file was deleted.

0 commit comments

Comments
 (0)