
Commit a59bdf9

Merge pull request #45 from Huanshere/dev_v7
update language support
2 parents 1f9e302 + c18aa8b commit a59bdf9

File tree: 10 files changed, +122 −135 lines


README.md

Lines changed: 11 additions & 8 deletions

@@ -63,17 +63,20 @@
 - Audio length: currently only videos under 30 minutes are supported; we plan to lift this limit soon.
 
-- Input language support (whisperX produces unstable timestamps and punctuation for some languages)
+- Input language support:
 
 | Input language | Support level | Demo video |
 |---------|---------|---------|
-| English | 🤩 | [EN→ZH demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
-| Russian | 😊 | [RU→ZH demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
-| French | 🤩 | [FR→JA demo](https://github.com/user-attachments/assets/3ce068c7-9854-4c72-ae77-f2484c7c6630) |
-| German | ❓ (not yet tested) | |
-| Spanish | ❓ (not yet tested) | |
-| Japanese | 😖 | |
-| Chinese | 😖 | |
+| 🇬🇧🇺🇸 English | 🤩 | [EN→ZH demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
+| 🇷🇺 Russian | 😊 | [RU→ZH demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
+| 🇫🇷 French | 🤩 | [FR→JA demo](https://github.com/user-attachments/assets/3ce068c7-9854-4c72-ae77-f2484c7c6630) |
+| 🇩🇪 German | 🤩 | [DE→ZH demo](https://github.com/user-attachments/assets/07cb9d21-069e-4725-871d-c4d9701287a3) |
+| 🇮🇹 Italian | 🤩 | [IT→ZH demo](https://github.com/user-attachments/assets/f1f893eb-dad3-4460-aaf6-10cac999195e) |
+| 🇪🇸 Spanish | 🤩 | [ES→ZH demo](https://github.com/user-attachments/assets/c1d28f1c-83d2-4f13-a1a1-859bd6cc3553) |
+| 🇯🇵 Japanese | 😐 | [JA→ZH demo](https://github.com/user-attachments/assets/856c3398-2da3-4e25-9c36-27ca2d1f68c2) |
+| 🇨🇳 Chinese | 😖 | |
+
+😖 whisper struggles to produce punctuation when recognizing word-level timestamps for Chinese.
 
 - Output language support: VideoLingo supports translating into all languages Claude knows

config.example.py

Lines changed: 7 additions & 8 deletions

@@ -7,12 +7,12 @@
 # For best results, please use claude-3-5-sonnet-20240620. Recommended budget channel: https://api2.wlai.vip/register?aff=TXMB.
 # For best results, please use claude-3-5-sonnet-20240620.
 API_KEY = 'sk-xxx'
-BASE_URL = 'https://api2.wlai.vip'
+BASE_URL = 'https://api.wlai.vip'
 MODEL = ['claude-3-5-sonnet-20240620']
 
 # Replicate API settings
 # Replicate API settings for using whisperX
-REPLICATE_API_TOKEN = "r8_xxx"
+REPLICATE_API_TOKEN = 'r8_xxx'
 
 # Language settings, described in natural language
 # Language settings, described in natural language

@@ -96,15 +96,14 @@
     "en": "en_core_web_md",
     "ru": "ru_core_news_md",
     "fr": "fr_core_news_md",
-
-    # "es": "es_core_news_md",
-    # "de": "de_core_news_md",
-    # "it": "it_core_news_md",
+    "ja": "ja_core_news_md",
+    "es": "es_core_news_md",
+    "de": "de_core_news_md",
+    "it": "it_core_news_md",
 
-
     # Not supported
     # "zh": "zh_core_web_md",
-    # "ja": "ja_core_news_md",
+
 }
 
 # Languages that use spaces to separate words
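The whisper modules import a `get_joiner` helper from config alongside this model map. As a minimal sketch of what such a helper might look like (the language list and implementation here are assumptions; the real helper lives in `config.py` and may differ):

```python
# Hypothetical sketch of get_joiner; not the actual code from config.py.
SPLIT_WITH_SPACE_LANGS = ['en', 'es', 'fr', 'de', 'it', 'ru']  # assumed list

def get_joiner(language: str) -> str:
    # Space-delimited languages join words with ' '; CJK languages join with ''.
    return ' ' if language in SPLIT_WITH_SPACE_LANGS else ''

assert get_joiner('en') == ' '
assert get_joiner('ja') == ''
```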

core/all_whisper_methods/whisperX.py

Lines changed: 1 addition & 62 deletions

@@ -5,38 +5,10 @@
 import pandas as pd
 import json
 from typing import Dict
-import subprocess
-import base64
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
 from config import MODEL_DIR
-
-def convert_video_to_audio(input_file: str) -> str:
-    os.makedirs('output/audio', exist_ok=True)
-    audio_file = 'output/audio/raw_full_audio.wav'
-
-    if not os.path.exists(audio_file):
-        ffmpeg_cmd = [
-            'ffmpeg',
-            '-i', input_file,
-            '-vn',
-            '-acodec', 'libmp3lame',
-            '-ar', '16000',
-            '-b:a', '64k',
-            audio_file
-        ]
-        print(f"🎬➡️🎵 Converting to audio......")
-        subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
-        print(f"🎬➡️🎵 Converted <{input_file}> to <{audio_file}>\n")
-
-    return audio_file
-
-def encode_file_to_base64(file_path: str) -> str:
-    print("🔄 Encoding audio file to base64...")
-    with open(file_path, 'rb') as file:
-        encoded = base64.b64encode(file.read()).decode('utf-8')
-    print("✅ File successfully encoded to base64")
-    return encoded
+from core.all_whisper_methods.whisperXapi import process_transcription, convert_video_to_audio
 
 def transcribe_audio(audio_file: str) -> Dict:
     from config import WHISPER_LANGUAGE

@@ -69,39 +41,6 @@ def transcribe_audio(audio_file: str) -> Dict:
     except Exception as e:
         raise Exception(f"WhisperX processing error: {e}")
 
-def process_transcription(result: Dict) -> pd.DataFrame:
-    from config import get_joiner, WHISPER_LANGUAGE
-    language = result['language'] if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE  # consider force english case
-    joiner = get_joiner(language)
-
-    all_words = []
-    for segment in result['segments']:
-        for word in segment['words']:
-            # ! For French, we need to convert guillemets to empty strings
-            word["word"] = word["word"].replace('»', '').replace('«', '')
-
-            if 'start' not in word and 'end' not in word:
-                if all_words:
-                    # Merge with the previous word
-                    all_words[-1]['text'] = f'{all_words[-1]["text"]}{joiner}{word["word"]}'
-                else:
-                    # If it's the first word, temporarily save it and wait for the next word with a timestamp
-                    temp_word = word["word"]
-            else:
-                # Normal case, with start and end times
-                word_dict = {
-                    'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
-                    'start': word.get('start', all_words[-1]['end'] if all_words else 0),
-                    'end': word['end'],
-                    'score': word.get('score', 0)
-                }
-
-                all_words.append(word_dict)
-                if 'temp_word' in locals():
-                    del temp_word
-
-    return pd.DataFrame(all_words)
-
 def save_results(df: pd.DataFrame):
     os.makedirs('output/log', exist_ok=True)
     excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
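With the duplicate helpers removed, both whisper backends now import the same implementations from whisperXapi. A hedged sketch of the resulting call sequence (the input path and the commented steps are assumptions, not code from this commit):

```python
# Assumed usage after the refactor; 'input/video.mp4' is an invented path.
from core.all_whisper_methods.whisperXapi import (
    convert_video_to_audio,
    process_transcription,
)

audio_file = convert_video_to_audio('input/video.mp4')
# result = transcribe_audio(audio_file)  # backend-specific: local whisperX or Replicate API
# df = process_transcription(result)     # shared word-level post-processing
# save_results(df)
```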

core/all_whisper_methods/whisperXapi.py

Lines changed: 19 additions & 12 deletions

@@ -67,10 +67,6 @@ def transcribe_audio(audio_base64: str) -> Dict:
         raise Exception(f"Error accessing whisperX API: {e} Please check your Replicate API key and internet connection.\n")
 
 def process_transcription(result: Dict) -> pd.DataFrame:
-    from config import get_joiner, WHISPER_LANGUAGE
-    language = result['detected_language'] if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE  # consider force english case
-    joiner = get_joiner(language)
-
     all_words = []
     for segment in result['segments']:
         for word in segment['words']:

@@ -79,23 +75,34 @@ def process_transcription(result: Dict) -> pd.DataFrame:
 
             if 'start' not in word and 'end' not in word:
                 if all_words:
-                    # Merge with the previous word
-                    all_words[-1]['text'] = f'{all_words[-1]["text"]}{joiner}{word["word"]}'
+                    # Assign the end time of the previous word as both the start and end time of the current word
+                    word_dict = {
+                        'text': word["word"],
+                        'start': all_words[-1]['end'],
+                        'end': all_words[-1]['end'],
+                    }
+                    all_words.append(word_dict)
                 else:
-                    # If it's the first word, temporarily save it and wait for the next word with a timestamp
-                    temp_word = word["word"]
+                    # If it's the first word, look ahead for the next word with a timestamp and assign that interval to the current word
+                    next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
+                    if next_word:
+                        word_dict = {
+                            'text': word["word"],
+                            'start': next_word["start"],
+                            'end': next_word["end"],
+                        }
+                        all_words.append(word_dict)
+                    else:
+                        raise Exception(f"No next word with timestamp found for the current word: {word}")
             else:
                 # Normal case, with start and end times
                 word_dict = {
-                    'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
+                    'text': f'{word["word"]}',
                     'start': word.get('start', all_words[-1]['end'] if all_words else 0),
                     'end': word['end'],
-                    'score': word.get('score', 0)
                 }
 
                 all_words.append(word_dict)
-                if 'temp_word' in locals():
-                    del temp_word
 
     return pd.DataFrame(all_words)
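To see the new timestamp fallback in isolation, here is a self-contained sketch that mirrors the logic above on invented sample data:

```python
import pandas as pd

def assign_timestamps(segment_words):
    """Mirror of the new fallback: words lacking timestamps borrow them."""
    all_words = []
    for word in segment_words:
        if 'start' not in word and 'end' not in word:
            if all_words:
                # Inherit the previous word's end time as a zero-length stamp
                all_words.append({'text': word['word'],
                                  'start': all_words[-1]['end'],
                                  'end': all_words[-1]['end']})
            else:
                # First word: borrow the next timestamped word's interval
                next_word = next((w for w in segment_words
                                  if 'start' in w and 'end' in w), None)
                if next_word is None:
                    raise Exception(f"No next word with timestamp found for: {word}")
                all_words.append({'text': word['word'],
                                  'start': next_word['start'],
                                  'end': next_word['end']})
        else:
            all_words.append({'text': word['word'],
                              'start': word['start'],
                              'end': word['end']})
    return pd.DataFrame(all_words)

print(assign_timestamps([
    {'word': 'Bonjour'},                       # first word, no timestamp
    {'word': 'le', 'start': 0.5, 'end': 0.6},
    {'word': '!'},                             # mid-segment, no timestamp
]))
```

Compared with the old behavior, untimed words are no longer merged into a neighbor's text; every word keeps its own row, so downstream alignment stays word-for-word.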

core/ask_gpt.py

Lines changed: 5 additions & 1 deletion

@@ -58,7 +58,7 @@ def make_api_call(client, model, messages, response_format):
             response_format=response_format
         )
 
-def ask_gpt(prompt, model, response_json=True, valid_key='', log_title='default'):
+def ask_gpt(prompt, model, response_json=True, valid_key='', valid_sub_key='', log_title='default'):
     with LOCK:
         if check_ask_gpt_history(prompt, model):
             return check_ask_gpt_history(prompt, model)

@@ -82,6 +82,10 @@ def ask_gpt(prompt, model, response_json=True, valid_key='', log_title='default'):
                 if valid_key and valid_key not in response_data:
                     print(f"❎ API response error: Missing '{valid_key}' key. Retrying...")
                     raise ValueError(f"Response missing '{valid_key}' key")
+                if valid_sub_key:
+                    if not all(valid_sub_key in item for item in response_data.values()):
+                        print(f"❎ API response error: Missing '{valid_sub_key}' sub-key in some items. Retrying...")
+                        raise ValueError(f"Response missing '{valid_sub_key}' sub-key in some items")
                 break  # Successfully accessed and parsed, break the loop
             except Exception as e:
                 response_data = response.choices[0].message.content
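A standalone illustration of the new `valid_sub_key` check; the response data below is invented:

```python
def validate_response(response_data: dict, valid_key='', valid_sub_key=''):
    """Standalone mirror of the validation added to ask_gpt above."""
    if valid_key and valid_key not in response_data:
        raise ValueError(f"Response missing '{valid_key}' key")
    if valid_sub_key and not all(valid_sub_key in item
                                 for item in response_data.values()):
        raise ValueError(f"Response missing '{valid_sub_key}' sub-key in some items")

good = {'1': {'translation': 'Hello'}, '2': {'translation': 'World'}}
bad = {'1': {'translation': 'Hello'}, '2': {}}  # item '2' lacks the sub-key

validate_response(good, valid_sub_key='translation')  # passes silently
try:
    validate_response(bad, valid_sub_key='translation')
except ValueError as e:
    print(e)  # Response missing 'translation' sub-key in some items
```

Inside `ask_gpt`, raising triggers the existing retry loop, so a structurally incomplete JSON answer is re-requested rather than passed downstream.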

core/prompts_storage.py

Lines changed: 1 addition & 1 deletion

@@ -179,7 +179,6 @@ def get_prompt_faithfulness(lines, shared_prompt):
     return prompt_faithfulness.strip()
 
 
-
 def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
     from config import TARGET_LANGUAGE
     json_format = {}

@@ -227,6 +226,7 @@ def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
 </subtitles>
 
 ### Output Format
+Make sure to generate the correct JSON format; don't output " in the values.
 Please complete the following JSON data, where << >> represents placeholders that should not appear in your answer, and return your translation results in JSON format:
 {json.dumps(json_format, ensure_ascii=False, indent=4)}
 '''
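For reference, the `<< >>` placeholder scaffold that the prompt asks the model to complete can be previewed with a toy example (the keys here are invented, not the repo's actual schema):

```python
import json

# Invented example of the << >> placeholder scaffold used in the prompt.
json_format = {
    "1": {"direct": "<<direct translation>>", "free": "<<polished translation>>"}
}
print(json.dumps(json_format, ensure_ascii=False, indent=4))
```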

core/spacy_utils/load_nlp_model.py

Lines changed: 2 additions & 8 deletions

@@ -21,16 +21,10 @@ def init_nlp():
             nlp = spacy.load(model)
         except:
             print(f"Downloading {model} model...")
+            print("If the download fails, please check your network and try again.")
             download(model)
             nlp = spacy.load(model)
     except:
-        print(f"Language not detected, using en_core_web_sm model as fallback...")
-        model = "en_core_web_sm"
-        try:
-            nlp = spacy.load(model)
-        except:
-            print(f"Downloading {model} model...")
-            download(model)
-            nlp = spacy.load(model)
+        raise ValueError(f"❌ Failed to load NLP Spacy model: {model}")
     print(f"✅ NLP Spacy model loaded successfully!")
     return nlp
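The resulting load-or-download pattern, as a runnable sketch (the model name is an example; running it downloads the model if absent):

```python
import spacy
from spacy.cli import download

def load_or_download(model: str):
    """Try to load a spaCy model, downloading it once on first failure."""
    try:
        return spacy.load(model)
    except OSError:
        print(f"Downloading {model} model...")
        print("If the download fails, please check your network and try again.")
        download(model)
        try:
            return spacy.load(model)
        except OSError:
            raise ValueError(f"❌ Failed to load NLP Spacy model: {model}")

nlp = load_or_download("en_core_web_md")  # example model name
```

Failing fast here is a deliberate change: silently falling back to en_core_web_sm could produce wrong sentence splits for non-English input without any visible error.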

core/spacy_utils/split_by_connector.py

Lines changed: 55 additions & 15 deletions

@@ -9,27 +9,67 @@ def analyze_connectors(doc, token):
     Analyze whether a token is a connector that should trigger a sentence split.
 
     Processing logic and order:
-    1. Check if the token is one of the target connectors (that, which, where, when).
-    2. For 'that', check if it's part of a contraction (e.g., that's, that'll).
-    3. For all connectors, check if they function as a 'mark' dependent of a verb.
-    4. For 'which', 'where', 'when', check if they function as determiners or pronouns
-       for nouns or proper nouns.
-    5. Default to splitting for 'which', 'where', 'when' if no other conditions are met.
-    6. For 'and', 'or', 'but', check if they connect two independent clauses.
+    1. Check if the token is one of the target connectors based on the language.
+    2. For 'that' (English), check if it's part of a contraction (e.g., that's, that'll).
+    3. For all connectors, check if they function as a specific dependency of a verb or noun.
+    4. Default to splitting for certain connectors if no other conditions are met.
+    5. For coordinating conjunctions, check if they connect two independent clauses.
     """
-    # Check if the token is one of the target connectors
-    if token.text.lower() not in ["that", "which", "where", "when", "because", "but", "and", "or"]:
+    lang = doc.lang_
+    if lang == "en":
+        connectors = ["that", "which", "where", "when", "because", "but", "and", "or"]
+        mark_dep = "mark"
+        det_pron_deps = ["det", "pron"]
+        verb_pos = "VERB"
+        noun_pos = ["NOUN", "PROPN"]
+    elif lang == "ja":
+        connectors = ["けれども", "しかし", "だから", "それで", "ので", "のに", "ため"]
+        mark_dep = "mark"
+        det_pron_deps = ["case"]
+        verb_pos = "VERB"
+        noun_pos = ["NOUN", "PROPN"]
+    elif lang == "fr":
+        connectors = ["que", "qui", "où", "quand", "parce que", "mais", "et", "ou"]
+        mark_dep = "mark"
+        det_pron_deps = ["det", "pron"]
+        verb_pos = "VERB"
+        noun_pos = ["NOUN", "PROPN"]
+    elif lang == "ru":
+        connectors = ["что", "который", "где", "когда", "потому что", "но", "и", "или"]
+        mark_dep = "mark"
+        det_pron_deps = ["det"]
+        verb_pos = "VERB"
+        noun_pos = ["NOUN", "PROPN"]
+    elif lang == "es":
+        connectors = ["que", "cual", "donde", "cuando", "porque", "pero", "y", "o"]
+        mark_dep = "mark"
+        det_pron_deps = ["det", "pron"]
+        verb_pos = "VERB"
+        noun_pos = ["NOUN", "PROPN"]
+    elif lang == "de":
+        connectors = ["dass", "welche", "wo", "wann", "weil", "aber", "und", "oder"]
+        mark_dep = "mark"
+        det_pron_deps = ["det", "pron"]
+        verb_pos = "VERB"
+        noun_pos = ["NOUN", "PROPN"]
+    elif lang == "it":
+        connectors = ["che", "quale", "dove", "quando", "perché", "ma", "e", "o"]
+        mark_dep = "mark"
+        det_pron_deps = ["det", "pron"]
+        verb_pos = "VERB"
+        noun_pos = ["NOUN", "PROPN"]
+    else:
+        return False, False
+
+    if token.text.lower() not in connectors:
         return False, False
 
-    if token.text.lower() == "that":
-        if token.dep_ == "mark" and token.head.pos_ == "VERB":
-            # Split if 'that' is a 'mark' dependent of a verb
+    if lang == "en" and token.text.lower() == "that":
+        if token.dep_ == mark_dep and token.head.pos_ == verb_pos:
             return True, False
         else:
-            # Don't split for other uses of 'that'
             return False, False
-    elif token.text.lower() != "that" and token.dep_ in ["det", "pron"] and token.head.pos_ in ["NOUN", "PROPN"]:
-        # Don't split if 'which', 'where', 'when' are determiners or pronouns for nouns
+    elif token.dep_ in det_pron_deps and token.head.pos_ in noun_pos:
         return False, False
     else:
        return True, False
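A small driver (assuming en_core_web_md is installed) shows how the dispatched rules decide where to split; the `analyze_connectors` below is a trimmed English-only stand-in for the full function above, not the repo's code:

```python
import spacy

def analyze_connectors(doc, token):
    """Trimmed English-only stand-in for the function in the diff."""
    if doc.lang_ != "en":
        return False, False
    if token.text.lower() not in ["that", "which", "where", "when",
                                  "because", "but", "and", "or"]:
        return False, False
    if token.text.lower() == "that":
        # Split only when 'that' is a 'mark' dependent of a verb
        return (token.dep_ == "mark" and token.head.pos_ == "VERB"), False
    if token.dep_ in ["det", "pron"] and token.head.pos_ in ["NOUN", "PROPN"]:
        return False, False  # determiner/pronoun use, don't split
    return True, False

nlp = spacy.load("en_core_web_md")
doc = nlp("I think that we should split here because the clause is long.")
for tok in doc:
    should_split, _ = analyze_connectors(doc, tok)
    if should_split:
        print(f"split before: {tok.text!r}")  # 'that', 'because'
```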

core/spacy_utils/split_by_mark.py

Lines changed: 7 additions & 2 deletions

@@ -23,8 +23,13 @@ def split_by_mark(nlp):
     sentences_by_mark = [sent.text for sent in doc.sents]
 
     with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as output_file:
-        for sentence in sentences_by_mark:
-            output_file.write(sentence + "\n")
+        for i, sentence in enumerate(sentences_by_mark):
+            if i > 0 and sentence.strip() in [',', '.', ',', '。', '?', '!']:
+                # ! If the current line contains only punctuation, merge it with the previous line; this happens in Chinese, Japanese, etc.
+                output_file.seek(output_file.tell() - 1, os.SEEK_SET)  # Move to the end of the previous line
+                output_file.write(sentence)  # Append the punctuation
+            else:
+                output_file.write(sentence + "\n")
 
     print("💾 Sentences split by punctuation marks saved to → `sentences_by_mark.txt`")
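Note that seeking to `tell() - 1` in a text-mode file assumes the character just written (the newline) occupies one byte; that holds here but is fragile. An equivalent list-based sketch merges the stray punctuation before writing anything:

```python
def merge_stray_punctuation(sentences):
    """Glue punctuation-only 'sentences' onto the previous sentence."""
    merged = []
    for s in sentences:
        if merged and s.strip() in [',', '.', ',', '。', '?', '!']:
            merged[-1] += s.strip()  # merge into the previous sentence
        else:
            merged.append(s)
    return merged

print(merge_stray_punctuation(['你好吗', '?', '我很好', '。']))
# ['你好吗?', '我很好。']
```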
