Skip to content

Commit 93ae6c7

Browse files
committed
debug
修复了法语中《》(引号)会被 spacy 错误视为句子结束而被分割的 bug;正在逐步测试各个语言。
1 parent 9e42551 commit 93ae6c7

File tree

5 files changed

+33
-49
lines changed

5 files changed

+33
-49
lines changed

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,15 @@ https://github.com/user-attachments/assets/0f5d5878-bfa5-41e4-ade1-d2b81d925a7d
6262

6363
| 输入语言 | 支持程度 | 示例视频 |
6464
|---------|---------|---------|
65-
| 英语 | ⭐⭐⭐ | [英转中 demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
66-
| 日语 | ⭐⭐ | |
67-
| 俄语 | ⭐⭐ | |
68-
| 中文 | | |
65+
| 英语 | 🤩 | [英转中 demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
66+
| 俄语 | 😊 | [俄转中 demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
67+
| 日语 | 😖 | |
68+
| 中文 | 😖 | |
6969
| 法语 | ❓ (尚未测试) | |
7070
| 德语 | ❓ (尚未测试) | |
7171
| 西班牙语 | ❓ (尚未测试) | |
7272

73-
- 输出语言支持:VideoLingo 支持翻译成所有语言
73+
- 输出语言支持:VideoLingo 支持翻译成 Claude 会的所有语言
7474

7575
## 🙏 致谢
7676

config.example.py

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -94,33 +94,21 @@
9494
# Spacy model
9595
# Spacy 模型
9696
SPACY_MODEL_MAP = {
97-
"en": "en_core_web_sm",
98-
"zh": "zh_core_web_sm",
99-
"es": "es_core_news_lg",
100-
"fr": "fr_core_news_lg",
101-
"de": "de_core_news_lg",
102-
"it": "it_core_news_lg",
103-
"ja": "ja_core_news_lg",
104-
"pt": "pt_core_news_lg",
105-
"nl": "nl_core_news_lg",
106-
"el": "el_core_news_lg",
107-
"ru": "ru_core_news_lg",
108-
"ar": "ar_core_news_lg",
109-
"hi": "hi_core_news_lg",
110-
"ko": "ko_core_news_lg",
111-
"pl": "pl_core_news_lg",
112-
"uk": "uk_core_news_lg",
113-
"vi": "vi_core_news_lg",
114-
"tr": "tr_core_news_lg",
115-
"th": "th_core_news_lg",
116-
"ro": "ro_core_news_lg",
117-
"da": "da_core_news_lg",
118-
"fi": "fi_core_news_lg",
119-
"hu": "hu_core_news_lg",
120-
"nb": "nb_core_news_lg",
121-
"sv": "sv_core_news_lg"
97+
"en": "en_core_web_md",
98+
"ru": "ru_core_news_md",
99+
"fr": "fr_core_news_md",
100+
101+
# "es": "es_core_news_md",
102+
# "de": "de_core_news_md",
103+
# "it": "it_core_news_md",
104+
105+
106+
# Not supported
107+
# "zh": "zh_core_web_md",
108+
# "ja": "ja_core_news_md",
122109
}
123110

111+
124112
# 使用空格分割的语言
125113
# Languages that split with space
126114
LANGUAGE_SPLIT_WITH_SPACE = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'pl', 'uk', 'vi', 'tr', 'ro', 'da', 'fi', 'hu', 'nb', 'sv']

core/all_whisper_methods/whisperX.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,26 +71,27 @@ def transcribe_audio(audio_file: str) -> Dict:
7171

7272
def process_transcription(result: Dict) -> pd.DataFrame:
7373
all_words = []
74-
# save to debug as json
75-
with open('output/log/debug.json', 'a', encoding='utf-8') as f:
76-
json.dump(result, f, ensure_ascii=False, indent=4)
7774
for segment in result['segments']:
7875
for word in segment['words']:
7976
if 'start' not in word and 'end' not in word:
8077
if all_words:
81-
# 合并到前一个词
78+
# Merge with the previous word
8279
all_words[-1]['text'] = f'{all_words[-1]["text"][:-1]}{word["word"]}"'
8380
else:
84-
# 如果是第一个词,暂时保存,等待下一个有时间戳的词
81+
# If it's the first word, temporarily save it and wait for the next word with a timestamp
8582
temp_word = word["word"]
8683
else:
87-
# 正常情况,有开始和结束时间
84+
# Normal case, with start and end times
8885
word_dict = {
8986
'text': f'"{temp_word}{word["word"]}"' if 'temp_word' in locals() else f'"{word["word"]}"',
9087
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
9188
'end': word['end'],
9289
'score': word.get('score', 0)
9390
}
91+
92+
# ! For French, we need to convert guillemets to empty strings
93+
word_dict['text'] = word_dict['text'].replace('»', '').replace('«', '')
94+
9495
all_words.append(word_dict)
9596
if 'temp_word' in locals():
9697
del temp_word

core/all_whisper_methods/whisperXapi.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,14 @@ def transcribe_audio(audio_base64: str) -> Dict:
6666
except Exception as e:
6767
raise Exception(f"Error accessing whisperX API: {e} Please check your Replicate API key and internet connection.\n")
6868

69+
6970
def process_transcription(result: Dict) -> pd.DataFrame:
7071
all_words = []
7172
for segment in result['segments']:
7273
for word in segment['words']:
74+
# ! For French, we need to convert guillemets to empty strings
75+
word["word"] = word["word"].replace('»', '').replace('«', '')
76+
7377
if 'start' not in word and 'end' not in word:
7478
if all_words:
7579
# Merge with the previous word
@@ -80,11 +84,12 @@ def process_transcription(result: Dict) -> pd.DataFrame:
8084
else:
8185
# Normal case, with start and end times
8286
word_dict = {
83-
'text': f'"{temp_word}{word["word"]}"' if 'temp_word' in locals() else f'"{word["word"]}"',
87+
'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
8488
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
8589
'end': word['end'],
8690
'score': word.get('score', 0)
8791
}
92+
8893
all_words.append(word_dict)
8994
if 'temp_word' in locals():
9095
del temp_word

core/spacy_utils/split_by_mark.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def split_by_mark(nlp):
1414
chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
1515
chunks.text = chunks.text.apply(lambda x: x.strip('"'))
1616

17-
# 用 joiner 拼接
17+
# join with joiner
1818
input_text = joiner.join(chunks.text.to_list())
1919

2020
doc = nlp(input_text)
@@ -29,15 +29,5 @@ def split_by_mark(nlp):
2929
print("💾 Sentences split by punctuation marks saved to → `sentences_by_mark.txt`")
3030

3131
if __name__ == "__main__":
32-
# nlp = init_nlp()
33-
# split_by_mark(nlp)
34-
35-
s = """そうで。"""
3632
nlp = init_nlp()
37-
doc = nlp(s)
38-
print(doc)
39-
assert doc.has_annotation("SENT_START")
40-
41-
sentences_by_mark = [sent.text for sent in doc.sents]
42-
print(sentences_by_mark)
43-
33+
split_by_mark(nlp)

0 commit comments

Comments
 (0)