update

undertheseanlp · Oct 27, 2024 · 92dd0fe · 92dd0fe
1 parent 68b3db2
commit 92dd0fe
Show file tree

Hide file tree

Showing 12 changed files with 3,625 additions and 27 deletions.
diff --git a/apps/languagesv2/languages-v2-web/src/pages/AudiosData.tsx b/apps/languagesv2/languages-v2-web/src/pages/AudiosData.tsx
diff --git a/apps/languagesv2/languages-v2-web/src/pages/Home.tsx b/apps/languagesv2/languages-v2-web/src/pages/Home.tsx
@@ -1,17 +1,24 @@
 import React, { useState } from 'react';
 import VietnameseWords, { VietnameseWord } from './VietnameseWords';
 import { BUILD_VERSION, DEVELOPMENT_TEAM } from './Config';
-
-// Example build version, replace this with an environment variable if needed.
+import Audios from './AudiosData';
 
 const frequentVietnameseWords = VietnameseWords;
 
 function Home() {
   const [hoveredIndex, setHoveredIndex] = useState<number | null>(null);
   const [hoveredType, setHoveredType] = useState<string | null>(null);
 
-  const playSound = (index: number, type: string) => {
-    const audio = new Audio(`/audio/word-${index}-${type}.mp3`);
+  const playSound = (word: VietnameseWord, speaker_id: number) => {
+    const wordItem = Audios.find((audio) => audio.word === word.word && audio.speaker_id === speaker_id);
+    if (!wordItem) {
+      console.warn('Audio not found for the given word and speaker');
+      return;
+    }
+
+    const audioUrl = `https://undertheseanlp.com/data/audios/${wordItem.audio_id}.wav`;
+    const audio = new Audio(audioUrl);
+
     audio.play();
   };
 
@@ -43,7 +50,7 @@ function Home() {
                 <div
                   className="absolute top-0 left-0 h-2 rounded-t-xl bg-gradient-to-r from-green-400 via-blue-500 to-indigo-600 shadow-md"
                   style={{
-                    width: `${Math.min(frequency * 10, 100)}%`,
+                    width: `${Math.max(Math.min(100 - frequency * 10, 100), 0)}%`,
                     transition: 'width 0.3s ease-in-out',
                     boxShadow: '0px 4px 10px rgba(0, 0, 0, 0.2)',
                   }}
@@ -52,13 +59,13 @@ function Home() {
                 <p className="italic text-gray-500 mb-4">{partOfSpeech}</p>
                 <div className="flex justify-center space-x-6 mt-4">
                   <button
-                    onClick={() => playSound(index, 'pronunciation')}
-                    onMouseEnter={() => handleMouseEnter(index, 'pronunciation')}
+                    onClick={() => playSound(wordItem, 2)} // Northern sound
+                    onMouseEnter={() => handleMouseEnter(index, 'northern')}
                     onMouseLeave={handleMouseLeave}
                     className="text-indigo-600 hover:text-indigo-800 focus:outline-none transition-colors transform hover:scale-110 duration-300"
-                    aria-label="North Pronunciation"
+                    aria-label="Northern Sound"
                   >
-                    {hoveredIndex === index && hoveredType === 'pronunciation' ? (
+                    {hoveredIndex === index && hoveredType === 'northern' ? (
                       <span className="bg-indigo-600 text-white px-2 py-1 rounded-lg transition-opacity duration-300 ease-in-out">
                         NTH
                       </span>
@@ -67,13 +74,13 @@ function Home() {
                     )}
                   </button>
                   <button
-                    onClick={() => playSound(index, 'example')}
-                    onMouseEnter={() => handleMouseEnter(index, 'example')}
+                    onClick={() => playSound(wordItem, 1)} // Southern sound
+                    onMouseEnter={() => handleMouseEnter(index, 'southern')}
                     onMouseLeave={handleMouseLeave}
                     className="text-pink-600 hover:text-pink-800 focus:outline-none transition-colors transform hover:scale-110 duration-300"
-                    aria-label="South Pronunciation"
+                    aria-label="Southern Sound"
                   >
-                    {hoveredIndex === index && hoveredType === 'example' ? (
+                    {hoveredIndex === index && hoveredType === 'southern' ? (
                       <span className="bg-pink-600 text-white px-2 py-1 rounded-lg transition-opacity duration-300 ease-in-out">
                         STH
                       </span>

diff --git a/apps/languagesv2/languages-v2-web/src/pages/VietnameseWords.tsx b/apps/languagesv2/languages-v2-web/src/pages/VietnameseWords.tsx
@@ -4,7 +4,7 @@ export interface VietnameseWord {
   frequency: number;
 }
 
-const VietnameseWords =   [
+const VietnameseWords = [
   { word: "và", partOfSpeech: "", frequency: 0 },
   { word: "của", partOfSpeech: "", frequency: 0 },
   { word: "là", partOfSpeech: "", frequency: 0 },

diff --git a/apps/languagesv2/scripts/.env.example b/apps/languagesv2/scripts/.env.example
@@ -0,0 +1,5 @@
+UTS_HOST=
+UTS_USERNAME=
+UTS_PASSWORD=
+ZALO_API_KEY=
diff --git a/apps/languagesv2/scripts/.gitignore b/apps/languagesv2/scripts/.gitignore
@@ -1 +1,2 @@
-data
+data
+.env
diff --git a/apps/languagesv2/scripts/README.md b/apps/languagesv2/scripts/README.md
@@ -0,0 +1,8 @@
+# README
+
+Run black and flake8 before committing:
+
+```
+black .
+flake8 .  --max-complexity 10 --ignore E501,W504,W605
+```
diff --git a/apps/languagesv2/scripts/__init__.py b/apps/languagesv2/scripts/__init__.py
diff --git a/apps/languagesv2/scripts/models.py b/apps/languagesv2/scripts/models.py
@@ -0,0 +1,8 @@
+class Word:
+    """A vocabulary entry holding an identifier, the word text and its frequency value."""
+
+    def __init__(self, id, text, freq):
+        # The three constructor arguments are stored unchanged as public attributes.
+        self.id, self.text, self.freq = id, text, freq
+
+    def __repr__(self):
+        """Return a constructor-like string; the text is wrapped in single quotes."""
+        return f"Word(id={self.id}, text='{self.text}', freq={self.freq})"
diff --git a/apps/languagesv2/scripts/requirements.txt b/apps/languagesv2/scripts/requirements.txt
@@ -0,0 +1,2 @@
+requests
+python-dotenv
diff --git a/apps/languagesv2/scripts/script_extract_2000_words.py b/apps/languagesv2/scripts/script_extract_2000_words.py
@@ -1,21 +1,14 @@
 import json
-
-
-class Word:
-    def __init__(self, id, text, freq):
-        self.id = id
-        self.text = text
-        self.freq = freq
-
-    def __repr__(self):
-        return f"Word(id={self.id}, text='{self.text}', freq={self.freq})"
+from models import Word
 
 
 def normalize_text(text):
     return text.lower()
 
 
-def extract_words_from_freq_file(n=2000, data_folder="/workspaces/data", filename="freq_vie_1M_2018-freq.txt"):
+def extract_words_from_freq_file(
+    n=2000, data_folder="/workspaces/data", filename="freq_vie_1M_2018-freq.txt"
+):
     """Extracts words from a frequency file and returns a list of Word objects."""
     words = []
     with open(f"{data_folder}/{filename}", "r", encoding="utf-8") as file:
@@ -36,7 +29,9 @@ def export_words_to_js_file(words, output_file="data/VietnameseWords.js"):
     with open(output_file, "w", encoding="utf-8") as file:
         file.write("const VietnameseWords = [\n")
         for word in words:
-            file.write(f'  {{ word: "{word.text}", partOfSpeech: "", frequency: {word.freq} }},\n')
+            file.write(
+                f'  {{ word: "{word.text}", partOfSpeech: "", frequency: {word.freq} }},\n'
+            )
         file.write("];\n")
     print(f"File '{output_file}' created successfully with {len(words)} entries.")
 

diff --git a/apps/languagesv2/scripts/script_tts.py b/apps/languagesv2/scripts/script_tts.py
@@ -0,0 +1,207 @@
+import json
+import os
+import requests
+import uuid
+from models import Word
+from dotenv import load_dotenv
+
+# Load environment variables (python-dotenv's load_dotenv)
+load_dotenv()
+
+# Directory setup
+# Folder where the synthesized "<key>.wav" files are written and looked up.
+AUDIO_DIR = "/workspaces/data/audios"
+# JSONL metadata file: one {"word", "speaker_id", "audio_id"} object per line.
+AUDIOS_FILE_PATH = "data/Audios.jsonl"
+# TypeScript module generated from the JSONL metadata by convert_jsonl_to_tsx().
+AUDIOS_TSX_FILE_PATH = "data/Audios.tsx"
+# Sent as the "apikey" request header by tts(); os.getenv returns None when unset.
+ZALO_API_KEY = os.getenv("ZALO_API_KEY")
+
+
+def load_existing_metadata():
+    """
+    Collect the (word, speaker_id) pairs already recorded in AUDIOS_FILE_PATH.
+
+    Returns:
+        set: One (word, speaker_id) tuple per parseable line; an empty set when
+            the metadata file does not exist.
+    """
+    seen_pairs = set()
+    if os.path.exists(AUDIOS_FILE_PATH):
+        with open(AUDIOS_FILE_PATH, "r", encoding="utf-8") as handle:
+            for raw_line in handle:
+                try:
+                    record = json.loads(raw_line)
+                except json.JSONDecodeError:
+                    # Lines that are not valid JSON are ignored.
+                    continue
+                seen_pairs.add((record["word"], record["speaker_id"]))
+    return seen_pairs
+
+
+# Module-level cache of known (word, speaker_id) pairs, filled once at import time
+existing_metadata = load_existing_metadata()
+
+
+def generate_key(word, speaker_id):
+    """
+    Derive the audio key for a word/speaker pair.
+
+    The key is the string form of a name-based UUID (version 3, DNS namespace)
+    computed from "<word>_<speaker_id>", so equal inputs always give the same key.
+    """
+    seed = f"{word}_{speaker_id}"
+    derived = uuid.uuid3(uuid.NAMESPACE_DNS, seed)
+    return str(derived)
+
+
+def is_downloaded(key):
+    """Return True when "<key>.wav" is present inside AUDIO_DIR (size is not checked)."""
+    candidate = os.path.join(AUDIO_DIR, f"{key}.wav")
+    return os.path.exists(candidate)
+
+
+def save_audio_metadata(word, speaker_id, audio_id):
+    """
+    Append one metadata record to AUDIOS_FILE_PATH unless the pair is already known.
+
+    Args:
+        word: Text the audio was synthesized from.
+        speaker_id: Voice identifier used for the synthesis.
+        audio_id: Key of the stored audio file.
+    """
+    pair = (word, speaker_id)
+
+    # Guard clause: nothing is written for a pair that was recorded before.
+    if pair in existing_metadata:
+        print(f"Metadata already exists for '{word}' with speaker_id {speaker_id}")
+        return
+
+    record = {"word": word, "speaker_id": speaker_id, "audio_id": audio_id}
+    with open(AUDIOS_FILE_PATH, "a", encoding="utf-8") as f:
+        # ensure_ascii=False keeps non-ASCII characters readable in the file.
+        f.write(json.dumps(record, ensure_ascii=False) + "\n")
+    existing_metadata.add(pair)  # Keep the in-memory cache in sync with the file
+    print(f"Metadata saved for '{word}' with speaker_id {speaker_id}")
+
+
+def tts(text, speaker_ids=(1,), encode_type=0):
+    """
+    Synthesize speech for the given text if not already downloaded.
+
+    Args:
+        text (str): Text to be synthesized into speech.
+        speaker_ids (iterable[int]): IDs representing speaker voice types.
+            1 - South women
+            2 - Northern women
+            3 - South men
+            4 - Northern men
+            Any iterable works (lists are still accepted); the default is an
+            immutable tuple so it cannot be shared and mutated between calls.
+        encode_type (int): Encoding type for the speech synthesis (default is 0).
+
+    Returns:
+        dict: Dictionary mapping speaker IDs to paths of the synthesized audio files.
+            Speakers whose synthesis or download failed are absent from the dict.
+    """
+    file_paths = {}
+
+    # API setup (identical for every speaker, so it is built once)
+    url = "https://api.zalo.ai/v1/tts/synthesize"
+    headers = {
+        "apikey": ZALO_API_KEY,
+        "Content-Type": "application/x-www-form-urlencoded",
+    }
+    # Without a timeout a stalled connection would block the whole batch forever.
+    request_timeout = 30  # seconds
+
+    for speaker_id in speaker_ids:
+        # Generate key for file naming
+        key = generate_key(text, speaker_id)
+
+        # Check if the audio is already downloaded
+        file_path = os.path.join(AUDIO_DIR, f"{key}.wav")
+        if is_downloaded(key):
+            # Verify if file size is greater than zero
+            if os.path.getsize(file_path) > 0:
+                print(
+                    f"File for '{text}' with speaker_id {speaker_id} already exists at: {file_path}"
+                )
+                file_paths[speaker_id] = file_path
+                continue
+            else:
+                print(
+                    f"File for '{text}' with speaker_id {speaker_id} is empty, retrying download..."
+                )
+
+        data = {"input": text, "speaker_id": speaker_id, "encode_type": encode_type}
+
+        # Retry mechanism
+        for attempt in range(5):  # Try up to 5 times
+            try:
+                response = requests.post(
+                    url, headers=headers, data=data, timeout=request_timeout
+                )
+            except requests.RequestException as e:
+                # A network failure for one word must not abort the whole run.
+                print(
+                    f"Request error on attempt {attempt + 1} for '{text}' with speaker_id {speaker_id}: {e}. Retrying..."
+                )
+                continue
+
+            if response.status_code != 200:
+                print(
+                    f"Failed to call API (status code {response.status_code}) on attempt {attempt + 1}. Retrying..."
+                )
+                continue
+
+            try:
+                response_data = response.json()
+            except ValueError:
+                # requests raises a ValueError subclass when the body is not JSON.
+                print(
+                    f"Invalid JSON from API on attempt {attempt + 1} for '{text}' with speaker_id {speaker_id}. Retrying..."
+                )
+                continue
+
+            if response_data.get("error_code") != 0:
+                print("Error:", response_data.get("error_message"))
+                break  # No retry for API response errors
+
+            audio_url = response_data["data"]["url"]
+            try:
+                audio_response = requests.get(audio_url, timeout=request_timeout)
+            except requests.RequestException as e:
+                print(
+                    f"Download error on attempt {attempt + 1} for '{text}' with speaker_id {speaker_id}: {e}. Retrying..."
+                )
+                continue
+
+            # An HTTP error body must never be stored as audio or recorded as metadata.
+            if audio_response.status_code != 200:
+                print(
+                    f"Download attempt {attempt + 1} failed for '{text}' with speaker_id {speaker_id} (status code {audio_response.status_code}). Retrying..."
+                )
+                continue
+
+            # Save audio file
+            os.makedirs(AUDIO_DIR, exist_ok=True)
+            with open(file_path, "wb") as f:
+                f.write(audio_response.content)
+
+            # Verify if file is downloaded correctly
+            if os.path.getsize(file_path) > 0:
+                print(f"Audio saved at: {file_path}")
+                file_paths[speaker_id] = file_path
+                save_audio_metadata(text, speaker_id, key)
+                break  # Exit retry loop if download is successful
+            else:
+                print(
+                    f"Download attempt {attempt + 1} failed for '{text}' with speaker_id {speaker_id}. Retrying..."
+                )
+                os.remove(file_path)  # Remove the empty file
+
+    return file_paths
+
+
+def read_words(file_path):
+    """
+    Read a JSONL word list and build one Word per usable line.
+
+    Each non-blank line is parsed as JSON and must provide the "word" and
+    "frequency" keys. A line that fails is reported on stdout and skipped.
+    The Word id is the zero-based line index in the file, blank lines included.
+
+    Args:
+        file_path (str): Path of the UTF-8 encoded JSONL file.
+
+    Returns:
+        list: Word objects in file order.
+    """
+    parsed = []
+    with open(file_path, "r", encoding="utf-8") as source:
+        for line_no, raw in enumerate(source):
+            stripped = raw.strip()
+            if stripped:  # Blank lines carry no entry
+                try:
+                    record = json.loads(stripped)
+                    parsed.append(
+                        Word(id=line_no, text=record["word"], freq=record["frequency"])
+                    )
+                except Exception as e:
+                    print("Exception:", e)
+                    print("Line:", stripped)
+    return parsed
+
+
+def convert_jsonl_to_tsx():
+    """
+    Render the records of AUDIOS_FILE_PATH as a TypeScript module.
+
+    The output file (AUDIOS_TSX_FILE_PATH) declares a "const Audios" array with
+    one JSON object literal per record and default-exports it. Lines of the
+    JSONL file that are not valid JSON are left out.
+    """
+    records = []
+
+    # Collect every parseable record from the JSONL file
+    with open(AUDIOS_FILE_PATH, "r", encoding="utf-8") as jsonl_file:
+        for raw_line in jsonl_file:
+            try:
+                records.append(json.loads(raw_line))
+            except json.JSONDecodeError:
+                continue  # Malformed line: ignore it
+
+    # Build the module text line by line, then write it in one go
+    rendered = ["const Audios = [\n"]
+    rendered.extend(
+        "  " + json.dumps(record, ensure_ascii=False) + ",\n" for record in records
+    )
+    rendered.append("];\n\n")
+    rendered.append("export default Audios;\n")
+
+    with open(AUDIOS_TSX_FILE_PATH, "w", encoding="utf-8") as tsx_file:
+        tsx_file.writelines(rendered)
+
+    print(f"Audios.tsx file created at: {AUDIOS_TSX_FILE_PATH}")
+
+
+if __name__ == "__main__":
+    # Make sure the audio output directory exists
+    os.makedirs(AUDIO_DIR, exist_ok=True)
+
+    file_path = "data/VietnameseWords.jsonl"
+    words = read_words(file_path)
+
+    # Uncomment to limit a trial run to the first 10 words
+    # active_words = words[:10]
+
+    active_words = words
+
+    # Synthesize every active word with speaker 1 (South women) and
+    # speaker 2 (Northern women); tts() reuses non-empty files already on disk
+    for word in active_words:
+        print(f"Processing word: {word.text}")
+        tts(word.text, speaker_ids=[1, 2])
+
+    # Regenerate the TypeScript module from the accumulated JSONL metadata
+    convert_jsonl_to_tsx()
diff --git a/apps/languagesv2/scripts/script_upload_audios_to_server.py b/apps/languagesv2/scripts/script_upload_audios_to_server.py
@@ -0,0 +1,30 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Define directories and server details
+SRC_DIR = "/workspaces/data/audios"  # Local folder that is synced to the server
+HOST_DIR = "/root/data"  # Destination folder on the remote host
+UTS_HOST = os.getenv("UTS_HOST")
+UTS_USERNAME = os.getenv("UTS_USERNAME")
+UTS_PASSWORD = os.getenv("UTS_PASSWORD")
+
+# Check if required environment variables are set (unset or empty both count as missing)
+if not UTS_HOST or not UTS_USERNAME or not UTS_PASSWORD:
+    print(
+        "Error: Missing UTS_HOST, UTS_USERNAME, or UTS_PASSWORD environment variable."
+    )
+    # SystemExit is a builtin; the bare exit() helper is injected by the site
+    # module for interactive use and is missing under "python -S" or frozen builds.
+    raise SystemExit(1)
+
+
+# Sync the audio folder to the server with rsync; sshpass supplies the SSH password
+def sync_audio_to_server():
+    """
+    Copy SRC_DIR to HOST_DIR on the remote host with "rsync -avz".
+
+    The password is handed to sshpass through the SSHPASS environment variable
+    ("sshpass -e"), so it appears neither on the command line nor in the log
+    output. The command runs from an argument list without a shell, so the
+    configured values are never interpreted as shell syntax.
+
+    Failures are reported on stdout; nothing is raised and nothing is returned.
+    """
+    import subprocess
+
+    destination = f"{UTS_USERNAME}@{UTS_HOST}:{HOST_DIR}"
+    argv = ["sshpass", "-e", "rsync", "-avz", SRC_DIR, destination]
+    print(f"Executing command: {' '.join(argv)}")
+
+    # Expose the password only to the child process environment.
+    child_env = dict(os.environ, SSHPASS=UTS_PASSWORD)
+    try:
+        result = subprocess.run(argv, env=child_env)
+    except FileNotFoundError:
+        print("Error: 'sshpass' executable not found; install sshpass and rsync.")
+        return
+
+    if result.returncode != 0:
+        print(f"Error: sync command exited with status {result.returncode}")
+
+
+# Run the sync function
+sync_audio_to_server()
-Original file line number
+Diff line change
@@ -1 +1,2 @@
-    data
+    data
+    .env