-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Description
Describe the bug
I am using the XTTS v2 TTS model. When I generate Hindi speech, the audio comes out garbled (it sounds like horror-movie noise), and the cloned voice does not match the reference speaker.
Note: I split long text into chunks, but the issue also occurs with short text.
To Reproduce
# Accept the Coqui license terms up front via the environment.
os.environ["COQUI_TOS_AGREED"] = "1"

# --- Setup ---
try:
    # Pick the compute device: GPU when CUDA is available, otherwise CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Load the XTTS v2 checkpoint (the same model the original script used)
    # and move it onto the selected device.
    print("Initializing TTS model...")
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    print("TTS model loaded successfully.")

    # Language choices come from the loaded model, sorted for display.
    supported_languages = sorted(tts.languages)
    default_language = "hi"
except Exception as e:
    # Any failure above leaves the module usable but with no model:
    # `tts` is None and a small hard-coded language list is offered instead.
    print(f"Error initializing TTS model: {e}")
    tts = None
    supported_languages = ["en", "hi"]
    default_language = "hi"

# Create a default speaker file if it doesn't exist, so that a fallback
# reference path is always present on disk.
# NOTE(review): the placeholder is 1000 ms of silence, which presumably gives
# the voice cloner nothing to imitate -- confirm that callers are expected to
# supply a real speaker sample for usable output.
default_voice_path = "default_speaker.wav"
if not os.path.exists(default_voice_path):
    print(f"'{default_voice_path}' not found. Creating a silent placeholder.")
    AudioSegment.silent(duration=1000).export(
        default_voice_path,
        format="wav",
    )
`
def _join_chunks_with_pauses(chunk_paths, pause_duration_ms=400):
    """Concatenate the audio files in *chunk_paths* with silence between them.

    A pause of *pause_duration_ms* milliseconds is inserted after every chunk
    except the last, so stitched sentences do not run into each other.
    """
    combined = AudioSegment.empty()
    last_index = len(chunk_paths) - 1
    for index, path in enumerate(chunk_paths):
        combined += AudioSegment.from_file(path)
        if index < last_index:
            combined += AudioSegment.silent(duration=pause_duration_ms)
    return combined


def _mix_background_music(voice, music_path):
    """Return *voice* with the track at *music_path* mixed underneath it.

    Raises:
        ValueError: If the music file decodes to zero-length audio and would
            need to be looped to cover the voice.
    """
    # Lower the music by 20 dB so it does not overpower the voice.
    music = AudioSegment.from_file(music_path) - 20

    # Loop the track until it covers the voice, then trim to the exact length.
    voice_length = len(voice)
    if len(music) < voice_length:
        if len(music) == 0:
            # Looping an empty track would divide by zero below.
            raise ValueError("background music file contains no audio")
        music = music * (voice_length // len(music) + 1)
    music = music[:voice_length]

    # Fade in and out so the music does not start or stop abruptly.
    music = music.fade_in(1500).fade_out(3000)
    return voice.overlay(music, position=0)


def synthesize(text, speaker_wav, bg_music, language, temperature, repetition_penalty, speed, progress=gr.Progress(track_tqdm=True)):
    """Synthesize *text* chunk by chunk and return the path of the final WAV.

    The text is split with ``split_text_into_chunks``; each chunk is rendered
    to its own temporary WAV via ``tts.tts_to_file``, the chunks are joined
    with short pauses, optional background music is mixed in, and the result
    is exported to a new temporary WAV file.

    Args:
        text: Text to synthesize.
        speaker_wav: Path of the reference speaker audio; when empty or
            missing on disk, ``default_voice_path`` is used instead.
        bg_music: Optional path of a background-music file; ignored when
            empty or missing on disk.
        language: Language value forwarded to ``tts.tts_to_file``.
        temperature: Forwarded to ``tts.tts_to_file`` as a float.
        repetition_penalty: Forwarded to ``tts.tts_to_file`` as a float.
        speed: Forwarded to ``tts.tts_to_file`` as a float.
        progress: Gradio progress tracker used to wrap the chunk loop.

    Returns:
        Path of the generated WAV file, or ``None`` when the model is not
        loaded or the text yields no chunks.

    Raises:
        gr.Error: If generation, combining, music mixing, or export fails.
            The error is raised (not merely constructed) so that the message
            actually reaches the user.
    """
    if tts is None:
        gr.Warning("TTS Model is not available. Please check the console for errors.")
        return None

    speaker_path = speaker_wav if speaker_wav else default_voice_path
    if not os.path.exists(speaker_path):
        gr.Warning(f"Speaker file not found at '{speaker_path}'. Using default.")
        speaker_path = default_voice_path

    # Step 1: Split text and filter out any empty chunks
    text_chunks = split_text_into_chunks(text)
    if not text_chunks:
        gr.Info("Please enter some text to synthesize.")
        return None

    chunk_paths = []
    try:
        # Step 2: Generate each chunk
        print("Generating audio for chunks...")
        for i, chunk in enumerate(progress.tqdm(text_chunks, desc="Generating Chunks")):
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                chunk_path = fp.name
            # Register the file before synthesis so the cleanup below also
            # removes it when this chunk fails part-way.
            chunk_paths.append(chunk_path)

            print(f" - Chunk {i+1}/{len(text_chunks)}: '{chunk[:50]}...'")
            tts.tts_to_file(
                text=chunk,
                speaker_wav=speaker_path,
                language=language,
                file_path=chunk_path,
                temperature=float(temperature),
                repetition_penalty=float(repetition_penalty),
                speed=float(speed),
            )

        # Step 3: Combine chunks with a 0.4-second pause between them
        print("Combining audio chunks...")
        final_voice = _join_chunks_with_pauses(chunk_paths, pause_duration_ms=400)

        # Step 4: Handle background music
        if bg_music and os.path.exists(bg_music):
            print("Mixing background music...")
            try:
                final_output_audio = _mix_background_music(final_voice, bg_music)
            except Exception as e:
                raise gr.Error(f"Error mixing background music: {e}") from e
        else:
            final_output_audio = final_voice

        # Reserve a path first, then export after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            final_output_path = fp.name
        final_output_audio.export(final_output_path, format="wav")
        print(f"Final audio saved to: {final_output_path}")
        return final_output_path
    except gr.Error:
        # Already a user-facing error (e.g. from music mixing); don't re-wrap.
        raise
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred during TTS generation: {e}") from e
    finally:
        # Step 5: Cleanup chunk files
        print("Cleaning up temporary chunk files...")
        for path in chunk_paths:
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup: never mask the real outcome.
                pass
`
Expected behavior
No response
Logs
Environment
TTS==0.22.0
torch==2.1
pydub==0.25.1
Additional context
No response