Skip to content

[Bug] When I generate audio with a TTS model and play it, I only hear noise and awkward fumbling. #4308

@ghanshyamsen

Description

@ghanshyamsen

Describe the bug

I am using the XTTS v2 TTS model. When I generate Hindi speech and play the audio, it fumbles and sounds like horror-movie noise, and the cloned voice does not match the reference speaker.

Note: I am also splitting long text into chunks, but this issue also occurs with short text.

To Reproduce

`# Set license agreement for Coqui
os.environ["COQUI_TOS_AGREED"] = "1"

# --- Setup ---

# Load the XTTS v2 model once at import time. On any failure the module
# still imports: `tts` is left as None and a minimal language list is used.
try:
    # Prefer the GPU when torch reports that CUDA is available.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    print("Initializing TTS model...")
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    print("TTS model loaded successfully.")

    # Languages offered to the user come from the loaded model itself.
    supported_languages = sorted(tts.languages)
    default_language = "hi"
except Exception as e:
    # Top-level boundary: report the problem and fall back to safe defaults.
    print(f"Error initializing TTS model: {e}")
    tts = None
    supported_languages = ["en", "hi"]
    default_language = "hi"

# Create a default speaker file if it doesn't exist

# Fallback reference clip used when no speaker sample is supplied.
# NOTE(review): the placeholder written below is one second of pure silence;
# a silent clip presumably gives the model nothing to clone from - confirm
# whether this explains poor output when no real sample is provided.
default_voice_path = "default_speaker.wav"
if not os.path.exists(default_voice_path):
    print(f"'{default_voice_path}' not found. Creating a silent placeholder.")
    AudioSegment.silent(duration=1000).export(
        default_voice_path,
        format="wav",
    )

`
def _is_silent_audio(path):
    """Return True when the audio file at *path* decodes to pure silence."""
    try:
        return AudioSegment.from_file(path).rms == 0
    except Exception:
        # Best-effort check only: if the file cannot be decoded here, let the
        # TTS call report the real problem instead of failing in this probe.
        return False


def synthesize(text, speaker_wav, bg_music, language, temperature, repetition_penalty, speed, progress=gr.Progress(track_tqdm=True)):
    """Synthesize *text* chunk by chunk and return the path of the final WAV.

    Args:
        text: Text to speak; it is split with ``split_text_into_chunks``.
        speaker_wav: Path to the reference speaker audio. When empty or
            missing on disk, ``default_voice_path`` is used instead.
        bg_music: Optional path to background music mixed under the voice.
        language: Language code forwarded to ``tts.tts_to_file``.
        temperature: Converted to ``float`` and forwarded to the TTS call.
        repetition_penalty: Converted to ``float`` and forwarded to the TTS call.
        speed: Converted to ``float`` and forwarded to the TTS call.
        progress: Gradio progress tracker wrapped around the chunk loop.

    Returns:
        Path to a temporary WAV file with the final audio, or ``None`` when
        the model is unavailable or there is no text to synthesize.

    Raises:
        gr.Error: If chunk generation, combining, music mixing or export fails.
    """
    if tts is None:
        gr.Warning("TTS Model is not available. Please check the console for errors.")
        return None

    speaker_path = speaker_wav if speaker_wav else default_voice_path
    if not os.path.exists(speaker_path):
        gr.Warning(f"Speaker file not found at '{speaker_path}'. Using default.")
        speaker_path = default_voice_path

    # A silent reference (e.g. an auto-generated placeholder) contains no
    # voice to clone, so tell the user instead of failing silently.
    if _is_silent_audio(speaker_path):
        gr.Warning(
            f"Speaker reference '{speaker_path}' contains only silence. "
            "Voice cloning needs a real speech sample; the output may be noise."
        )

    # Step 1: Split text and filter out any empty chunks
    text_chunks = split_text_into_chunks(text)
    if not text_chunks:
        gr.Info("Please enter some text to synthesize.")
        return None

    chunk_paths = []

    try:
        # Step 2: Generate each chunk
        print("Generating audio for chunks...")
        for i, chunk in enumerate(progress.tqdm(text_chunks, desc="Generating Chunks")):
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                chunk_path = fp.name
            # Register the file before synthesis so the cleanup in `finally`
            # removes it even when the TTS call below raises.
            chunk_paths.append(chunk_path)

            print(f"  - Chunk {i+1}/{len(text_chunks)}: '{chunk[:50]}...'")

            tts.tts_to_file(
                text=chunk,
                speaker_wav=speaker_path,
                language=language,
                file_path=chunk_path,
                temperature=float(temperature),
                repetition_penalty=float(repetition_penalty),
                speed=float(speed),
            )

        # Step 3: Combine chunks with pauses
        print("Combining audio chunks...")
        final_voice = AudioSegment.empty()
        pause_duration_ms = 400  # 0.4-second pause between chunks

        for i, path in enumerate(chunk_paths):
            final_voice += AudioSegment.from_file(path)
            # Add a pause after each chunk except the last one
            if i < len(chunk_paths) - 1:
                final_voice += AudioSegment.silent(duration=pause_duration_ms)

        # Step 4: Handle background music
        if bg_music and os.path.exists(bg_music):
            print("Mixing background music...")
            try:
                music = AudioSegment.from_file(bg_music)
                # Lower background volume significantly to not overpower the voice
                music = music - 20

                # Loop or trim music to match voice length
                if len(music) < len(final_voice):
                    if len(music) == 0:
                        # Looping an empty clip would divide by zero below.
                        raise ValueError("background music file contains no audio")
                    music = music * (len(final_voice) // len(music) + 1)
                music = music[:len(final_voice)]

                # Fade the music in and out so it does not start or stop abruptly
                music = music.fade_in(1500).fade_out(3000)
                final_output_audio = final_voice.overlay(music, position=0)
            except Exception as e:
                # gr.Error is only shown to the user when it is raised.
                raise gr.Error(f"Error mixing background music: {e}") from e
        else:
            final_output_audio = final_voice

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            final_output_path = fp.name
        final_output_audio.export(final_output_path, format="wav")
        print(f"Final audio saved to: {final_output_path}")

        return final_output_path

    except gr.Error:
        # Already a user-facing error (e.g. from music mixing); do not re-wrap.
        raise
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred during TTS generation: {e}") from e
    finally:
        # Step 5: Cleanup chunk files
        print("Cleaning up temporary chunk files...")
        for path in chunk_paths:
            if os.path.exists(path):
                os.remove(path)

`

Expected behavior

No response

Logs

Environment

TTS==0.22.0
torch==2.1
pydub==0.25.1

Additional context

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions