Skip to content

Commit

Permalink
Add piperTTS in-browser text-to-speech
Browse files Browse the repository at this point in the history
  • Loading branch information
timothycarambat committed Aug 6, 2024
1 parent cc594d4 commit 686c8c1
Show file tree
Hide file tree
Showing 20 changed files with 689 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['558-multi-modal-support'] # put your current branch to create a build. Core team only.
branches: ['pipertts-support'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
**TTS (text-to-speech) support:**

- Native Browser Built-in (default)
- [PiperTTSLocal - runs in browser](https://github.com/rhasspy/piper)
- [OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech/voice-options)
- [ElevenLabs](https://elevenlabs.io/)

Expand Down
2 changes: 2 additions & 0 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"dependencies": {
"@metamask/jazzicon": "^2.0.0",
"@microsoft/fetch-event-source": "^2.0.1",
"@mintplex-labs/piper-tts-web": "^1.0.0",
"@phosphor-icons/react": "^2.1.7",
"@tremor/react": "^3.15.1",
"dompurify": "^3.0.8",
Expand All @@ -25,6 +26,7 @@
"lodash.debounce": "^4.0.8",
"markdown-it": "^13.0.1",
"moment": "^2.30.1",
"onnxruntime-web": "^1.18.0",
"pluralize": "^8.0.0",
"react": "^18.2.0",
"react-device-detect": "^2.2.2",
Expand Down
Binary file not shown.
Binary file added frontend/public/piper/ort/ort-wasm-simd.wasm
Binary file not shown.
Binary file added frontend/public/piper/ort/ort-wasm.wasm
Binary file not shown.
Binary file added frontend/public/piper/piper_phonemize.data
Binary file not shown.
Binary file added frontend/public/piper/piper_phonemize.wasm
Binary file not shown.
1 change: 0 additions & 1 deletion frontend/src/App.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import "react-toastify/dist/ReactToastify.css";
import Login from "@/pages/Login";
import OnboardingFlow from "@/pages/OnboardingFlow";
import i18n from "./i18n";

import { PfpProvider } from "./PfpContext";
import { LogoProvider } from "./LogoContext";
import { FullScreenLoader } from "./components/Preloader";
Expand Down
221 changes: 221 additions & 0 deletions frontend/src/components/TextToSpeech/PiperTTSOptions/index.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
import { useState, useEffect, useRef } from "react";
import PiperTTSClient from "@/utils/piperTTS";
import { titleCase } from "text-case";
import { humanFileSize } from "@/utils/numbers";
import showToast from "@/utils/toast";
import { CircleNotch, PauseCircle, PlayCircle } from "@phosphor-icons/react";

export default function PiperTTSOptions({ settings }) {
return (
<>
<p className="text-sm font-base text-white text-opacity-60 mb-4">
All PiperTTS models will run in your browser locally. This can be
resource intensive on lower-end devices.
</p>
<div className="flex gap-x-4 items-center">
<PiperTTSModelSelection settings={settings} />
</div>
</>
);
}

/**
 * Groups voices by the English name of their language.
 *
 * Voices without a `language.name_english` value are collected under
 * "Unlisted".
 *
 * @param {Array<object>} [voices=[]] - Voice records to group.
 * @returns {Array<[string, Array<object>]>} `[languageName, voices]` pairs,
 *   as produced by `Object.entries`.
 */
function voicesByLanguage(voices = []) {
  const grouped = {};
  for (const voice of voices) {
    const languageName = voice?.language?.name_english ?? "Unlisted";
    if (!grouped[languageName]) grouped[languageName] = [];
    grouped[languageName].push(voice);
  }
  return Object.entries(grouped);
}

/**
 * Builds the label shown for a voice in the model dropdown.
 *
 * Format: `[✔ ]<Title Cased Name>-<Low|HQ> (<human readable size>)`, where the
 * check mark is present when `voice.is_stored` is truthy and the size is read
 * from the first entry in `voice.files` whose key ends with ".onnx".
 *
 * @param {object} voice - Voice record with `is_stored`, `name`, `quality`
 *   and an optional `files` map.
 * @returns {string} The display label.
 */
function voiceDisplayName(voice) {
  const { is_stored, name, quality, files } = voice;
  // `files` may be absent; fall back to an empty map so the lookup below
  // yields a size of 0 instead of throwing inside Object.keys.
  const onnxFileKey = Object.keys(files ?? {}).find((key) =>
    key.endsWith(".onnx")
  );
  const fileSize = files?.[onnxFileKey]?.size_bytes || 0;
  return `${is_stored ? "✔ " : ""}${titleCase(name)}-${quality === "low" ? "Low" : "HQ"} (${humanFileSize(fileSize)})`;
}

function PiperTTSModelSelection({ settings }) {
const [loading, setLoading] = useState(true);
const [voices, setVoices] = useState([]);
const [selectedVoice, setSelectedVoice] = useState(
settings?.TTSPiperTTSVoiceModel
);

function flushVoices() {
PiperTTSClient.flush()
.then(() =>
showToast("All voices flushed from browser storage", "info", {
clear: true,
})
)
.catch((e) => console.error(e));
}

useEffect(() => {
PiperTTSClient.voices()
.then((voices) => {
if (voices?.length !== 0) return setVoices(voices);
throw new Error("Could not fetch voices from web worker.");
})
.catch((e) => {
console.error(e);
})
.finally(() => setLoading(false));
}, []);

if (loading) {
return (
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
Voice Model Selection
</label>
<select
name="TTSPiperTTSVoiceModel"
disabled={true}
className="border-none bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
>
<option disabled={true} selected={true}>
-- loading available models --
</option>
</select>
</div>
);
}

return (
<div className="flex flex-col w-fit">
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
Voice Model Selection
</label>
<div className="flex items-center w-fit gap-x-4 mb-2">
<select
name="TTSPiperTTSVoiceModel"
required={true}
onChange={(e) => setSelectedVoice(e.target.value)}
value={selectedVoice}
className="border-none flex-shrink-0 bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
>
{voicesByLanguage(voices).map(([lang, voices]) => {
return (
<optgroup key={lang} label={lang}>
{voices.map((voice) => (
<option
selected={voice.key === selectedVoice}
value={voice.key}
>
{voiceDisplayName(voice)}
</option>
))}
</optgroup>
);
})}
</select>
<DemoVoiceSample voiceId={selectedVoice} />
</div>
<p className="text-xs text-white/40">
The "✔" indicates this model is already stored in your browser and
does not need to be downloaded
</p>
</div>
{!!voices.find((voice) => voice.is_stored) && (
<button
type="button"
onClick={flushVoices}
className="w-fit border-none hover:text-white hover:underline text-white/40 text-sm my-4"
>
Flush voice cache
</button>
)}
</div>
);
}

function DemoVoiceSample({ voiceId }) {
const playerRef = useRef(null);
const [speaking, setSpeaking] = useState(false);
const [loading, setLoading] = useState(false);
const [audioSrc, setAudioSrc] = useState(null);

async function speakMessage(e) {
e.preventDefault();
if (speaking) {
playerRef?.current?.pause();
return;
}

try {
if (!audioSrc) {
setLoading(true);
const client = new PiperTTSClient({ voiceId });
const blobUrl = await client.getAudioBlobForText(
"Hello, welcome to AnythingLLM!"
);
setAudioSrc(blobUrl);
setLoading(false);
client.worker?.terminate();
PiperTTSClient._instance = null;
} else {
playerRef.current.play();
}
} catch (e) {
console.error(e);
setLoading(false);
setSpeaking(false);
}
}

useEffect(() => {
function setupPlayer() {
if (!playerRef?.current) return;
playerRef.current.addEventListener("play", () => {
setSpeaking(true);
});

playerRef.current.addEventListener("pause", () => {
playerRef.current.currentTime = 0;
setSpeaking(false);
setAudioSrc(null);
});
}
setupPlayer();
}, []);

return (
<button
type="button"
onClick={speakMessage}
className="border-none text-zinc-300 flex items-center gap-x-1"
>
{speaking ? (
<>
<PauseCircle size={20} className="flex-shrink-0" />
<p className="text-sm flex-shrink-0">Stop demo</p>
</>
) : (
<>
{loading ? (
<>
<CircleNotch size={20} className="animate-spin flex-shrink-0" />
<p className="text-sm flex-shrink-0">Loading voice</p>
</>
) : (
<>
<PlayCircle size={20} className="flex-shrink-0" />
<p className="text-sm flex-shrink-0">Play sample</p>
</>
)}
</>
)}
<audio
ref={playerRef}
hidden={true}
src={audioSrc}
autoPlay={true}
controls={false}
/>
</button>
);
}
Original file line number Diff line number Diff line change
@@ -1,23 +1,38 @@
import { useEffect, useState } from "react";
import NativeTTSMessage from "./native";
import AsyncTTSMessage from "./asyncTts";
import PiperTTSMessage from "./piperTTS";
import System from "@/models/system";

export default function TTSMessage({ slug, chatId, message }) {
const [settings, setSettings] = useState({});
const [provider, setProvider] = useState("native");
const [loading, setLoading] = useState(true);

useEffect(() => {
async function getSettings() {
const _settings = await System.keys();
setProvider(_settings?.TextToSpeechProvider ?? "native");
setSettings(_settings);
setLoading(false);
}
getSettings();
}, []);

if (!chatId || loading) return null;
if (provider !== "native")
return <AsyncTTSMessage slug={slug} chatId={chatId} />;
return <NativeTTSMessage message={message} />;

switch (provider) {
case "openai":
case "elevenlabs":
return <AsyncTTSMessage slug={slug} chatId={chatId} />;
case "piper_local":
return (
<PiperTTSMessage
voiceId={settings?.TTSPiperTTSVoiceModel}
message={message}
/>
);
default:
return <NativeTTSMessage message={message} />;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import { useEffect, useState, useRef } from "react";
import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
import { Tooltip } from "react-tooltip";
import PiperTTSClient from "@/utils/piperTTS";

export default function PiperTTS({ voiceId = null, message }) {
const playerRef = useRef(null);
const [speaking, setSpeaking] = useState(false);
const [loading, setLoading] = useState(false);
const [audioSrc, setAudioSrc] = useState(null);

async function speakMessage(e) {
e.preventDefault();
if (speaking) {
playerRef?.current?.pause();
return;
}

try {
if (!audioSrc) {
setLoading(true);
const client = new PiperTTSClient({ voiceId });
const blobUrl = await client.getAudioBlobForText(message);
setAudioSrc(blobUrl);
setLoading(false);
} else {
playerRef.current.play();
}
} catch (e) {
console.error(e);
setLoading(false);
setSpeaking(false);
}
}

useEffect(() => {
function setupPlayer() {
if (!playerRef?.current) return;
playerRef.current.addEventListener("play", () => {
setSpeaking(true);
});

playerRef.current.addEventListener("pause", () => {
playerRef.current.currentTime = 0;
setSpeaking(false);
});
}
setupPlayer();
}, []);

return (
<div className="mt-3 relative">
<button
type="button"
onClick={speakMessage}
data-tooltip-id="message-to-speech"
data-tooltip-content={
speaking ? "Pause TTS speech of message" : "TTS Speak message"
}
className="border-none text-zinc-300"
aria-label={speaking ? "Pause speech" : "Speak message"}
>
{speaking ? (
<PauseCircle size={18} className="mb-1" />
) : (
<>
{loading ? (
<CircleNotch size={18} className="mb-1 animate-spin" />
) : (
<SpeakerHigh size={18} className="mb-1" />
)}
</>
)}
<audio
ref={playerRef}
hidden={true}
src={audioSrc}
autoPlay={true}
controls={false}
/>
</button>
<Tooltip
id="message-to-speech"
place="bottom"
delayShow={300}
className="tooltip !text-xs"
/>
</div>
);
}
Binary file added frontend/src/media/ttsproviders/piper.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 686c8c1

Please sign in to comment.