Add piperTTS in-browser text-to-speech

Mintplex-Labs · Aug 6, 2024 · 686c8c1
1 parent cc594d4
commit 686c8c1
Show file tree

Hide file tree

Showing 20 changed files with 689 additions and 7 deletions.
diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['558-multi-modal-support'] # put your current branch to create a build. Core team only.
+    branches: ['pipertts-support'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'

diff --git a/README.md b/README.md
@@ -110,6 +110,7 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
 **TTS (text-to-speech) support:**
 
 - Native Browser Built-in (default)
+- [PiperTTSLocal - runs in browser](https://github.com/rhasspy/piper)
 - [OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech/voice-options)
 - [ElevenLabs](https://elevenlabs.io/)
 

diff --git a/frontend/package.json b/frontend/package.json
@@ -13,6 +13,7 @@
   "dependencies": {
     "@metamask/jazzicon": "^2.0.0",
     "@microsoft/fetch-event-source": "^2.0.1",
+    "@mintplex-labs/piper-tts-web": "^1.0.0",
     "@phosphor-icons/react": "^2.1.7",
     "@tremor/react": "^3.15.1",
     "dompurify": "^3.0.8",
@@ -25,6 +26,7 @@
     "lodash.debounce": "^4.0.8",
     "markdown-it": "^13.0.1",
     "moment": "^2.30.1",
+    "onnxruntime-web": "^1.18.0",
     "pluralize": "^8.0.0",
     "react": "^18.2.0",
     "react-device-detect": "^2.2.2",

diff --git a/frontend/public/piper/ort/ort-wasm-simd-threaded.wasm b/frontend/public/piper/ort/ort-wasm-simd-threaded.wasm
diff --git a/frontend/public/piper/ort/ort-wasm-simd.wasm b/frontend/public/piper/ort/ort-wasm-simd.wasm
diff --git a/frontend/public/piper/ort/ort-wasm.wasm b/frontend/public/piper/ort/ort-wasm.wasm
diff --git a/frontend/public/piper/piper_phonemize.data b/frontend/public/piper/piper_phonemize.data
diff --git a/frontend/public/piper/piper_phonemize.wasm b/frontend/public/piper/piper_phonemize.wasm
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
@@ -11,7 +11,6 @@ import "react-toastify/dist/ReactToastify.css";
 import Login from "@/pages/Login";
 import OnboardingFlow from "@/pages/OnboardingFlow";
 import i18n from "./i18n";
-
 import { PfpProvider } from "./PfpContext";
 import { LogoProvider } from "./LogoContext";
 import { FullScreenLoader } from "./components/Preloader";

diff --git a/frontend/src/components/TextToSpeech/PiperTTSOptions/index.jsx b/frontend/src/components/TextToSpeech/PiperTTSOptions/index.jsx
@@ -0,0 +1,221 @@
+import { useState, useEffect, useRef } from "react";
+import PiperTTSClient from "@/utils/piperTTS";
+import { titleCase } from "text-case";
+import { humanFileSize } from "@/utils/numbers";
+import showToast from "@/utils/toast";
+import { CircleNotch, PauseCircle, PlayCircle } from "@phosphor-icons/react";
+
+export default function PiperTTSOptions({ settings }) {
+  return (
+    <>
+      <p className="text-sm font-base text-white text-opacity-60 mb-4">
+        All PiperTTS models will run in your browser locally. This can be
+        resource intensive on lower-end devices.
+      </p>
+      <div className="flex gap-x-4 items-center">
+        <PiperTTSModelSelection settings={settings} />
+      </div>
+    </>
+  );
+}
+
+function voicesByLanguage(voices = []) {
+  const voicesByLanguage = voices.reduce((acc, voice) => {
+    const langName = voice?.language?.name_english ?? "Unlisted";
+    acc[langName] = acc[langName] || [];
+    acc[langName].push(voice);
+    return acc;
+  }, {});
+  return Object.entries(voicesByLanguage);
+}
+
+function voiceDisplayName(voice) {
+  const { is_stored, name, quality, files } = voice;
+  const onnxFileKey = Object.keys(files).find((key) => key.endsWith(".onnx"));
+  const fileSize = files?.[onnxFileKey]?.size_bytes || 0;
+  return `${is_stored ? "✔ " : ""}${titleCase(name)}-${quality === "low" ? "Low" : "HQ"} (${humanFileSize(fileSize)})`;
+}
+
+function PiperTTSModelSelection({ settings }) {
+  const [loading, setLoading] = useState(true);
+  const [voices, setVoices] = useState([]);
+  const [selectedVoice, setSelectedVoice] = useState(
+    settings?.TTSPiperTTSVoiceModel
+  );
+
+  function flushVoices() {
+    PiperTTSClient.flush()
+      .then(() =>
+        showToast("All voices flushed from browser storage", "info", {
+          clear: true,
+        })
+      )
+      .catch((e) => console.error(e));
+  }
+
+  useEffect(() => {
+    PiperTTSClient.voices()
+      .then((voices) => {
+        if (voices?.length !== 0) return setVoices(voices);
+        throw new Error("Could not fetch voices from web worker.");
+      })
+      .catch((e) => {
+        console.error(e);
+      })
+      .finally(() => setLoading(false));
+  }, []);
+
+  if (loading) {
+    return (
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-3">
+          Voice Model Selection
+        </label>
+        <select
+          name="TTSPiperTTSVoiceModel"
+          disabled={true}
+          className="border-none bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+        >
+          <option disabled={true} selected={true}>
+            -- loading available models --
+          </option>
+        </select>
+      </div>
+    );
+  }
+
+  return (
+    <div className="flex flex-col w-fit">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-3">
+          Voice Model Selection
+        </label>
+        <div className="flex items-center w-fit gap-x-4 mb-2">
+          <select
+            name="TTSPiperTTSVoiceModel"
+            required={true}
+            onChange={(e) => setSelectedVoice(e.target.value)}
+            value={selectedVoice}
+            className="border-none flex-shrink-0 bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+          >
+            {voicesByLanguage(voices).map(([lang, voices]) => {
+              return (
+                <optgroup key={lang} label={lang}>
+                  {voices.map((voice) => (
+                    <option
+                      selected={voice.key === selectedVoice}
+                      value={voice.key}
+                    >
+                      {voiceDisplayName(voice)}
+                    </option>
+                  ))}
+                </optgroup>
+              );
+            })}
+          </select>
+          <DemoVoiceSample voiceId={selectedVoice} />
+        </div>
+        <p className="text-xs text-white/40">
+          The "✔" indicates this model is already stored in your browser and
+          does not need to be downloaded
+        </p>
+      </div>
+      {!!voices.find((voice) => voice.is_stored) && (
+        <button
+          type="button"
+          onClick={flushVoices}
+          className="w-fit border-none hover:text-white hover:underline text-white/40 text-sm my-4"
+        >
+          Flush voice cache
+        </button>
+      )}
+    </div>
+  );
+}
+
+function DemoVoiceSample({ voiceId }) {
+  const playerRef = useRef(null);
+  const [speaking, setSpeaking] = useState(false);
+  const [loading, setLoading] = useState(false);
+  const [audioSrc, setAudioSrc] = useState(null);
+
+  async function speakMessage(e) {
+    e.preventDefault();
+    if (speaking) {
+      playerRef?.current?.pause();
+      return;
+    }
+
+    try {
+      if (!audioSrc) {
+        setLoading(true);
+        const client = new PiperTTSClient({ voiceId });
+        const blobUrl = await client.getAudioBlobForText(
+          "Hello, welcome to AnythingLLM!"
+        );
+        setAudioSrc(blobUrl);
+        setLoading(false);
+        client.worker?.terminate();
+        PiperTTSClient._instance = null;
+      } else {
+        playerRef.current.play();
+      }
+    } catch (e) {
+      console.error(e);
+      setLoading(false);
+      setSpeaking(false);
+    }
+  }
+
+  useEffect(() => {
+    function setupPlayer() {
+      if (!playerRef?.current) return;
+      playerRef.current.addEventListener("play", () => {
+        setSpeaking(true);
+      });
+
+      playerRef.current.addEventListener("pause", () => {
+        playerRef.current.currentTime = 0;
+        setSpeaking(false);
+        setAudioSrc(null);
+      });
+    }
+    setupPlayer();
+  }, []);
+
+  return (
+    <button
+      type="button"
+      onClick={speakMessage}
+      className="border-none text-zinc-300 flex items-center gap-x-1"
+    >
+      {speaking ? (
+        <>
+          <PauseCircle size={20} className="flex-shrink-0" />
+          <p className="text-sm flex-shrink-0">Stop demo</p>
+        </>
+      ) : (
+        <>
+          {loading ? (
+            <>
+              <CircleNotch size={20} className="animate-spin flex-shrink-0" />
+              <p className="text-sm flex-shrink-0">Loading voice</p>
+            </>
+          ) : (
+            <>
+              <PlayCircle size={20} className="flex-shrink-0" />
+              <p className="text-sm flex-shrink-0">Play sample</p>
+            </>
+          )}
+        </>
+      )}
+      <audio
+        ref={playerRef}
+        hidden={true}
+        src={audioSrc}
+        autoPlay={true}
+        controls={false}
+      />
+    </button>
+  );
+}
diff --git a/...nts/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/...nts/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
@@ -1,23 +1,38 @@
 import { useEffect, useState } from "react";
 import NativeTTSMessage from "./native";
 import AsyncTTSMessage from "./asyncTts";
+import PiperTTSMessage from "./piperTTS";
 import System from "@/models/system";
 
 export default function TTSMessage({ slug, chatId, message }) {
+  const [settings, setSettings] = useState({});
   const [provider, setProvider] = useState("native");
   const [loading, setLoading] = useState(true);
 
   useEffect(() => {
     async function getSettings() {
       const _settings = await System.keys();
       setProvider(_settings?.TextToSpeechProvider ?? "native");
+      setSettings(_settings);
       setLoading(false);
     }
     getSettings();
   }, []);
 
   if (!chatId || loading) return null;
-  if (provider !== "native")
-    return <AsyncTTSMessage slug={slug} chatId={chatId} />;
-  return <NativeTTSMessage message={message} />;
+
+  switch (provider) {
+    case "openai":
+    case "elevenlabs":
+      return <AsyncTTSMessage slug={slug} chatId={chatId} />;
+    case "piper_local":
+      return (
+        <PiperTTSMessage
+          voiceId={settings?.TTSPiperTTSVoiceModel}
+          message={message}
+        />
+      );
+    default:
+      return <NativeTTSMessage message={message} />;
+  }
 }
diff --git a/.../WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/piperTTS.jsx b/.../WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/piperTTS.jsx
@@ -0,0 +1,90 @@
+import { useEffect, useState, useRef } from "react";
+import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
+import { Tooltip } from "react-tooltip";
+import PiperTTSClient from "@/utils/piperTTS";
+
+export default function PiperTTS({ voiceId = null, message }) {
+  const playerRef = useRef(null);
+  const [speaking, setSpeaking] = useState(false);
+  const [loading, setLoading] = useState(false);
+  const [audioSrc, setAudioSrc] = useState(null);
+
+  async function speakMessage(e) {
+    e.preventDefault();
+    if (speaking) {
+      playerRef?.current?.pause();
+      return;
+    }
+
+    try {
+      if (!audioSrc) {
+        setLoading(true);
+        const client = new PiperTTSClient({ voiceId });
+        const blobUrl = await client.getAudioBlobForText(message);
+        setAudioSrc(blobUrl);
+        setLoading(false);
+      } else {
+        playerRef.current.play();
+      }
+    } catch (e) {
+      console.error(e);
+      setLoading(false);
+      setSpeaking(false);
+    }
+  }
+
+  useEffect(() => {
+    function setupPlayer() {
+      if (!playerRef?.current) return;
+      playerRef.current.addEventListener("play", () => {
+        setSpeaking(true);
+      });
+
+      playerRef.current.addEventListener("pause", () => {
+        playerRef.current.currentTime = 0;
+        setSpeaking(false);
+      });
+    }
+    setupPlayer();
+  }, []);
+
+  return (
+    <div className="mt-3 relative">
+      <button
+        type="button"
+        onClick={speakMessage}
+        data-tooltip-id="message-to-speech"
+        data-tooltip-content={
+          speaking ? "Pause TTS speech of message" : "TTS Speak message"
+        }
+        className="border-none text-zinc-300"
+        aria-label={speaking ? "Pause speech" : "Speak message"}
+      >
+        {speaking ? (
+          <PauseCircle size={18} className="mb-1" />
+        ) : (
+          <>
+            {loading ? (
+              <CircleNotch size={18} className="mb-1 animate-spin" />
+            ) : (
+              <SpeakerHigh size={18} className="mb-1" />
+            )}
+          </>
+        )}
+        <audio
+          ref={playerRef}
+          hidden={true}
+          src={audioSrc}
+          autoPlay={true}
+          controls={false}
+        />
+      </button>
+      <Tooltip
+        id="message-to-speech"
+        place="bottom"
+        delayShow={300}
+        className="tooltip !text-xs"
+      />
+    </div>
+  );
+}
diff --git a/frontend/src/media/ttsproviders/piper.png b/frontend/src/media/ttsproviders/piper.png