feat: support edge tts

Hk-Gosuto 2024-05-27 21:25:25 +08:00
parent b1e91ca5cd
commit e759631ba3
14 changed files with 537 additions and 56 deletions


@@ -92,3 +92,8 @@ ANTHROPIC_URL=
### (optional)
WHITE_WEBDEV_ENDPOINTS=

# (optional)
# Default: zh-CN-YunxiNeural
# voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
EDGE_TTS_VOICE_NAME=


@@ -40,7 +40,10 @@
- 除插件工具外,与原项目保持一致 [ChatGPT-Next-Web 主要功能](https://github.com/Yidadaa/ChatGPT-Next-Web#主要功能)
- 支持 TTS (文本转语音)
  - (免费) Edge TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/266
    - 环境变量(可选):`EDGE_TTS_VOICE_NAME`
  - (收费) OpenAI TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208
- 支持语音输入,需要使用 HTTPS 访问 https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208
@@ -291,6 +294,10 @@ anthropic claude Api Url.

### `DEFAULT_INPUT_TEMPLATE` (可选)

自定义默认的 template,用于初始化『设置』中的『用户输入预处理』配置项

### `EDGE_TTS_VOICE_NAME` (可选)

配置 Edge TTS 使用的语音声音,默认为 zh-CN-YunxiNeural
可访问 https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages 查看支持的参数
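例如,使用默认语音时相当于设置:`EDGE_TTS_VOICE_NAME=zh-CN-YunxiNeural`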
## 部署

### 容器部署 (推荐)


@@ -15,6 +15,7 @@ const DANGER_CONFIG = {
  customModels: serverConfig.customModels,
  isEnableRAG: serverConfig.isEnableRAG,
  defaultModel: serverConfig.defaultModel,
  edgeTTSVoiceName: serverConfig.edgeTTSVoiceName,
};

declare global {


@@ -95,6 +95,7 @@ import { useNavigate } from "react-router-dom";
import {
  CHAT_PAGE_SIZE,
  DEFAULT_STT_ENGINE,
  DEFAULT_TTS_ENGINE,
  FIREFOX_DEFAULT_STT_ENGINE,
  LAST_INPUT_KEY,
  ModelProvider,
@@ -119,6 +120,7 @@ import {
  WebTranscriptionApi,
} from "../utils/speech";
import { FileInfo } from "../client/platforms/utils";
import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";

const ttsPlayer = createTTSPlayer();

@@ -1086,12 +1088,25 @@ function _Chat() {
      const config = useAppConfig.getState();
      setSpeechLoading(true);
      ttsPlayer.init();
      let audioBuffer: ArrayBuffer;
      const { markdownToTxt } = require("markdown-to-txt");
      const textContent = markdownToTxt(text);
      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
        const edgeVoiceName = accessStore.edgeVoiceName();
        const tts = new MsEdgeTTS();
        await tts.setMetadata(
          edgeVoiceName,
          OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
        );
        audioBuffer = await tts.toArrayBuffer(textContent);
      } else {
        audioBuffer = await api.llm.speech({
          model: config.ttsConfig.model,
          input: textContent,
          voice: config.ttsConfig.voice,
          speed: config.ttsConfig.speed,
        });
      }
      setSpeechStatus(true);
      ttsPlayer
        .play(audioBuffer, () => {


@@ -2,7 +2,12 @@ import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store";
import Locale from "../locales";
import { ListItem, Select } from "./ui-lib";
import {
  DEFAULT_TTS_ENGINE,
  DEFAULT_TTS_ENGINES,
  DEFAULT_TTS_MODELS,
  DEFAULT_TTS_VOICES,
} from "../constant";
import { InputRange } from "./input-range";

export function TTSConfigList(props: {
@@ -39,66 +44,89 @@ export function TTSConfigList(props: {
          }
        ></input>
      </ListItem> */}
      <ListItem title={Locale.Settings.TTS.Engine}>
        <Select
          value={props.ttsConfig.engine}
          onChange={(e) => {
            props.updateConfig(
              (config) =>
                (config.engine = TTSConfigValidator.engine(
                  e.currentTarget.value,
                )),
            );
          }}
        >
          {DEFAULT_TTS_ENGINES.map((v, i) => (
            <option value={v} key={i}>
              {v}
            </option>
          ))}
        </Select>
      </ListItem>
      {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
        <>
          <ListItem title={Locale.Settings.TTS.Model}>
            <Select
              value={props.ttsConfig.model}
              onChange={(e) => {
                props.updateConfig(
                  (config) =>
                    (config.model = TTSConfigValidator.model(
                      e.currentTarget.value,
                    )),
                );
              }}
            >
              {DEFAULT_TTS_MODELS.map((v, i) => (
                <option value={v} key={i}>
                  {v}
                </option>
              ))}
            </Select>
          </ListItem>
          <ListItem
            title={Locale.Settings.TTS.Voice.Title}
            subTitle={Locale.Settings.TTS.Voice.SubTitle}
          >
            <Select
              value={props.ttsConfig.voice}
              onChange={(e) => {
                props.updateConfig(
                  (config) =>
                    (config.voice = TTSConfigValidator.voice(
                      e.currentTarget.value,
                    )),
                );
              }}
            >
              {DEFAULT_TTS_VOICES.map((v, i) => (
                <option value={v} key={i}>
                  {v}
                </option>
              ))}
            </Select>
          </ListItem>
          <ListItem
            title={Locale.Settings.TTS.Speed.Title}
            subTitle={Locale.Settings.TTS.Speed.SubTitle}
          >
            <InputRange
              value={props.ttsConfig.speed?.toFixed(1)}
              min="0.3"
              max="4.0"
              step="0.1"
              onChange={(e) => {
                props.updateConfig(
                  (config) =>
                    (config.speed = TTSConfigValidator.speed(
                      e.currentTarget.valueAsNumber,
                    )),
                );
              }}
            ></InputRange>
          </ListItem>
        </>
      )}
    </>
  );
}


@@ -151,5 +151,7 @@ export const getServerSideConfig = () => {
    ragChunkOverlap: process.env.RAG_CHUNK_OVERLAP ?? "200",
    ragReturnCount: process.env.RAG_RETURN_COUNT ?? "4",
    allowedWebDevEndpoints,
    edgeTTSVoiceName: process.env.EDGE_TTS_VOICE_NAME ?? "zh-CN-YunxiNeural",
  };
};


@@ -142,6 +142,8 @@ export const KnowledgeCutOffDate: Record<string, string> = {
  "gemini-pro-vision": "2023-12",
};

export const DEFAULT_TTS_ENGINE = "OpenAI-TTS";
export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"];
export const DEFAULT_TTS_MODEL = "tts-1";
export const DEFAULT_TTS_VOICE = "alloy";
export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];


@@ -399,13 +399,14 @@ const cn = {
    TTS: {
      Enable: {
        Title: "启用文本转语音",
        SubTitle: "启用文本生成语音服务",
      },
      Autoplay: {
        Title: "启用自动朗读",
        SubTitle: "自动生成语音并播放,需先开启文本转语音开关",
      },
      Model: "模型",
      Engine: "转换引擎",
      Voice: {
        Title: "声音",
        SubTitle: "生成语音时使用的声音",


@@ -405,7 +405,7 @@ const en: LocaleType = {
    TTS: {
      Enable: {
        Title: "Enable TTS",
        SubTitle: "Enable text-to-speech service",
      },
      Autoplay: {
        Title: "Enable Autoplay",
@@ -421,6 +421,7 @@ const en: LocaleType = {
        Title: "Speed",
        SubTitle: "The speed of the generated audio",
      },
      Engine: "TTS Engine",
    },
    STT: {
      Enable: {


@@ -51,6 +51,9 @@ const DEFAULT_ACCESS_STATE = {
  customModels: "",
  isEnableRAG: false,
  defaultModel: "",

  // tts config
  edgeTTSVoiceName: "zh-CN-YunxiNeural",
};

export const useAccessStore = createPersistStore(
@@ -63,6 +66,12 @@ export const useAccessStore = createPersistStore(
      return get().needCode;
    },

    edgeVoiceName() {
      this.fetch();
      return get().edgeTTSVoiceName;
    },

    enableRAG() {
      this.fetch();


@@ -6,6 +6,8 @@ import {
  DEFAULT_SIDEBAR_WIDTH,
  DEFAULT_STT_ENGINE,
  DEFAULT_STT_ENGINES,
  DEFAULT_TTS_ENGINE,
  DEFAULT_TTS_ENGINES,
  DEFAULT_TTS_MODEL,
  DEFAULT_TTS_MODELS,
  DEFAULT_TTS_VOICE,
@@ -17,6 +19,7 @@ import { createPersistStore } from "../utils/store";
export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];
export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];
@@ -79,6 +82,7 @@ export const DEFAULT_CONFIG = {
  ttsConfig: {
    enable: false,
    autoplay: false,
    engine: DEFAULT_TTS_ENGINE,
    model: DEFAULT_TTS_MODEL,
    voice: DEFAULT_TTS_VOICE,
    speed: 1.0,
@@ -111,6 +115,9 @@ export function limitNumber(
}

export const TTSConfigValidator = {
  engine(x: string) {
    return x as TTSEngineType;
  },
  model(x: string) {
    return x as TTSModelType;
  },

app/utils/ms_edge_tts.ts (new file, 378 lines)

@@ -0,0 +1,378 @@
import axios from "axios";
import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";
// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS
/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
 */
export enum VOLUME {
  SILENT = "silent",
  X_SOFT = "x-soft",
  SOFT = "soft",
  MEDIUM = "medium",
  LOUD = "loud",
  X_LOUD = "x-LOUD",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
 */
export enum RATE {
  X_SLOW = "x-slow",
  SLOW = "slow",
  MEDIUM = "medium",
  FAST = "fast",
  X_FAST = "x-fast",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
 */
export enum PITCH {
  X_LOW = "x-low",
  LOW = "low",
  MEDIUM = "medium",
  HIGH = "high",
  X_HIGH = "x-high",
  DEFAULT = "default",
}

/**
 * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
 */
export enum OUTPUT_FORMAT {
  // Streaming =============================
  // AMR_WB_16000HZ = "amr-wb-16000hz",
  // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
  // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
  // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
  // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
  // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
  // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
  AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
  AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
  // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
  // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
  // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
  // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
  // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
  // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
  // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
  // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
  // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
  // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
  // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
  // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
  // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
  // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
  WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
  // Non-streaming =============================
  // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
  // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
  // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
  // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
  // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
  // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
  // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
}

export type Voice = {
  Name: string;
  ShortName: string;
  Gender: string;
  Locale: string;
  SuggestedCodec: string;
  FriendlyName: string;
  Status: string;
};

export class ProsodyOptions {
  /**
   * The pitch to use.
   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
   */
  pitch?: PITCH | string = "+0Hz";
  /**
   * The rate to use.
   * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
   */
  rate?: RATE | string | number = 1.0;
  /**
   * The volume to use.
   * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
   */
  volume?: VOLUME | string | number = 100.0;
}
export class MsEdgeTTS {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static BINARY_DELIM = "Path:audio\r\n";
  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  private readonly _enableLogger;
  private _ws: WebSocket | undefined;
  private _voice: any;
  private _voiceLocale: any;
  private _outputFormat: any;
  private _streams: { [key: string]: Readable } = {};
  private _startTime = 0;

  private _log(...o: any[]) {
    if (this._enableLogger) {
      console.log(...o);
    }
  }
  /**
   * Create a new `MsEdgeTTS` instance.
   *
   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console
   */
  public constructor(enableLogger: boolean = false) {
    this._enableLogger = enableLogger;
  }
  private async _send(message: any) {
    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
      if (i == 1) {
        this._startTime = Date.now();
      }
      this._log("connecting: ", i);
      await this._initClient();
    }
    this._ws!.send(message);
  }

  private _initClient() {
    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
    this._ws.binaryType = "arraybuffer";
    return new Promise((resolve, reject) => {
      this._ws!.onopen = () => {
        this._log(
          "Connected in",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        this._send(
          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
          {
            "context": {
              "synthesis": {
                "audio": {
                  "metadataoptions": {
                    "sentenceBoundaryEnabled": "false",
                    "wordBoundaryEnabled": "false"
                  },
                  "outputFormat": "${this._outputFormat}"
                }
              }
            }
          }
        `,
        ).then(resolve);
      };
      this._ws!.onmessage = (m: any) => {
        const buffer = Buffer.from(m.data as ArrayBuffer);
        const message = buffer.toString();
        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
        if (message.includes("Path:turn.start")) {
          // start of turn, ignore
        } else if (message.includes("Path:turn.end")) {
          // end of turn, close stream
          this._streams[requestId].push(null);
        } else if (message.includes("Path:response")) {
          // context response, ignore
        } else if (
          message.includes("Path:audio") &&
          m.data instanceof ArrayBuffer
        ) {
          this._pushAudioData(buffer, requestId);
        } else {
          this._log("UNKNOWN MESSAGE", message);
        }
      };
      this._ws!.onclose = () => {
        this._log(
          "disconnected after:",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        for (const requestId in this._streams) {
          this._streams[requestId].push(null);
        }
      };
      this._ws!.onerror = function (error: any) {
        reject("Connect Error: " + error);
      };
    });
  }

  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    const audioStartIndex =
      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
      MsEdgeTTS.BINARY_DELIM.length;
    const audioData = audioBuffer.subarray(audioStartIndex);
    this._streams[requestId].push(audioData);
    this._log("received audio chunk, size: ", audioData?.length);
  }

  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
    // in case future updates to the edge API block these elements, we'll be concatenating strings.
    options = { ...new ProsodyOptions(), ...options };
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
        <voice name="${this._voice}">
            <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
                ${input}
            </prosody>
        </voice>
    </speak>`;
  }

  /**
   * Fetch the list of voices available in Microsoft Edge.
   * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
   */
  getVoices(): Promise<Voice[]> {
    return new Promise((resolve, reject) => {
      axios
        .get(MsEdgeTTS.VOICES_URL)
        .then((res) => resolve(res.data))
        .catch(reject);
    });
  }
  /**
   * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * Saved in this instance. Can be called at any time to update the metadata.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;
    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;
    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;
    // create new client
    if (changed || this._ws!.readyState !== this._ws!.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }

  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }

  /**
   * Close the WebSocket connection.
   */
  close() {
    this._ws!.close();
  }
  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }
  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      let data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });
      readable.on("end", () => {
        resolve(Buffer.concat(data).buffer);
      });
      readable.on("error", (err) => {
        reject(err);
      });
    });
  }

  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
   *
   * @param requestSSML the SSML to send. SSML elements required in order to work.
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }

  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();
    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
      ` + requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    this._send(request).then();
    return { stream, requestId };
  }
}
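A minimal usage sketch (not part of the commit) showing how this class is driven, mirroring the call pattern in the chat.tsx hunk earlier; the import path and the `synthesizeToMp3` helper name are illustrative, and it assumes a browser-like environment where a global `WebSocket` is available:

```ts
import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";

// Turn plain text into an MP3 ArrayBuffer using the default Edge voice.
async function synthesizeToMp3(text: string): Promise<ArrayBuffer> {
  const tts = new MsEdgeTTS();
  // setMetadata must run first: it infers the locale from the voice name
  // and opens the WebSocket connection to the Edge read-aloud endpoint.
  await tts.setMetadata(
    "zh-CN-YunxiNeural",
    OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
  );
  // Collects the streamed audio chunks and resolves with a single buffer.
  return tts.toArrayBuffer(text);
}
```

The resulting buffer can then be handed to the TTS player, as the chat.tsx hunk above does.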


@@ -38,13 +38,14 @@
    "encoding": "^0.1.13",
    "epub2": "^3.0.2",
    "fuse.js": "^7.0.0",
    "heic2any": "^0.0.4",
    "html-entities": "^2.4.0",
    "html-to-image": "^1.11.11",
    "html-to-text": "^9.0.5",
    "https-proxy-agent": "^7.0.2",
    "langchain": "0.1.37",
    "mammoth": "^1.7.1",
    "markdown-to-txt": "^2.0.1",
    "md5": "^2.3.0",
    "mermaid": "^10.6.1",
    "mime": "^4.0.1",


@@ -6231,11 +6231,21 @@ lodash.debounce@^4.0.8:
  resolved "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz"
  integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==

lodash.escape@^4.0.1:
  version "4.0.1"
  resolved "https://registry.yarnpkg.com/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98"
  integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==

lodash.merge@^4.6.2:
  version "4.6.2"
  resolved "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz"
  integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==

lodash.unescape@^4.0.1:
  version "4.0.1"
  resolved "https://registry.yarnpkg.com/lodash.unescape/-/lodash.unescape-4.0.1.tgz#bf2249886ce514cda112fae9218cdc065211fc9c"
  integrity sha512-DhhGRshNS1aX6s5YdBE3njCCouPgnG29ebyHvImlZzXZf2SHgt+J08DHgytTPnpywNbO1Y8mNUFyQuIDBq2JZg==

lodash@^4.17.21:
  version "4.17.21"
  resolved "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz"
@@ -6324,6 +6334,20 @@ markdown-table@^3.0.0:
  resolved "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.3.tgz"
  integrity sha512-Z1NL3Tb1M9wH4XESsCDEksWoKTdlUafKc4pt0GRwjUyXaCFZ+dc3g2erqB6zm3szA2IUSi7VnPI+o/9jnxh9hw==

markdown-to-txt@^2.0.1:
  version "2.0.1"
  resolved "https://registry.yarnpkg.com/markdown-to-txt/-/markdown-to-txt-2.0.1.tgz#bfd6233a2635443cc24900a158b60c6af36ce9c5"
  integrity sha512-Hsj7KTN8k1gutlLum3vosHwVZGnv8/cbYKWVkUyo/D1rzOYddbDesILebRfOsaVfjIBJank/AVOySBlHAYqfZw==
  dependencies:
    lodash.escape "^4.0.1"
    lodash.unescape "^4.0.1"
    marked "^4.0.14"

marked@^4.0.14:
  version "4.3.0"
  resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3"
  integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==

md5@^2.3.0:
  version "2.3.0"
  resolved "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz"