feat: support edge tts

2025-05-23 14:10:18 +09:00 · 2024-05-27 21:25:25 +08:00 · 2024-05-27 21:25:25 +08:00 · e759631ba3
commit e759631ba3
parent b1e91ca5cd
14 changed files with 537 additions and 56 deletions
--- a/.env.template
+++ b/.env.template
@ -92,3 +92,8 @@ ANTHROPIC_URL=

 ### (optional)
 WHITE_WEBDEV_ENDPOINTS=
+
+# (optional)
+# Default: zh-CN-YunxiNeural
+# The voice used by Edge TTS, given as a voice `ShortName` (for example zh-CN-YunxiNeural). A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
+EDGE_TTS_VOICE_NAME=
--- a/README.md
+++ b/README.md
@ -40,7 +40,10 @@

 - 除插件工具外，与原项目保持一致 [ChatGPT-Next-Web 主要功能](https://github.com/Yidadaa/ChatGPT-Next-Web#主要功能)

- 支持 OpenAI TTS（文本转语音）https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208
+- 支持 TTS （文本转语音）
+  - （免费） Edge TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/266
+    - 环境变量（可选）：`EDGE_TTS_VOICE_NAME`
+  - （收费） OpenAI TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208

 - 支持语音输入，需要使用 HTTPS 访问 https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208

@ -291,6 +294,10 @@ anthropic claude Api Url.
 ### `DEFAULT_INPUT_TEMPLATE` （可选）
 自定义默认的 template，用于初始化『设置』中的『用户输入预处理』配置项

+### `EDGE_TTS_VOICE_NAME` （可选）
+配置 Edge TTS 使用的语音声音，默认为：zh-CN-YunxiNeural
+可访问 https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages 查看支持的参数
+
 ## 部署

 ### 容器部署 （推荐）
--- a/app/api/config/route.ts
+++ b/app/api/config/route.ts
@ -15,6 +15,7 @@ const DANGER_CONFIG = {
  customModels: serverConfig.customModels,
  isEnableRAG: serverConfig.isEnableRAG,
  defaultModel: serverConfig.defaultModel,
+  edgeTTSVoiceName: serverConfig.edgeTTSVoiceName,
 };

 declare global {
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@ -95,6 +95,7 @@ import { useNavigate } from "react-router-dom";
 import {
  CHAT_PAGE_SIZE,
  DEFAULT_STT_ENGINE,
+  DEFAULT_TTS_ENGINE,
  FIREFOX_DEFAULT_STT_ENGINE,
  LAST_INPUT_KEY,
  ModelProvider,
@ -119,6 +120,7 @@ import {
  WebTranscriptionApi,
 } from "../utils/speech";
 import { FileInfo } from "../client/platforms/utils";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";

 const ttsPlayer = createTTSPlayer();

@ -1086,12 +1088,25 @@ function _Chat() {
      const config = useAppConfig.getState();
      setSpeechLoading(true);
      ttsPlayer.init();
-      const audioBuffer = await api.llm.speech({
-        model: config.ttsConfig.model,
-        input: text,
-        voice: config.ttsConfig.voice,
-        speed: config.ttsConfig.speed,
-      });
+      let audioBuffer: ArrayBuffer;
+      const { markdownToTxt } = require("markdown-to-txt");
+      const textContent = markdownToTxt(text);
+      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+        const edgeVoiceName = accessStore.edgeVoiceName();
+        const tts = new MsEdgeTTS();
+        await tts.setMetadata(
+          edgeVoiceName,
+          OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
+        );
+        audioBuffer = await tts.toArrayBuffer(textContent);
+      } else {
+        audioBuffer = await api.llm.speech({
+          model: config.ttsConfig.model,
+          input: textContent,
+          voice: config.ttsConfig.voice,
+          speed: config.ttsConfig.speed,
+        });
+      }
      setSpeechStatus(true);
      ttsPlayer
        .play(audioBuffer, () => {
--- a/app/components/tts-config.tsx
+++ b/app/components/tts-config.tsx
@ -2,7 +2,12 @@ import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store";

 import Locale from "../locales";
 import { ListItem, Select } from "./ui-lib";
-import { DEFAULT_TTS_MODELS, DEFAULT_TTS_VOICES } from "../constant";
+import {
+  DEFAULT_TTS_ENGINE,
+  DEFAULT_TTS_ENGINES,
+  DEFAULT_TTS_MODELS,
+  DEFAULT_TTS_VOICES,
+} from "../constant";
 import { InputRange } from "./input-range";

 export function TTSConfigList(props: {
@ -39,66 +44,89 @@ export function TTSConfigList(props: {
          }
        ></input>
      </ListItem> */}
-      <ListItem title={Locale.Settings.TTS.Model}>
+      <ListItem title={Locale.Settings.TTS.Engine}>
        <Select
-          value={props.ttsConfig.model}
+          value={props.ttsConfig.engine}
          onChange={(e) => {
            props.updateConfig(
              (config) =>
-                (config.model = TTSConfigValidator.model(
+                (config.engine = TTSConfigValidator.engine(
                  e.currentTarget.value,
                )),
            );
          }}
        >
-          {DEFAULT_TTS_MODELS.map((v, i) => (
+          {DEFAULT_TTS_ENGINES.map((v, i) => (
            <option value={v} key={i}>
              {v}
            </option>
          ))}
        </Select>
      </ListItem>
-      <ListItem
-        title={Locale.Settings.TTS.Voice.Title}
-        subTitle={Locale.Settings.TTS.Voice.SubTitle}
-      >
-        <Select
-          value={props.ttsConfig.voice}
-          onChange={(e) => {
-            props.updateConfig(
-              (config) =>
-                (config.voice = TTSConfigValidator.voice(
-                  e.currentTarget.value,
-                )),
-            );
-          }}
-        >
-          {DEFAULT_TTS_VOICES.map((v, i) => (
-            <option value={v} key={i}>
-              {v}
-            </option>
-          ))}
-        </Select>
-      </ListItem>
-      <ListItem
-        title={Locale.Settings.TTS.Speed.Title}
-        subTitle={Locale.Settings.TTS.Speed.SubTitle}
-      >
-        <InputRange
-          value={props.ttsConfig.speed?.toFixed(1)}
-          min="0.3"
-          max="4.0"
-          step="0.1"
-          onChange={(e) => {
-            props.updateConfig(
-              (config) =>
-                (config.speed = TTSConfigValidator.speed(
-                  e.currentTarget.valueAsNumber,
-                )),
-            );
-          }}
-        ></InputRange>
-      </ListItem>
+      {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+        <>
+          <ListItem title={Locale.Settings.TTS.Model}>
+            <Select
+              value={props.ttsConfig.model}
+              onChange={(e) => {
+                props.updateConfig(
+                  (config) =>
+                    (config.model = TTSConfigValidator.model(
+                      e.currentTarget.value,
+                    )),
+                );
+              }}
+            >
+              {DEFAULT_TTS_MODELS.map((v, i) => (
+                <option value={v} key={i}>
+                  {v}
+                </option>
+              ))}
+            </Select>
+          </ListItem>
+          <ListItem
+            title={Locale.Settings.TTS.Voice.Title}
+            subTitle={Locale.Settings.TTS.Voice.SubTitle}
+          >
+            <Select
+              value={props.ttsConfig.voice}
+              onChange={(e) => {
+                props.updateConfig(
+                  (config) =>
+                    (config.voice = TTSConfigValidator.voice(
+                      e.currentTarget.value,
+                    )),
+                );
+              }}
+            >
+              {DEFAULT_TTS_VOICES.map((v, i) => (
+                <option value={v} key={i}>
+                  {v}
+                </option>
+              ))}
+            </Select>
+          </ListItem>
+          <ListItem
+            title={Locale.Settings.TTS.Speed.Title}
+            subTitle={Locale.Settings.TTS.Speed.SubTitle}
+          >
+            <InputRange
+              value={props.ttsConfig.speed?.toFixed(1)}
+              min="0.3"
+              max="4.0"
+              step="0.1"
+              onChange={(e) => {
+                props.updateConfig(
+                  (config) =>
+                    (config.speed = TTSConfigValidator.speed(
+                      e.currentTarget.valueAsNumber,
+                    )),
+                );
+              }}
+            ></InputRange>
+          </ListItem>
+        </>
+      )}
    </>
  );
 }
--- a/app/config/server.ts
+++ b/app/config/server.ts
@ -151,5 +151,7 @@ export const getServerSideConfig = () => {
    ragChunkOverlap: process.env.RAG_CHUNK_OVERLAP ?? "200",
    ragReturnCount: process.env.RAG_RETURN_COUNT ?? "4",
    allowedWebDevEndpoints,
+
+    edgeTTSVoiceName: process.env.EDGE_TTS_VOICE_NAME ?? "zh-CN-YunxiNeural",
  };
 };
--- a/app/constant.ts
+++ b/app/constant.ts
@ -142,6 +142,8 @@ export const KnowledgeCutOffDate: Record<string, string> = {
  "gemini-pro-vision": "2023-12",
 };

+// Engine name used as the initial value of `ttsConfig.engine`.
+export const DEFAULT_TTS_ENGINE = "OpenAI-TTS";
+// Engine names listed as options in the TTS settings selector.
+export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"];
 export const DEFAULT_TTS_MODEL = "tts-1";
 export const DEFAULT_TTS_VOICE = "alloy";
 export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];
--- a/app/locales/cn.ts
+++ b/app/locales/cn.ts
@ -399,13 +399,14 @@ const cn = {
    TTS: {
      Enable: {
        Title: "启用文本转语音",
-        SubTitle: "启用基于 OpenAI 的文本生成语音服务",
+        SubTitle: "启用文本生成语音服务",
      },
      Autoplay: {
        Title: "启用自动朗读",
        SubTitle: "自动生成语音并播放，需先开启文本转语音开关",
      },
      Model: "模型",
+      Engine: "转换引擎",
      Voice: {
        Title: "声音",
        SubTitle: "生成语音时使用的声音",
--- a/app/locales/en.ts
+++ b/app/locales/en.ts
@ -405,7 +405,7 @@ const en: LocaleType = {
    TTS: {
      Enable: {
        Title: "Enable TTS",
-        SubTitle: "Enable text-to-speech service based on OpenAI",
+        SubTitle: "Enable text-to-speech service",
      },
      Autoplay: {
        Title: "Enable Autoplay",
@ -421,6 +421,7 @@ const en: LocaleType = {
        Title: "Speed",
        SubTitle: "The speed of the generated audio",
      },
+      Engine: "TTS Engine",
    },
    STT: {
      Enable: {
--- a/app/store/access.ts
+++ b/app/store/access.ts
@ -51,6 +51,9 @@ const DEFAULT_ACCESS_STATE = {
  customModels: "",
  isEnableRAG: false,
  defaultModel: "",
+
+  // tts config
+  edgeTTSVoiceName: "zh-CN-YunxiNeural",
 };

 export const useAccessStore = createPersistStore(
@ -63,6 +66,12 @@ export const useAccessStore = createPersistStore(
      return get().needCode;
    },

+    // Returns the Edge TTS voice name currently held in this store.
+    // Calls this.fetch() first, as the neighbouring accessors do; the call is
+    // not awaited here, so the value returned is whatever the store holds at
+    // that moment (presumably fetch() refreshes server config — verify).
+    edgeVoiceName() {
+      this.fetch();
+
+      return get().edgeTTSVoiceName;
+    },
+
    enableRAG() {
      this.fetch();

--- a/app/store/config.ts
+++ b/app/store/config.ts
@ -6,6 +6,8 @@ import {
  DEFAULT_SIDEBAR_WIDTH,
  DEFAULT_STT_ENGINE,
  DEFAULT_STT_ENGINES,
+  DEFAULT_TTS_ENGINE,
+  DEFAULT_TTS_ENGINES,
  DEFAULT_TTS_MODEL,
  DEFAULT_TTS_MODELS,
  DEFAULT_TTS_VOICE,
@ -17,6 +19,7 @@ import { createPersistStore } from "../utils/store";
 export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
 export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
 export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
+export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];

 export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];

@ -79,6 +82,7 @@ export const DEFAULT_CONFIG = {
  ttsConfig: {
    enable: false,
    autoplay: false,
+    engine: DEFAULT_TTS_ENGINE,
    model: DEFAULT_TTS_MODEL,
    voice: DEFAULT_TTS_VOICE,
    speed: 1.0,
@ -111,6 +115,9 @@ export function limitNumber(
 }

 export const TTSConfigValidator = {
+  // Casts the raw string to TTSEngineType; no runtime validation is performed.
+  engine(x: string) {
+    return x as TTSEngineType;
+  },
  model(x: string) {
    return x as TTSModelType;
  },
--- a/app/utils/ms_edge_tts.ts
+++ b/app/utils/ms_edge_tts.ts
@ -0,0 +1,378 @@
+import axios from "axios";
+import { Buffer } from "buffer";
+import { randomBytes } from "crypto";
+import { Readable } from "stream";
+
+// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS
+
+/**
+ * Named volume levels usable as the `volume` attribute of the SSML
+ * `<prosody>` element built by this module.
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
+ */
+export enum VOLUME {
+  SILENT = "silent",
+  X_SOFT = "x-soft",
+  SOFT = "soft",
+  MEDIUM = "medium",
+  LOUD = "loud",
+  // SSML constant values are lowercase; this was previously "x-LOUD".
+  X_LOUD = "x-loud",
+  DEFAULT = "default",
+}
+
+/**
+ * Named speaking rates usable as the `rate` attribute of the SSML
+ * `<prosody>` element built by this module.
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
+ */
+export enum RATE {
+  X_SLOW = "x-slow",
+  SLOW = "slow",
+  MEDIUM = "medium",
+  FAST = "fast",
+  X_FAST = "x-fast",
+  DEFAULT = "default",
+}
+
+/**
+ * Named pitch levels usable as the `pitch` attribute of the SSML
+ * `<prosody>` element built by this module.
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
+ */
+export enum PITCH {
+  X_LOW = "x-low",
+  LOW = "low",
+  MEDIUM = "medium",
+  HIGH = "high",
+  X_HIGH = "x-high",
+  DEFAULT = "default",
+}
+
+/**
+ * Audio output formats that can be passed to `MsEdgeTTS.setMetadata`.
+ * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
+ * The commented-out members below are formats that are not exposed by this enum.
+ */
+export enum OUTPUT_FORMAT {
+  // Streaming =============================
+  // AMR_WB_16000HZ = "amr-wb-16000hz",
+  // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
+  // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
+  // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
+  // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
+  // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
+  // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
+  AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
+  AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
+  // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
+  // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
+  // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
+  // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
+  // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
+  // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
+  // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
+  // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
+  // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
+  // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
+  // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
+  // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
+  // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
+  // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
+  // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
+  // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
+  // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
+  // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
+  WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
+  // Non-streaming =============================
+  // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
+  // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
+  // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
+  // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
+  // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
+  // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
+  // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
+}
+
+/**
+ * One entry of the voice list; the element type of the array that
+ * `MsEdgeTTS.getVoices()` resolves with.
+ * NOTE(review): field meanings follow the remote voices-list payload, which is
+ * not visible here — confirm against a live response.
+ */
+export type Voice = {
+  Name: string;
+  ShortName: string;
+  Gender: string;
+  Locale: string;
+  SuggestedCodec: string;
+  FriendlyName: string;
+  Status: string;
+};
+
+/**
+ * Pitch, rate and volume settings for the SSML `<prosody>` element.
+ * The field initialisers are the defaults; `MsEdgeTTS` spreads a fresh
+ * instance under the caller's options, so any field the caller omits keeps
+ * the default declared here.
+ */
+export class ProsodyOptions {
+  /**
+   * The pitch to use.
+   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
+   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
+   */
+  pitch?: PITCH | string = "+0Hz";
+  /**
+   * The rate to use.
+   * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
+   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
+   */
+  rate?: RATE | string | number = 1.0;
+  /**
+   * The volume to use.
+   * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%).
+   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
+   */
+  volume?: VOLUME | string | number = 100.0;
+}
+
+/**
+ * Client for the Microsoft Edge "read aloud" speech-synthesis WebSocket
+ * endpoint, adapted from https://github.com/Migushthe2nd/MsEdgeTTS.
+ *
+ * Call {@link setMetadata} once to choose the voice and output format (this
+ * also opens the socket), then {@link toStream} or {@link toArrayBuffer} for
+ * each text to synthesise. Uses the global `WebSocket` constructor.
+ */
+export class MsEdgeTTS {
+  static OUTPUT_FORMAT = OUTPUT_FORMAT;
+  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
+  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
+  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
+  // Marks the end of the header in an audio frame; audio bytes follow it.
+  private static BINARY_DELIM = "Path:audio\r\n";
+  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
+  // Captures the request id from a message header.
+  private static REQUEST_ID_REGEX = /X-RequestId:(.*?)\r\n/;
+  private readonly _enableLogger: boolean;
+  private _ws: WebSocket | undefined;
+  private _voice: string | undefined;
+  private _voiceLocale: string | undefined;
+  private _outputFormat: OUTPUT_FORMAT | undefined;
+  // One Readable per synthesis request, keyed by its X-RequestId.
+  private _streams: { [key: string]: Readable } = {};
+  private _startTime = 0;
+
+  /** Forwards its arguments to `console.log` when logging is enabled. */
+  private _log(...o: unknown[]): void {
+    if (this._enableLogger) {
+      console.log(...o);
+    }
+  }
+
+  /**
+   * Create a new `MsEdgeTTS` instance.
+   *
+   * @param enableLogger=false whether to enable the built-in logger. This logs connections inits, disconnects, and incoming data to the console
+   */
+  public constructor(enableLogger: boolean = false) {
+    this._enableLogger = enableLogger;
+  }
+
+  /** True when a socket exists and its `readyState` is `OPEN`. */
+  private _isOpen(): boolean {
+    const ws = this._ws;
+    return ws !== undefined && ws.readyState === ws.OPEN;
+  }
+
+  /**
+   * Sends `message` on the current socket, first (re)connecting up to three
+   * times while the socket is not open.
+   *
+   * @param message the raw protocol message to send
+   * @throws if a connection attempt fails or no socket could be created
+   */
+  private async _send(message: string): Promise<void> {
+    for (let attempt = 1; attempt <= 3 && !this._isOpen(); attempt++) {
+      if (attempt === 1) {
+        this._startTime = Date.now();
+      }
+      this._log("connecting: ", attempt);
+      await this._initClient();
+    }
+    const ws = this._ws;
+    if (!ws) {
+      throw new Error("WebSocket is not initialised.");
+    }
+    ws.send(message);
+  }
+
+  /** Builds the `speech.config` message sent right after the socket opens. */
+  private _speechConfigMessage(): string {
+    return `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
+                    {
+                        "context": {
+                            "synthesis": {
+                                "audio": {
+                                    "metadataoptions": {
+                                        "sentenceBoundaryEnabled": "false",
+                                        "wordBoundaryEnabled": "false"
+                                    },
+                                    "outputFormat": "${this._outputFormat}" 
+                                }
+                            }
+                        }
+                    }
+                `;
+  }
+
+  /**
+   * Opens a new WebSocket to the synthesis endpoint, stores it in `_ws`, and
+   * installs the handlers that route incoming frames to the request streams.
+   *
+   * @returns a promise that resolves once the socket is open and the
+   *   `speech.config` message has been sent; it rejects with an `Error` if the
+   *   socket reports an error or the config message cannot be sent
+   */
+  private _initClient(): Promise<void> {
+    const ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
+    ws.binaryType = "arraybuffer";
+    this._ws = ws;
+
+    return new Promise<void>((resolve, reject) => {
+      ws.onopen = () => {
+        this._log(
+          "Connected in",
+          (Date.now() - this._startTime) / 1000,
+          "seconds",
+        );
+        // This socket is open here, so send on it directly rather than via
+        // _send(), which could start another connection if `_ws` has been
+        // replaced in the meantime.
+        try {
+          ws.send(this._speechConfigMessage());
+          resolve();
+        } catch (err) {
+          reject(err instanceof Error ? err : new Error(String(err)));
+        }
+      };
+      ws.onmessage = (m: MessageEvent) => {
+        const buffer = Buffer.from(m.data as ArrayBuffer);
+        const message = buffer.toString();
+        const requestId = MsEdgeTTS.REQUEST_ID_REGEX.exec(message)?.[1];
+        if (requestId === undefined) {
+          // Without a request id the frame cannot be routed to a stream.
+          this._log("UNKNOWN MESSAGE", message);
+          return;
+        }
+        if (message.includes("Path:turn.start")) {
+          // start of turn, ignore
+        } else if (message.includes("Path:turn.end")) {
+          // end of turn, close stream (it may already have been destroyed)
+          this._streams[requestId]?.push(null);
+        } else if (message.includes("Path:response")) {
+          // context response, ignore
+        } else if (
+          message.includes("Path:audio") &&
+          m.data instanceof ArrayBuffer
+        ) {
+          this._pushAudioData(buffer, requestId);
+        } else {
+          this._log("UNKNOWN MESSAGE", message);
+        }
+      };
+      ws.onclose = () => {
+        this._log(
+          "disconnected after:",
+          (Date.now() - this._startTime) / 1000,
+          "seconds",
+        );
+        // No more data can arrive, so end every stream still registered.
+        for (const stream of Object.values(this._streams)) {
+          stream.push(null);
+        }
+      };
+      ws.onerror = (error: Event) => {
+        // Reject with an Error instead of a bare string; the text is unchanged.
+        reject(new Error("Connect Error: " + String(error)));
+      };
+    });
+  }
+
+  /**
+   * Strips the header from an audio frame and pushes the remaining bytes to
+   * the stream registered for `requestId`.
+   *
+   * @param audioBuffer the complete frame, header included
+   * @param requestId the id taken from the frame's `X-RequestId` header
+   */
+  private _pushAudioData(audioBuffer: Buffer, requestId: string): void {
+    const delimIndex = audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM);
+    if (delimIndex === -1) {
+      // No delimiter means the audio start is unknown; skip the frame rather
+      // than push bytes from an arbitrary offset.
+      this._log("audio frame without delimiter, ignored");
+      return;
+    }
+    const audioData = audioBuffer.subarray(
+      delimIndex + MsEdgeTTS.BINARY_DELIM.length,
+    );
+    // The stream is removed from the map once it has been destroyed.
+    this._streams[requestId]?.push(audioData);
+    this._log("received audio chunk, size: ", audioData.length);
+  }
+
+  /**
+   * Wraps `input` in the basic SSML document used by {@link toStream}.
+   *
+   * NOTE: `input` is inserted verbatim so that callers can pass SSML elements;
+   * plain text containing `<` or `&` must be XML-escaped by the caller.
+   *
+   * @param input the text (or SSML fragment) to speak
+   * @param options prosody overrides; omitted fields use the {@link ProsodyOptions} defaults
+   */
+  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
+    // in case future updates to the edge API block these elements, we'll be concatenating strings.
+    options = { ...new ProsodyOptions(), ...options };
+    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
+                <voice name="${this._voice}">
+                    <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
+                        ${input}
+                    </prosody> 
+                </voice>
+            </speak>`;
+  }
+
+  /**
+   * Fetch the list of voices available in Microsoft Edge.
+   * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
+   *
+   * @returns the response body of the voices-list request
+   */
+  async getVoices(): Promise<Voice[]> {
+    const res = await axios.get<Voice[]>(MsEdgeTTS.VOICES_URL);
+    return res.data;
+  }
+
+  /**
+   * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
+   * Must be called at least once before text can be synthesised.
+   * Saved in this instance. Can be called at any time to update the metadata.
+   *
+   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
+   * @param outputFormat any {@link OUTPUT_FORMAT}
+   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
+   * @throws Error if no locale is given and none can be inferred from `voiceName`
+   */
+  async setMetadata(
+    voiceName: string,
+    outputFormat: OUTPUT_FORMAT,
+    voiceLocale?: string,
+  ) {
+    // Resolve the locale before touching instance state so that a failure
+    // leaves the previous configuration intact.
+    let locale = voiceLocale;
+    if (!locale) {
+      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(voiceName);
+      if (!voiceLangMatch)
+        throw new Error("Could not infer voiceLocale from voiceName!");
+      locale = voiceLangMatch[0];
+    }
+
+    const changed =
+      this._voice !== voiceName ||
+      this._voiceLocale !== locale ||
+      this._outputFormat !== outputFormat;
+
+    this._voice = voiceName;
+    this._voiceLocale = locale;
+    this._outputFormat = outputFormat;
+
+    // create new client
+    // NOTE(review): a socket replaced here is not closed explicitly.
+    if (changed || !this._isOpen()) {
+      this._startTime = Date.now();
+      await this._initClient();
+    }
+  }
+
+  /** Throws unless {@link setMetadata} has already created a socket. */
+  private _metadataCheck() {
+    if (!this._ws)
+      throw new Error(
+        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
+      );
+  }
+
+  /**
+   * Close the WebSocket connection. Does nothing if none has been opened.
+   */
+  close() {
+    this._ws?.close();
+  }
+
+  /**
+   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
+   *
+   * @param input the text to synthesise. Can include SSML elements.
+   * @param options (optional) {@link ProsodyOptions}
+   * @returns {Readable} - a `stream.Readable` with the audio data
+   */
+  toStream(input: string, options?: ProsodyOptions): Readable {
+    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
+    return stream;
+  }
+
+  /**
+   * Synthesises `input` and collects the whole audio into one `ArrayBuffer`.
+   *
+   * @param input the text to synthesise. Can include SSML elements.
+   * @param options (optional) {@link ProsodyOptions}
+   * @returns a promise for the concatenated audio bytes; it rejects if the
+   *   underlying stream emits an error (including a failed connection)
+   */
+  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
+    return new Promise((resolve, reject) => {
+      const chunks: Uint8Array[] = [];
+      const readable = this.toStream(input, options);
+      readable.on("data", (chunk) => {
+        chunks.push(chunk);
+      });
+
+      readable.on("end", () => {
+        const merged = Buffer.concat(chunks);
+        // A Buffer can be a view onto a larger ArrayBuffer, so copy out
+        // exactly the bytes that belong to it.
+        resolve(
+          merged.buffer.slice(
+            merged.byteOffset,
+            merged.byteOffset + merged.byteLength,
+          ) as ArrayBuffer,
+        );
+      });
+
+      readable.on("error", (err) => {
+        reject(err);
+      });
+    });
+  }
+
+  /**
+   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
+   *
+   * @param requestSSML the SSML to send. SSML elements required in order to work.
+   * @returns {Readable} - a `stream.Readable` with the audio data
+   */
+  rawToStream(requestSSML: string): Readable {
+    const { stream } = this._rawSSMLRequest(requestSSML);
+    return stream;
+  }
+
+  /**
+   * Registers a new stream under a random request id and sends the SSML
+   * request for it.
+   *
+   * @param requestSSML the complete SSML document to synthesise
+   * @returns the stream that will receive the audio, and its request id
+   * @throws Error if {@link setMetadata} has not been called yet
+   */
+  private _rawSSMLRequest(requestSSML: string): {
+    stream: Readable;
+    requestId: string;
+  } {
+    this._metadataCheck();
+
+    const requestId = randomBytes(16).toString("hex");
+    const request =
+      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
+                ` + requestSSML.trim();
+    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
+    const stream = new Readable({
+      read() {},
+      destroy: (
+        error: Error | null,
+        callback: (error: Error | null) => void,
+      ) => {
+        // Unregister the stream so later frames for this id are dropped.
+        delete this._streams[requestId];
+        callback(error);
+      },
+    });
+    this._streams[requestId] = stream;
+    // Surface send/connect failures on the stream; otherwise consumers such
+    // as toArrayBuffer() would wait forever for an "end" that never comes.
+    this._send(request).catch((err: unknown) => {
+      stream.destroy(err instanceof Error ? err : new Error(String(err)));
+    });
+    return { stream, requestId };
+  }
+}
--- a/package.json
+++ b/package.json
@ -38,13 +38,14 @@
    "encoding": "^0.1.13",
    "epub2": "^3.0.2",
    "fuse.js": "^7.0.0",
-    "html-entities": "^2.4.0",
    "heic2any": "^0.0.4",
+    "html-entities": "^2.4.0",
    "html-to-image": "^1.11.11",
    "html-to-text": "^9.0.5",
    "https-proxy-agent": "^7.0.2",
    "langchain": "0.1.37",
    "mammoth": "^1.7.1",
+    "markdown-to-txt": "^2.0.1",
    "md5": "^2.3.0",
    "mermaid": "^10.6.1",
    "mime": "^4.0.1",
--- a/yarn.lock
+++ b/yarn.lock
@ -6231,11 +6231,21 @@ lodash.debounce@^4.0.8:
  resolved "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz"
  integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==

+lodash.escape@^4.0.1:
+  version "4.0.1"
+  resolved "https://registry.yarnpkg.com/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98"
+  integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==
+
 lodash.merge@^4.6.2:
  version "4.6.2"
  resolved "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz"
  integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==

+lodash.unescape@^4.0.1:
+  version "4.0.1"
+  resolved "https://registry.yarnpkg.com/lodash.unescape/-/lodash.unescape-4.0.1.tgz#bf2249886ce514cda112fae9218cdc065211fc9c"
+  integrity sha512-DhhGRshNS1aX6s5YdBE3njCCouPgnG29ebyHvImlZzXZf2SHgt+J08DHgytTPnpywNbO1Y8mNUFyQuIDBq2JZg==
+
 lodash@^4.17.21:
  version "4.17.21"
  resolved "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz"
@ -6324,6 +6334,20 @@ markdown-table@^3.0.0:
  resolved "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.3.tgz"
  integrity sha512-Z1NL3Tb1M9wH4XESsCDEksWoKTdlUafKc4pt0GRwjUyXaCFZ+dc3g2erqB6zm3szA2IUSi7VnPI+o/9jnxh9hw==

+markdown-to-txt@^2.0.1:
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/markdown-to-txt/-/markdown-to-txt-2.0.1.tgz#bfd6233a2635443cc24900a158b60c6af36ce9c5"
+  integrity sha512-Hsj7KTN8k1gutlLum3vosHwVZGnv8/cbYKWVkUyo/D1rzOYddbDesILebRfOsaVfjIBJank/AVOySBlHAYqfZw==
+  dependencies:
+    lodash.escape "^4.0.1"
+    lodash.unescape "^4.0.1"
+    marked "^4.0.14"
+
+marked@^4.0.14:
+  version "4.3.0"
+  resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3"
+  integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==
+
 md5@^2.3.0:
  version "2.3.0"
  resolved "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz"