diff --git a/.env.template b/.env.template index da658ae47..acfc08bc4 100644 --- a/.env.template +++ b/.env.template @@ -92,3 +92,8 @@ ANTHROPIC_URL= ### (optional) WHITE_WEBDEV_ENDPOINTS= + +# (optional) +# Default: zh-CN-YunxiNeural +# voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices) +EDGE_TTS_VOICE_NAME= \ No newline at end of file diff --git a/README.md b/README.md index 20eb05d46..0364803ec 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,10 @@ - 除插件工具外,与原项目保持一致 [ChatGPT-Next-Web 主要功能](https://github.com/Yidadaa/ChatGPT-Next-Web#主要功能) -- 支持 OpenAI TTS(文本转语音)https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208 +- 支持 TTS (文本转语音) + - (免费) Edge TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/266 + - 环境变量(可选):`EDGE_TTS_VOICE_NAME` + - (收费) OpenAI TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208 - 支持语音输入,需要使用 HTTPS 访问 https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208 @@ -291,6 +294,10 @@ anthropic claude Api Url. 
### `DEFAULT_INPUT_TEMPLATE` (可选) 自定义默认的 template,用于初始化『设置』中的『用户输入预处理』配置项 +### `EDGE_TTS_VOICE_NAME` (可选) +配置 Edge TTS 使用的语音声音,默认为:zh-CN-YunxiNeural +可访问 https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages 查看支持的参数 + ## 部署 ### 容器部署 (推荐) diff --git a/app/api/config/route.ts b/app/api/config/route.ts index 59c59fae4..0cc049d45 100644 --- a/app/api/config/route.ts +++ b/app/api/config/route.ts @@ -15,6 +15,7 @@ const DANGER_CONFIG = { customModels: serverConfig.customModels, isEnableRAG: serverConfig.isEnableRAG, defaultModel: serverConfig.defaultModel, + edgeTTSVoiceName: serverConfig.edgeTTSVoiceName, }; declare global { diff --git a/app/components/chat.tsx b/app/components/chat.tsx index d61df2f88..9db721600 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -95,6 +95,7 @@ import { useNavigate } from "react-router-dom"; import { CHAT_PAGE_SIZE, DEFAULT_STT_ENGINE, + DEFAULT_TTS_ENGINE, FIREFOX_DEFAULT_STT_ENGINE, LAST_INPUT_KEY, ModelProvider, @@ -119,6 +120,7 @@ import { WebTranscriptionApi, } from "../utils/speech"; import { FileInfo } from "../client/platforms/utils"; +import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; const ttsPlayer = createTTSPlayer(); @@ -1086,12 +1088,25 @@ function _Chat() { const config = useAppConfig.getState(); setSpeechLoading(true); ttsPlayer.init(); - const audioBuffer = await api.llm.speech({ - model: config.ttsConfig.model, - input: text, - voice: config.ttsConfig.voice, - speed: config.ttsConfig.speed, - }); + let audioBuffer: ArrayBuffer; + const { markdownToTxt } = require("markdown-to-txt"); + const textContent = markdownToTxt(text); + if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) { + const edgeVoiceName = accessStore.edgeVoiceName(); + const tts = new MsEdgeTTS(); + await tts.setMetadata( + edgeVoiceName, + OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3, + ); + audioBuffer = await tts.toArrayBuffer(textContent); + } else { + 
audioBuffer = await api.llm.speech({ + model: config.ttsConfig.model, + input: textContent, + voice: config.ttsConfig.voice, + speed: config.ttsConfig.speed, + }); + } setSpeechStatus(true); ttsPlayer .play(audioBuffer, () => { diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx index 2fff433c6..f86e3bc52 100644 --- a/app/components/tts-config.tsx +++ b/app/components/tts-config.tsx @@ -2,7 +2,12 @@ import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store"; import Locale from "../locales"; import { ListItem, Select } from "./ui-lib"; -import { DEFAULT_TTS_MODELS, DEFAULT_TTS_VOICES } from "../constant"; +import { + DEFAULT_TTS_ENGINE, + DEFAULT_TTS_ENGINES, + DEFAULT_TTS_MODELS, + DEFAULT_TTS_VOICES, +} from "../constant"; import { InputRange } from "./input-range"; export function TTSConfigList(props: { @@ -39,66 +44,89 @@ export function TTSConfigList(props: { } > */} - + - - - - - { - props.updateConfig( - (config) => - (config.speed = TTSConfigValidator.speed( - e.currentTarget.valueAsNumber, - )), - ); - }} - > - + {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && ( + <> + + + + + + + + { + props.updateConfig( + (config) => + (config.speed = TTSConfigValidator.speed( + e.currentTarget.valueAsNumber, + )), + ); + }} + > + + + )} ); } diff --git a/app/config/server.ts b/app/config/server.ts index 86ecb3c5c..6ae0fcdef 100644 --- a/app/config/server.ts +++ b/app/config/server.ts @@ -151,5 +151,7 @@ export const getServerSideConfig = () => { ragChunkOverlap: process.env.RAG_CHUNK_OVERLAP ?? "200", ragReturnCount: process.env.RAG_RETURN_COUNT ?? "4", allowedWebDevEndpoints, + + edgeTTSVoiceName: process.env.EDGE_TTS_VOICE_NAME ?? 
"zh-CN-YunxiNeural", }; }; diff --git a/app/constant.ts b/app/constant.ts index 5f717a73f..827def8c5 100644 --- a/app/constant.ts +++ b/app/constant.ts @@ -142,6 +142,8 @@ export const KnowledgeCutOffDate: Record = { "gemini-pro-vision": "2023-12", }; +export const DEFAULT_TTS_ENGINE = "OpenAI-TTS"; +export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"]; export const DEFAULT_TTS_MODEL = "tts-1"; export const DEFAULT_TTS_VOICE = "alloy"; export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"]; diff --git a/app/locales/cn.ts b/app/locales/cn.ts index 9064df08d..633c3f25f 100644 --- a/app/locales/cn.ts +++ b/app/locales/cn.ts @@ -399,13 +399,14 @@ const cn = { TTS: { Enable: { Title: "启用文本转语音", - SubTitle: "启用基于 OpenAI 的文本生成语音服务", + SubTitle: "启用文本生成语音服务", }, Autoplay: { Title: "启用自动朗读", SubTitle: "自动生成语音并播放,需先开启文本转语音开关", }, Model: "模型", + Engine: "转换引擎", Voice: { Title: "声音", SubTitle: "生成语音时使用的声音", diff --git a/app/locales/en.ts b/app/locales/en.ts index 74f584cc9..4a1a335ba 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -405,7 +405,7 @@ const en: LocaleType = { TTS: { Enable: { Title: "Enable TTS", - SubTitle: "Enable text-to-speech service based on OpenAI", + SubTitle: "Enable text-to-speech service", }, Autoplay: { Title: "Enable Autoplay", @@ -421,6 +421,7 @@ const en: LocaleType = { Title: "Speed", SubTitle: "The speed of the generated audio", }, + Engine: "TTS Engine", }, STT: { Enable: { diff --git a/app/store/access.ts b/app/store/access.ts index 2f74a82e0..eac9078d5 100644 --- a/app/store/access.ts +++ b/app/store/access.ts @@ -51,6 +51,9 @@ const DEFAULT_ACCESS_STATE = { customModels: "", isEnableRAG: false, defaultModel: "", + + // tts config + edgeTTSVoiceName: "zh-CN-YunxiNeural", }; export const useAccessStore = createPersistStore( @@ -63,6 +66,12 @@ export const useAccessStore = createPersistStore( return get().needCode; }, + edgeVoiceName() { + this.fetch(); + + return get().edgeTTSVoiceName; + }, + enableRAG() { this.fetch(); diff 
--git a/app/store/config.ts b/app/store/config.ts index b1468f3db..666300952 100644 --- a/app/store/config.ts +++ b/app/store/config.ts @@ -6,6 +6,8 @@ import { DEFAULT_SIDEBAR_WIDTH, DEFAULT_STT_ENGINE, DEFAULT_STT_ENGINES, + DEFAULT_TTS_ENGINE, + DEFAULT_TTS_ENGINES, DEFAULT_TTS_MODEL, DEFAULT_TTS_MODELS, DEFAULT_TTS_VOICE, @@ -17,6 +19,7 @@ import { createPersistStore } from "../utils/store"; export type ModelType = (typeof DEFAULT_MODELS)[number]["name"]; export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number]; export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number]; +export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number]; export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number]; @@ -79,6 +82,7 @@ export const DEFAULT_CONFIG = { ttsConfig: { enable: false, autoplay: false, + engine: DEFAULT_TTS_ENGINE, model: DEFAULT_TTS_MODEL, voice: DEFAULT_TTS_VOICE, speed: 1.0, @@ -111,6 +115,9 @@ export function limitNumber( } export const TTSConfigValidator = { + engine(x: string) { + return x as TTSEngineType; + }, model(x: string) { return x as TTSModelType; }, diff --git a/app/utils/ms_edge_tts.ts b/app/utils/ms_edge_tts.ts new file mode 100644 index 000000000..2333131b0 --- /dev/null +++ b/app/utils/ms_edge_tts.ts @@ -0,0 +1,378 @@ +import axios from "axios"; +import { Buffer } from "buffer"; +import { randomBytes } from "crypto"; +import { Readable } from "stream"; + +// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume + */ +export enum VOLUME { + SILENT = "silent", + X_SOFT = "x-soft", + SOFT = "soft", + MEDIUM = "medium", + LOUD = "loud", + X_LOUD = "x-loud", + DEFAULT = "default", +} + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking + */ +export enum RATE
{ + X_SLOW = "x-slow", + SLOW = "slow", + MEDIUM = "medium", + FAST = "fast", + X_FAST = "x-fast", + DEFAULT = "default", +} + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline + */ +export enum PITCH { + X_LOW = "x-low", + LOW = "low", + MEDIUM = "medium", + HIGH = "high", + X_HIGH = "x-high", + DEFAULT = "default", +} + +/** + * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted. + */ +export enum OUTPUT_FORMAT { + // Streaming ============================= + // AMR_WB_16000HZ = "amr-wb-16000hz", + // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus", + // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3", + // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3", + // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3", + // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus", + // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus", + AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3", + AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3", + // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3", + // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3", + // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3", + // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus", + // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus", + // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus", + // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw", + // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw", + // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm", + // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm", + // 
RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk", + // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm", + // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm", + // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk", + // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm", + // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm", + // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus", + // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus", + WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus", + // Non-streaming ============================= + // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw", + // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw", + // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm", + // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm", + // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm", + // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm", + // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm", +} + +export type Voice = { + Name: string; + ShortName: string; + Gender: string; + Locale: string; + SuggestedCodec: string; + FriendlyName: string; + Status: string; +}; + +export class ProsodyOptions { + /** + * The pitch to use. + * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%). + * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline) + */ + pitch?: PITCH | string = "+0Hz"; + /** + * The rate to use. + * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%). 
+ * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking) + */ + rate?: RATE | string | number = 1.0; + /** + * The volume to use. + * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%). + * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume) + */ + volume?: VOLUME | string | number = 100.0; +} + +export class MsEdgeTTS { + static OUTPUT_FORMAT = OUTPUT_FORMAT; + private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"; + private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; + private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; + private static BINARY_DELIM = "Path:audio\r\n"; + private static VOICE_LANG_REGEX = /\w{2}-\w{2}/; + private readonly _enableLogger; + private _ws: WebSocket | undefined; + private _voice: any; + private _voiceLocale: any; + private _outputFormat: any; + private _streams: { [key: string]: Readable } = {}; + private _startTime = 0; + + private _log(...o: any[]) { + if (this._enableLogger) { + console.log(...o); + } + } + + /** + * Create a new `MsEdgeTTS` instance. + * + * @param agent (optional, **NOT SUPPORTED IN BROWSER**) Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent). + * @param enableLogger=false whether to enable the built-in logger. 
This logs connections inits, disconnects, and incoming data to the console + */ + public constructor(enableLogger: boolean = false) { + this._enableLogger = enableLogger; + } + + private async _send(message: any) { + for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) { + if (i == 1) { + this._startTime = Date.now(); + } + this._log("connecting: ", i); + await this._initClient(); + } + this._ws!.send(message); + } + + private _initClient() { + this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL); + + this._ws.binaryType = "arraybuffer"; + return new Promise((resolve, reject) => { + this._ws!.onopen = () => { + this._log( + "Connected in", + (Date.now() - this._startTime) / 1000, + "seconds", + ); + this._send( + `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n + { + "context": { + "synthesis": { + "audio": { + "metadataoptions": { + "sentenceBoundaryEnabled": "false", + "wordBoundaryEnabled": "false" + }, + "outputFormat": "${this._outputFormat}" + } + } + } + } + `, + ).then(resolve); + }; + this._ws!.onmessage = (m: any) => { + const buffer = Buffer.from(m.data as ArrayBuffer); + const message = buffer.toString(); + const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1]; + if (message.includes("Path:turn.start")) { + // start of turn, ignore + } else if (message.includes("Path:turn.end")) { + // end of turn, close stream + this._streams[requestId].push(null); + } else if (message.includes("Path:response")) { + // context response, ignore + } else if ( + message.includes("Path:audio") && + m.data instanceof ArrayBuffer + ) { + this._pushAudioData(buffer, requestId); + } else { + this._log("UNKNOWN MESSAGE", message); + } + }; + this._ws!.onclose = () => { + this._log( + "disconnected after:", + (Date.now() - this._startTime) / 1000, + "seconds", + ); + for (const requestId in this._streams) { + this._streams[requestId].push(null); + } + }; + this._ws!.onerror = function (error: any) { + reject("Connect Error: " + 
error); + }; + }); + } + + private _pushAudioData(audioBuffer: Buffer, requestId: string) { + const audioStartIndex = + audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) + + MsEdgeTTS.BINARY_DELIM.length; + const audioData = audioBuffer.subarray(audioStartIndex); + this._streams[requestId].push(audioData); + this._log("received audio chunk, size: ", audioData?.length); + } + + private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string { + // in case future updates to the edge API block these elements, we'll be concatenating strings. + options = { ...new ProsodyOptions(), ...options }; + return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}"> + <voice name="${this._voice}"> + <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}"> + ${input} + </prosody> + </voice> + </speak>`; + } + + /** + * Fetch the list of voices available in Microsoft Edge. + * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview). + */ + getVoices(): Promise<Voice[]> { + return new Promise((resolve, reject) => { + axios + .get(MsEdgeTTS.VOICES_URL) + .then((res) => resolve(res.data)) + .catch(reject); + }); + } + + /** + * Sets the required information for the speech to be synthesised and inits a new WebSocket connection. + * Must be called at least once before text can be synthesised. + * Saved in this instance. Can be called at any time to update the metadata. + * + * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices) + * @param outputFormat any {@link OUTPUT_FORMAT} + * @param voiceLocale (optional) any voice locale that is supported by the voice.
See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName` + */ + async setMetadata( + voiceName: string, + outputFormat: OUTPUT_FORMAT, + voiceLocale?: string, + ) { + const oldVoice = this._voice; + const oldVoiceLocale = this._voiceLocale; + const oldOutputFormat = this._outputFormat; + + this._voice = voiceName; + this._voiceLocale = voiceLocale; + if (!this._voiceLocale) { + const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice); + if (!voiceLangMatch) + throw new Error("Could not infer voiceLocale from voiceName!"); + this._voiceLocale = voiceLangMatch[0]; + } + this._outputFormat = outputFormat; + + const changed = + oldVoice !== this._voice || + oldVoiceLocale !== this._voiceLocale || + oldOutputFormat !== this._outputFormat; + + // create new client + if (changed || this._ws!.readyState !== this._ws!.OPEN) { + this._startTime = Date.now(); + await this._initClient(); + } + } + + private _metadataCheck() { + if (!this._ws) + throw new Error( + "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.", + ); + } + + /** + * Close the WebSocket connection. + */ + close() { + this._ws!.close(); + } + + /** + * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SML template}. + * + * @param input the text to synthesise. Can include SSML elements. 
+ * @param options (optional) {@link ProsodyOptions} + * @returns {Readable} - a `stream.Readable` with the audio data + */ + toStream(input: string, options?: ProsodyOptions): Readable { + const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options)); + return stream; + } + + toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> { + return new Promise((resolve, reject) => { + let data: Uint8Array[] = []; + const readable = this.toStream(input, options); + readable.on("data", (chunk) => { + data.push(chunk); + }); + + readable.on("end", () => { + resolve(new Uint8Array(Buffer.concat(data)).buffer); // copy: a pooled Buffer's backing .buffer may be larger than its contents + }); + + readable.on("error", (err) => { + reject(err); + }); + }); + } + + /** + * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request. + * + * @param requestSSML the SSML to send. SSML elements required in order to work. + * @returns {Readable} - a `stream.Readable` with the audio data + */ + rawToStream(requestSSML: string): Readable { + const { stream } = this._rawSSMLRequest(requestSSML); + return stream; + } + + private _rawSSMLRequest(requestSSML: string): { + stream: Readable; + requestId: string; + } { + this._metadataCheck(); + + const requestId = randomBytes(16).toString("hex"); + const request = + `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n + ` + requestSSML.trim(); + // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup + const self = this; + const stream = new Readable({ + read() {}, + destroy(error: Error | null, callback: (error: Error | null) => void) { + delete self._streams[requestId]; + callback(error); + }, + }); + this._streams[requestId] = stream; + this._send(request).then(); + return { stream, requestId }; + } +} diff --git a/package.json b/package.json index f65385450..9e27aa6c6 100644 --- a/package.json +++ b/package.json @@ -38,13 +38,14 @@ "encoding":
"^0.1.13", "epub2": "^3.0.2", "fuse.js": "^7.0.0", - "html-entities": "^2.4.0", "heic2any": "^0.0.4", + "html-entities": "^2.4.0", "html-to-image": "^1.11.11", "html-to-text": "^9.0.5", "https-proxy-agent": "^7.0.2", "langchain": "0.1.37", "mammoth": "^1.7.1", + "markdown-to-txt": "^2.0.1", "md5": "^2.3.0", "mermaid": "^10.6.1", "mime": "^4.0.1", diff --git a/yarn.lock b/yarn.lock index 5c5210232..a09347ec9 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6231,11 +6231,21 @@ lodash.debounce@^4.0.8: resolved "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz" integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow== +lodash.escape@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98" + integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw== + lodash.merge@^4.6.2: version "4.6.2" resolved "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz" integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ== +lodash.unescape@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/lodash.unescape/-/lodash.unescape-4.0.1.tgz#bf2249886ce514cda112fae9218cdc065211fc9c" + integrity sha512-DhhGRshNS1aX6s5YdBE3njCCouPgnG29ebyHvImlZzXZf2SHgt+J08DHgytTPnpywNbO1Y8mNUFyQuIDBq2JZg== + lodash@^4.17.21: version "4.17.21" resolved "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz" @@ -6324,6 +6334,20 @@ markdown-table@^3.0.0: resolved "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.3.tgz" integrity sha512-Z1NL3Tb1M9wH4XESsCDEksWoKTdlUafKc4pt0GRwjUyXaCFZ+dc3g2erqB6zm3szA2IUSi7VnPI+o/9jnxh9hw== +markdown-to-txt@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/markdown-to-txt/-/markdown-to-txt-2.0.1.tgz#bfd6233a2635443cc24900a158b60c6af36ce9c5" + integrity 
sha512-Hsj7KTN8k1gutlLum3vosHwVZGnv8/cbYKWVkUyo/D1rzOYddbDesILebRfOsaVfjIBJank/AVOySBlHAYqfZw== + dependencies: + lodash.escape "^4.0.1" + lodash.unescape "^4.0.1" + marked "^4.0.14" + +marked@^4.0.14: + version "4.3.0" + resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3" + integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A== + md5@^2.3.0: version "2.3.0" resolved "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz"