feat: support edge tts

Hk-Gosuto 2024-05-27 21:25:25 +08:00
parent b1e91ca5cd
commit e759631ba3
14 changed files with 537 additions and 56 deletions


@@ -92,3 +92,8 @@ ANTHROPIC_URL=
### (optional)
WHITE_WEBDEV_ENDPOINTS=

# (optional)
# Default: zh-CN-YunxiNeural
# voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
EDGE_TTS_VOICE_NAME=


@@ -40,7 +40,10 @@
- 除插件工具外,与原项目保持一致 [ChatGPT-Next-Web 主要功能](https://github.com/Yidadaa/ChatGPT-Next-Web#主要功能)
- 支持 TTS (文本转语音)
  - (免费) Edge TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/266
    - 环境变量(可选):`EDGE_TTS_VOICE_NAME`
  - (收费) OpenAI TTS https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208
- 支持语音输入,需要使用 HTTPS 访问 https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208
@@ -291,6 +294,10 @@ anthropic claude Api Url.

### `DEFAULT_INPUT_TEMPLATE` (可选)

自定义默认的 template,用于初始化『设置』中的『用户输入预处理』配置项

### `EDGE_TTS_VOICE_NAME` (可选)

配置 Edge TTS 使用的语音声音,默认为 zh-CN-YunxiNeural
可访问 https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages 查看支持的参数
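例如,使用默认语音时相当于设置:`EDGE_TTS_VOICE_NAME=zh-CN-YunxiNeural`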
## 部署

### 容器部署 (推荐)


@@ -15,6 +15,7 @@ const DANGER_CONFIG = {
  customModels: serverConfig.customModels,
  isEnableRAG: serverConfig.isEnableRAG,
  defaultModel: serverConfig.defaultModel,
  edgeTTSVoiceName: serverConfig.edgeTTSVoiceName,
};

declare global {


@@ -95,6 +95,7 @@ import { useNavigate } from "react-router-dom";
import {
  CHAT_PAGE_SIZE,
  DEFAULT_STT_ENGINE,
  DEFAULT_TTS_ENGINE,
  FIREFOX_DEFAULT_STT_ENGINE,
  LAST_INPUT_KEY,
  ModelProvider,
@@ -119,6 +120,7 @@ import {
  WebTranscriptionApi,
} from "../utils/speech";
import { FileInfo } from "../client/platforms/utils";
import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";

const ttsPlayer = createTTSPlayer();

@@ -1086,12 +1088,25 @@ function _Chat() {
      const config = useAppConfig.getState();
      setSpeechLoading(true);
      ttsPlayer.init();
      let audioBuffer: ArrayBuffer;
      const { markdownToTxt } = require("markdown-to-txt");
      const textContent = markdownToTxt(text);
      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
        const edgeVoiceName = accessStore.edgeVoiceName();
        const tts = new MsEdgeTTS();
        await tts.setMetadata(
          edgeVoiceName,
          OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
        );
        audioBuffer = await tts.toArrayBuffer(textContent);
      } else {
        audioBuffer = await api.llm.speech({
          model: config.ttsConfig.model,
          input: textContent,
          voice: config.ttsConfig.voice,
          speed: config.ttsConfig.speed,
        });
      }
      setSpeechStatus(true);
      ttsPlayer
        .play(audioBuffer, () => {


@@ -2,7 +2,12 @@ import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store";
import Locale from "../locales";
import { ListItem, Select } from "./ui-lib";
import {
  DEFAULT_TTS_ENGINE,
  DEFAULT_TTS_ENGINES,
  DEFAULT_TTS_MODELS,
  DEFAULT_TTS_VOICES,
} from "../constant";
import { InputRange } from "./input-range";

export function TTSConfigList(props: {
@@ -39,66 +44,89 @@ export function TTSConfigList(props: {
          }
        ></input>
      </ListItem> */}
      <ListItem title={Locale.Settings.TTS.Engine}>
        <Select
          value={props.ttsConfig.engine}
          onChange={(e) => {
            props.updateConfig(
              (config) =>
                (config.engine = TTSConfigValidator.engine(
                  e.currentTarget.value,
                )),
            );
          }}
        >
          {DEFAULT_TTS_ENGINES.map((v, i) => (
            <option value={v} key={i}>
              {v}
            </option>
          ))}
        </Select>
      </ListItem>
      {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
        <>
          <ListItem title={Locale.Settings.TTS.Model}>
            <Select
              value={props.ttsConfig.model}
              onChange={(e) => {
                props.updateConfig(
                  (config) =>
                    (config.model = TTSConfigValidator.model(
                      e.currentTarget.value,
                    )),
                );
              }}
            >
              {DEFAULT_TTS_MODELS.map((v, i) => (
                <option value={v} key={i}>
                  {v}
                </option>
              ))}
            </Select>
          </ListItem>
          <ListItem
            title={Locale.Settings.TTS.Voice.Title}
            subTitle={Locale.Settings.TTS.Voice.SubTitle}
          >
            <Select
              value={props.ttsConfig.voice}
              onChange={(e) => {
                props.updateConfig(
                  (config) =>
                    (config.voice = TTSConfigValidator.voice(
                      e.currentTarget.value,
                    )),
                );
              }}
            >
              {DEFAULT_TTS_VOICES.map((v, i) => (
                <option value={v} key={i}>
                  {v}
                </option>
              ))}
            </Select>
          </ListItem>
          <ListItem
            title={Locale.Settings.TTS.Speed.Title}
            subTitle={Locale.Settings.TTS.Speed.SubTitle}
          >
            <InputRange
              value={props.ttsConfig.speed?.toFixed(1)}
              min="0.3"
              max="4.0"
              step="0.1"
              onChange={(e) => {
                props.updateConfig(
                  (config) =>
                    (config.speed = TTSConfigValidator.speed(
                      e.currentTarget.valueAsNumber,
                    )),
                );
              }}
            ></InputRange>
          </ListItem>
        </>
      )}
    </>
  );
}


@@ -151,5 +151,7 @@ export const getServerSideConfig = () => {
    ragChunkOverlap: process.env.RAG_CHUNK_OVERLAP ?? "200",
    ragReturnCount: process.env.RAG_RETURN_COUNT ?? "4",
    allowedWebDevEndpoints,
    edgeTTSVoiceName: process.env.EDGE_TTS_VOICE_NAME ?? "zh-CN-YunxiNeural",
  };
};


@@ -142,6 +142,8 @@ export const KnowledgeCutOffDate: Record<string, string> = {
  "gemini-pro-vision": "2023-12",
};

export const DEFAULT_TTS_ENGINE = "OpenAI-TTS";
export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"];
export const DEFAULT_TTS_MODEL = "tts-1";
export const DEFAULT_TTS_VOICE = "alloy";
export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];


@@ -399,13 +399,14 @@ const cn = {
    TTS: {
      Enable: {
        Title: "启用文本转语音",
        SubTitle: "启用文本生成语音服务",
      },
      Autoplay: {
        Title: "启用自动朗读",
        SubTitle: "自动生成语音并播放,需先开启文本转语音开关",
      },
      Model: "模型",
      Engine: "转换引擎",
      Voice: {
        Title: "声音",
        SubTitle: "生成语音时使用的声音",


@@ -405,7 +405,7 @@ const en: LocaleType = {
    TTS: {
      Enable: {
        Title: "Enable TTS",
        SubTitle: "Enable text-to-speech service",
      },
      Autoplay: {
        Title: "Enable Autoplay",
@@ -421,6 +421,7 @@ const en: LocaleType = {
        Title: "Speed",
        SubTitle: "The speed of the generated audio",
      },
      Engine: "TTS Engine",
    },
    STT: {
      Enable: {


@@ -51,6 +51,9 @@ const DEFAULT_ACCESS_STATE = {
  customModels: "",
  isEnableRAG: false,
  defaultModel: "",

  // tts config
  edgeTTSVoiceName: "zh-CN-YunxiNeural",
};

export const useAccessStore = createPersistStore(
@@ -63,6 +66,12 @@ export const useAccessStore = createPersistStore(
      return get().needCode;
    },

    edgeVoiceName() {
      this.fetch();
      return get().edgeTTSVoiceName;
    },

    enableRAG() {
      this.fetch();


@@ -6,6 +6,8 @@ import {
  DEFAULT_SIDEBAR_WIDTH,
  DEFAULT_STT_ENGINE,
  DEFAULT_STT_ENGINES,
  DEFAULT_TTS_ENGINE,
  DEFAULT_TTS_ENGINES,
  DEFAULT_TTS_MODEL,
  DEFAULT_TTS_MODELS,
  DEFAULT_TTS_VOICE,
@@ -17,6 +19,7 @@ import { createPersistStore } from "../utils/store";
export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];
export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];
@@ -79,6 +82,7 @@ export const DEFAULT_CONFIG = {
  ttsConfig: {
    enable: false,
    autoplay: false,
    engine: DEFAULT_TTS_ENGINE,
    model: DEFAULT_TTS_MODEL,
    voice: DEFAULT_TTS_VOICE,
    speed: 1.0,
@@ -111,6 +115,9 @@ export function limitNumber(
}

export const TTSConfigValidator = {
  engine(x: string) {
    return x as TTSEngineType;
  },
  model(x: string) {
    return x as TTSModelType;
  },

app/utils/ms_edge_tts.ts (new file, 378 lines)

@@ -0,0 +1,378 @@
import axios from "axios";
import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";
// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS
/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
 */
export enum VOLUME {
  SILENT = "silent",
  X_SOFT = "x-soft",
  SOFT = "soft",
  MEDIUM = "medium",
  LOUD = "loud",
  X_LOUD = "x-LOUD",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
 */
export enum RATE {
  X_SLOW = "x-slow",
  SLOW = "slow",
  MEDIUM = "medium",
  FAST = "fast",
  X_FAST = "x-fast",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
 */
export enum PITCH {
  X_LOW = "x-low",
  LOW = "low",
  MEDIUM = "medium",
  HIGH = "high",
  X_HIGH = "x-high",
  DEFAULT = "default",
}

/**
 * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
 */
export enum OUTPUT_FORMAT {
  // Streaming =============================
  // AMR_WB_16000HZ = "amr-wb-16000hz",
  // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
  // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
  // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
  // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
  // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
  // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
  AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
  AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
  // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
  // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
  // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
  // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
  // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
  // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
  // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
  // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
  // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
  // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
  // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
  // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
  // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
  // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
  WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
  // Non-streaming =============================
  // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
  // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
  // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
  // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
  // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
  // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
  // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
}

export type Voice = {
  Name: string;
  ShortName: string;
  Gender: string;
  Locale: string;
  SuggestedCodec: string;
  FriendlyName: string;
  Status: string;
};

export class ProsodyOptions {
  /**
   * The pitch to use.
   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
   */
  pitch?: PITCH | string = "+0Hz";
  /**
   * The rate to use.
   * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
   */
  rate?: RATE | string | number = 1.0;
  /**
   * The volume to use.
   * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
   */
  volume?: VOLUME | string | number = 100.0;
}
export class MsEdgeTTS {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static BINARY_DELIM = "Path:audio\r\n";
  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  private readonly _enableLogger;
  private _ws: WebSocket | undefined;
  private _voice: any;
  private _voiceLocale: any;
  private _outputFormat: any;
  private _streams: { [key: string]: Readable } = {};
  private _startTime = 0;

  private _log(...o: any[]) {
    if (this._enableLogger) {
      console.log(...o);
    }
  }
  /**
   * Create a new `MsEdgeTTS` instance.
   *
   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console
   */
  public constructor(enableLogger: boolean = false) {
    this._enableLogger = enableLogger;
  }
  private async _send(message: any) {
    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
      if (i == 1) {
        this._startTime = Date.now();
      }
      this._log("connecting: ", i);
      await this._initClient();
    }
    this._ws!.send(message);
  }

  private _initClient() {
    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
    this._ws.binaryType = "arraybuffer";
    return new Promise((resolve, reject) => {
      this._ws!.onopen = () => {
        this._log(
          "Connected in",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        this._send(
          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
          {
            "context": {
              "synthesis": {
                "audio": {
                  "metadataoptions": {
                    "sentenceBoundaryEnabled": "false",
                    "wordBoundaryEnabled": "false"
                  },
                  "outputFormat": "${this._outputFormat}"
                }
              }
            }
          }
        `,
        ).then(resolve);
      };
      this._ws!.onmessage = (m: any) => {
        const buffer = Buffer.from(m.data as ArrayBuffer);
        const message = buffer.toString();
        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
        if (message.includes("Path:turn.start")) {
          // start of turn, ignore
        } else if (message.includes("Path:turn.end")) {
          // end of turn, close stream
          this._streams[requestId].push(null);
        } else if (message.includes("Path:response")) {
          // context response, ignore
        } else if (
          message.includes("Path:audio") &&
          m.data instanceof ArrayBuffer
        ) {
          this._pushAudioData(buffer, requestId);
        } else {
          this._log("UNKNOWN MESSAGE", message);
        }
      };
      this._ws!.onclose = () => {
        this._log(
          "disconnected after:",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        for (const requestId in this._streams) {
          this._streams[requestId].push(null);
        }
      };
      this._ws!.onerror = function (error: any) {
        reject("Connect Error: " + error);
      };
    });
  }

  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    const audioStartIndex =
      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
      MsEdgeTTS.BINARY_DELIM.length;
    const audioData = audioBuffer.subarray(audioStartIndex);
    this._streams[requestId].push(audioData);
    this._log("received audio chunk, size: ", audioData?.length);
  }

  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
    // in case future updates to the edge API block these elements, we'll be concatenating strings.
    options = { ...new ProsodyOptions(), ...options };
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
        <voice name="${this._voice}">
            <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
                ${input}
            </prosody>
        </voice>
    </speak>`;
  }

  /**
   * Fetch the list of voices available in Microsoft Edge.
   * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
   */
  getVoices(): Promise<Voice[]> {
    return new Promise((resolve, reject) => {
      axios
        .get(MsEdgeTTS.VOICES_URL)
        .then((res) => resolve(res.data))
        .catch(reject);
    });
  }
  /**
   * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * Saved in this instance. Can be called at any time to update the metadata.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;
    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;
    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;
    // create new client
    if (changed || this._ws!.readyState !== this._ws!.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }

  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }

  /**
   * Close the WebSocket connection.
   */
  close() {
    this._ws!.close();
  }
  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }
  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      let data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });
      readable.on("end", () => {
        resolve(Buffer.concat(data).buffer);
      });
      readable.on("error", (err) => {
        reject(err);
      });
    });
  }

  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
   *
   * @param requestSSML the SSML to send. SSML elements required in order to work.
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }

  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();
    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
      ` + requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    this._send(request).then();
    return { stream, requestId };
  }
}
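A minimal usage sketch (not part of the commit) showing how this class is driven, mirroring the call pattern in the chat.tsx hunk earlier; the import path and the `synthesizeToMp3` helper name are illustrative, and it assumes a browser-like environment where a global `WebSocket` is available:

```ts
import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";

// Turn plain text into an MP3 ArrayBuffer using the default Edge voice.
async function synthesizeToMp3(text: string): Promise<ArrayBuffer> {
  const tts = new MsEdgeTTS();
  // setMetadata must run first: it infers the locale from the voice name
  // and opens the WebSocket connection to the Edge read-aloud endpoint.
  await tts.setMetadata(
    "zh-CN-YunxiNeural",
    OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
  );
  // Collects the streamed audio chunks and resolves with a single buffer.
  return tts.toArrayBuffer(text);
}
```

The resulting buffer can then be handed to the TTS player, as the chat.tsx hunk above does.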


@@ -38,13 +38,14 @@
    "encoding": "^0.1.13",
    "epub2": "^3.0.2",
    "fuse.js": "^7.0.0",
    "heic2any": "^0.0.4",
    "html-entities": "^2.4.0",
    "html-to-image": "^1.11.11",
    "html-to-text": "^9.0.5",
    "https-proxy-agent": "^7.0.2",
    "langchain": "0.1.37",
    "mammoth": "^1.7.1",
    "markdown-to-txt": "^2.0.1",
    "md5": "^2.3.0",
    "mermaid": "^10.6.1",
    "mime": "^4.0.1",


@@ -6231,11 +6231,21 @@ lodash.debounce@^4.0.8:
  resolved "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz"
  integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==

lodash.escape@^4.0.1:
  version "4.0.1"
  resolved "https://registry.yarnpkg.com/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98"
  integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==

lodash.merge@^4.6.2:
  version "4.6.2"
  resolved "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz"
  integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==

lodash.unescape@^4.0.1:
  version "4.0.1"
  resolved "https://registry.yarnpkg.com/lodash.unescape/-/lodash.unescape-4.0.1.tgz#bf2249886ce514cda112fae9218cdc065211fc9c"
  integrity sha512-DhhGRshNS1aX6s5YdBE3njCCouPgnG29ebyHvImlZzXZf2SHgt+J08DHgytTPnpywNbO1Y8mNUFyQuIDBq2JZg==

lodash@^4.17.21:
  version "4.17.21"
  resolved "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz"
@@ -6324,6 +6334,20 @@ markdown-table@^3.0.0:
  resolved "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.3.tgz"
  integrity sha512-Z1NL3Tb1M9wH4XESsCDEksWoKTdlUafKc4pt0GRwjUyXaCFZ+dc3g2erqB6zm3szA2IUSi7VnPI+o/9jnxh9hw==

markdown-to-txt@^2.0.1:
  version "2.0.1"
  resolved "https://registry.yarnpkg.com/markdown-to-txt/-/markdown-to-txt-2.0.1.tgz#bfd6233a2635443cc24900a158b60c6af36ce9c5"
  integrity sha512-Hsj7KTN8k1gutlLum3vosHwVZGnv8/cbYKWVkUyo/D1rzOYddbDesILebRfOsaVfjIBJank/AVOySBlHAYqfZw==
  dependencies:
    lodash.escape "^4.0.1"
    lodash.unescape "^4.0.1"
    marked "^4.0.14"

marked@^4.0.14:
  version "4.3.0"
  resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3"
  integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==

md5@^2.3.0:
  version "2.3.0"
  resolved "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz"