diff --git a/README.md b/README.md
index 70e4b7418..230184a59 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,8 @@

 - 除插件工具外,与原项目保持一致 [ChatGPT-Next-Web 主要功能](https://github.com/Yidadaa/ChatGPT-Next-Web#主要功能)

+- 支持 OpenAI TTS(文本转语音)https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208
+
 - 支持 GPT-4V(视觉) 模型

   - 需要配置对象存储服务,请参考 [对象存储服务配置指南](./docs/s3-oss.md) 配置

diff --git a/app/client/api.ts b/app/client/api.ts
index 19f4aa53f..84be55dbf 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -13,6 +13,7 @@ export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];

 export const Models = ["gpt-3.5-turbo", "gpt-4"] as const;
+export const TTSModels = ["tts-1", "tts-1-hd"] as const;
 export type ChatModel = ModelType;

 export interface RequestMessage {
@@ -36,6 +37,15 @@ export interface LLMAgentConfig {
   useTools?: (string | undefined)[];
 }

+export interface SpeechOptions {
+  model: string;
+  input: string;
+  voice: string;
+  response_format?: string;
+  speed?: number;
+  onController?: (controller: AbortController) => void;
+}
+
 export interface ChatOptions {
   messages: RequestMessage[];
   config: LLMConfig;
@@ -76,6 +86,7 @@ export interface LLMModelProvider {

 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
   abstract toolAgentChat(options: AgentChatOptions): Promise<void>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;

diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts
index c19c7ece5..a0fe79ab8 100644
--- a/app/client/platforms/google.ts
+++ b/app/client/platforms/google.ts
@@ -11,6 +11,7 @@ import {
   LLMApi,
   LLMModel,
   LLMUsage,
+  SpeechOptions,
 } from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import axios from "axios";
@@ -22,6 +23,9 @@ const getImageBase64Data = async (url: string) => {
 };

 export class GeminiProApi implements LLMApi {
+  speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    throw new Error("Method not implemented.");
+  }
   toolAgentChat(options: AgentChatOptions): Promise<void> {
     throw new Error("Method not implemented.");
   }
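Editor's note: the `SpeechOptions` interface added to `app/client/api.ts` above mirrors the JSON body of OpenAI's `POST /v1/audio/speech` endpoint (`model`, `input`, `voice`, optional `response_format` and `speed`); only `onController` is client-side plumbing for cancellation. For reference, a minimal sketch of the equivalent raw call outside the app's `LLMApi` plumbing — the `OPENAI_API_KEY` variable and Node 18+ global `fetch` are assumptions, not part of this patch:

```ts
// Sketch: calling the speech endpoint directly. "tts-1" and "alloy"
// match the defaults this patch adds to app/constant.ts.
async function rawSpeech(input: string): Promise<ArrayBuffer> {
  const res = await fetch("https://api.openai.com/v1/audio/speech", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: "tts-1",
      input,
      voice: "alloy",
      response_format: "mp3", // the default; opus, aac, and flac are also accepted
      speed: 1.0, // valid range is 0.25–4.0
    }),
  });
  if (!res.ok) throw new Error(`TTS request failed: ${res.status}`);
  return res.arrayBuffer(); // the response is binary audio, not JSON
}
```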
""; } + async speech(options: SpeechOptions): Promise { + const requestPayload = { + model: options.model, + input: options.input, + voice: options.voice, + response_format: options.response_format, + speed: options.speed, + }; + + console.log("[Request] openai speech payload: ", requestPayload); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const speechPath = this.path(OpenaiPath.SpeechPath, options.model); + const speechPayload = { + method: "POST", + body: JSON.stringify(requestPayload), + signal: controller.signal, + headers: getHeaders(), + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + + const res = await fetch(speechPath, speechPayload); + clearTimeout(requestTimeoutId); + return await res.arrayBuffer(); + } catch (e) { + console.log("[Request] failed to make a speech request", e); + throw e; + } + } + async chat(options: ChatOptions) { const messages: any[] = []; diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 5ff0cd015..5ce13de93 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -14,6 +14,8 @@ import RenameIcon from "../icons/rename.svg"; import ExportIcon from "../icons/share.svg"; import ReturnIcon from "../icons/return.svg"; import CopyIcon from "../icons/copy.svg"; +import SpeakIcon from "../icons/speak.svg"; +import SpeakStopIcon from "../icons/speak-stop.svg"; import LoadingIcon from "../icons/three-dots.svg"; import PromptIcon from "../icons/prompt.svg"; import MaskIcon from "../icons/mask.svg"; @@ -83,6 +85,7 @@ import { CHAT_PAGE_SIZE, LAST_INPUT_IMAGE_KEY, LAST_INPUT_KEY, + ModelProvider, Path, REQUEST_TIMEOUT_MS, UNFINISHED_INPUT, @@ -97,6 +100,9 @@ import { getClientConfig } from "../config/client"; import { useAllModels } from "../utils/hooks"; import Image from "next/image"; import { ClientApi } from "../client/api"; +import { createTTSPlayer } from "../utils/audio"; + +const ttsPlayer = createTTSPlayer(); const Markdown = dynamic(async () => (await import("./markdown")).Markdown, { loading: () => , @@ -1008,6 +1014,37 @@ function _Chat() { }); }; + const [speechStatus, setSpeechStatus] = useState(false); + const [speechLoading, setSpeechLoading] = useState(false); + async function openaiSpeech(text: string) { + if (speechStatus) { + ttsPlayer.stop(); + setSpeechStatus(false); + } else { + var api: ClientApi; + api = new ClientApi(ModelProvider.GPT); + const config = useAppConfig.getState(); + setSpeechLoading(true); + const audioBuffer = await api.llm.speech({ + model: config.ttsConfig.model, + input: text, + voice: config.ttsConfig.voice, + speed: config.ttsConfig.speed, + }); + setSpeechStatus(true); + ttsPlayer + .play(audioBuffer, () => { + setSpeechStatus(false); + }) + .catch((e) => { + console.error("[OpenAI Speech]", e); + showToast(prettyObject(e)); + setSpeechStatus(false); + }) + .finally(() => setSpeechLoading(false)); + } + } + const context: RenderMessage[] = useMemo(() => { return session.mask.hideContext ? 
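The method resolves to raw audio bytes and hands its `AbortController` to the caller through `onController`, so an in-flight request can be cancelled from the UI. A usage sketch — the `ChatGPTApi` instance and the surrounding wiring are assumed, not shown in this patch:

```ts
// Sketch: requesting speech while keeping a handle for cancellation.
let speechController: AbortController | null = null;

async function speakOnce(api: ChatGPTApi, text: string): Promise<ArrayBuffer> {
  return api.speech({
    model: "tts-1",
    input: text,
    voice: "alloy",
    onController: (c) => (speechController = c), // stash for a cancel button
  });
}

// Elsewhere, e.g. a "stop" button handler:
function cancelSpeech() {
  speechController?.abort(); // rejects the pending fetch with an AbortError
}
```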
diff --git a/app/components/chat.tsx b/app/components/chat.tsx
index 5ff0cd015..5ce13de93 100644
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -14,6 +14,8 @@ import RenameIcon from "../icons/rename.svg";
 import ExportIcon from "../icons/share.svg";
 import ReturnIcon from "../icons/return.svg";
 import CopyIcon from "../icons/copy.svg";
+import SpeakIcon from "../icons/speak.svg";
+import SpeakStopIcon from "../icons/speak-stop.svg";
 import LoadingIcon from "../icons/three-dots.svg";
 import PromptIcon from "../icons/prompt.svg";
 import MaskIcon from "../icons/mask.svg";
@@ -83,6 +85,7 @@ import {
   CHAT_PAGE_SIZE,
   LAST_INPUT_IMAGE_KEY,
   LAST_INPUT_KEY,
+  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   UNFINISHED_INPUT,
@@ -97,6 +100,9 @@ import { getClientConfig } from "../config/client";
 import { useAllModels } from "../utils/hooks";
 import Image from "next/image";
 import { ClientApi } from "../client/api";
+import { createTTSPlayer } from "../utils/audio";
+
+const ttsPlayer = createTTSPlayer();

 const Markdown = dynamic(async () => (await import("./markdown")).Markdown, {
   loading: () => <LoadingIcon />,
@@ -1008,6 +1014,37 @@ function _Chat() {
     });
   };

+  const [speechStatus, setSpeechStatus] = useState(false);
+  const [speechLoading, setSpeechLoading] = useState(false);
+  async function openaiSpeech(text: string) {
+    if (speechStatus) {
+      ttsPlayer.stop();
+      setSpeechStatus(false);
+    } else {
+      const api = new ClientApi(ModelProvider.GPT);
+      const config = useAppConfig.getState();
+      setSpeechLoading(true);
+      try {
+        const audioBuffer = await api.llm.speech({
+          model: config.ttsConfig.model,
+          input: text,
+          voice: config.ttsConfig.voice,
+          speed: config.ttsConfig.speed,
+        });
+        setSpeechStatus(true);
+        await ttsPlayer.play(audioBuffer, () => {
+          setSpeechStatus(false);
+        });
+      } catch (e) {
+        // also resets state when the speech request itself fails,
+        // so the loading spinner cannot get stuck
+        console.error("[OpenAI Speech]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      } finally {
+        setSpeechLoading(false);
+      }
+    }
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);
@@ -1361,6 +1398,24 @@ function _Chat() {
                           icon={<CopyIcon />}
                           onClick={() => copyToClipboard(message.content)}
                         />
+                        {config.ttsConfig.enable && (
+                          <ChatAction
+                            text={
+                              speechStatus
+                                ? Locale.Chat.Actions.StopSpeech
+                                : Locale.Chat.Actions.Speech
+                            }
+                            loading={speechLoading}
+                            icon={
+                              speechStatus ? (
+                                <SpeakStopIcon />
+                              ) : (
+                                <SpeakIcon />
+                              )
+                            }
+                            onClick={() => openaiSpeech(message.content)}
+                          />
+                        )}
                       )}

diff --git a/app/components/settings.tsx b/app/components/settings.tsx
index 69c5556d2..1ffe618d8 100644
--- a/app/components/settings.tsx
+++ b/app/components/settings.tsx
@@ -72,6 +72,7 @@ import { nanoid } from "nanoid";
 import { PluginConfigList } from "./plugin-config";
 import { useMaskStore } from "../store/mask";
 import { ProviderType } from "../utils/cloud";
+import { TTSConfigList } from "./tts-config";

 function EditPromptModal(props: { id: string; onClose: () => void }) {
   const promptStore = usePromptStore();
@@ -1198,6 +1199,17 @@ export function Settings() {
           />
         </List>

+        <List>
+          <TTSConfigList
+            ttsConfig={config.ttsConfig}
+            updateConfig={(updater) => {
+              const ttsConfig = { ...config.ttsConfig };
+              updater(ttsConfig);
+              config.update((config) => (config.ttsConfig = ttsConfig));
+            }}
+          />
+        </List>
+

diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx
new file mode 100644
index 000000000..2fff433c6
--- /dev/null
+++ b/app/components/tts-config.tsx
@@ -0,0 +1,104 @@
+import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store";
+
+import Locale from "../locales";
+import { ListItem, Select } from "./ui-lib";
+import { DEFAULT_TTS_MODELS, DEFAULT_TTS_VOICES } from "../constant";
+import { InputRange } from "./input-range";
+
+export function TTSConfigList(props: {
+  ttsConfig: TTSConfig;
+  updateConfig: (updater: (config: TTSConfig) => void) => void;
+}) {
+  return (
+    <>
+      <ListItem
+        title={Locale.Settings.TTS.Enable.Title}
+        subTitle={Locale.Settings.TTS.Enable.SubTitle}
+      >
+        <input
+          type="checkbox"
+          checked={props.ttsConfig.enable}
+          onChange={(e) =>
+            props.updateConfig(
+              (config) => (config.enable = e.currentTarget.checked),
+            )
+          }
+        ></input>
+      </ListItem>
+      {/* <ListItem
+        title={Locale.Settings.TTS.Autoplay.Title}
+        subTitle={Locale.Settings.TTS.Autoplay.SubTitle}
+      >
+        <input
+          type="checkbox"
+          checked={props.ttsConfig.autoplay}
+          onChange={(e) =>
+            props.updateConfig(
+              (config) => (config.autoplay = e.currentTarget.checked),
+            )
+          }
+        ></input>
+      </ListItem> */}
+      <ListItem title={Locale.Settings.TTS.Model}>
+        <Select
+          value={props.ttsConfig.model}
+          onChange={(e) => {
+            props.updateConfig(
+              (config) =>
+                (config.model = TTSConfigValidator.model(
+                  e.currentTarget.value,
+                )),
+            );
+          }}
+        >
+          {DEFAULT_TTS_MODELS.map((v, i) => (
+            <option value={v} key={i}>
+              {v}
+            </option>
+          ))}
+        </Select>
+      </ListItem>
+      <ListItem
+        title={Locale.Settings.TTS.Voice.Title}
+        subTitle={Locale.Settings.TTS.Voice.SubTitle}
+      >
+        <Select
+          value={props.ttsConfig.voice}
+          onChange={(e) => {
+            props.updateConfig(
+              (config) =>
+                (config.voice = TTSConfigValidator.voice(
+                  e.currentTarget.value,
+                )),
+            );
+          }}
+        >
+          {DEFAULT_TTS_VOICES.map((v, i) => (
+            <option value={v} key={i}>
+              {v}
+            </option>
+          ))}
+        </Select>
+      </ListItem>
+      <ListItem
+        title={Locale.Settings.TTS.Speed.Title}
+        subTitle={Locale.Settings.TTS.Speed.SubTitle}
+      >
+        <InputRange
+          value={props.ttsConfig.speed?.toFixed(1)}
+          min="0.25"
+          max="4.0"
+          step="0.1"
+          onChange={(e) => {
+            props.updateConfig(
+              (config) =>
+                (config.speed = TTSConfigValidator.speed(
+                  e.currentTarget.valueAsNumber,
+                )),
+            );
+          }}
+        ></InputRange>
+      </ListItem>
+    </>
+  );
+}
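The settings panel above threads every change through a clone-mutate-commit pattern: `TTSConfigList` receives a shallow copy via `updateConfig`, mutates only the copy, and the copy is written back to the store in a single `config.update` call. A reduced sketch of the same pattern with the store shape assumed for illustration:

```ts
// Sketch: the clone–mutate–commit update pattern used by TTSConfigList above.
type TTSConfig = { enable: boolean; model: string; voice: string; speed: number };

function makeUpdateConfig(
  read: () => TTSConfig,
  write: (next: TTSConfig) => void,
) {
  return (updater: (config: TTSConfig) => void) => {
    const next = { ...read() }; // shallow copy: the store object is never mutated in place
    updater(next); // the child component mutates only the copy
    write(next); // one commit per change keeps re-renders predictable
  };
}
```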
diff --git a/app/components/tts.module.scss b/app/components/tts.module.scss
new file mode 100644
index 000000000..ba9f382e4
--- /dev/null
+++ b/app/components/tts.module.scss
@@ -0,0 +1,119 @@
+@import "../styles/animation.scss";
+.plugin-page {
+  height: 100%;
+  display: flex;
+  flex-direction: column;
+
+  .plugin-page-body {
+    padding: 20px;
+    overflow-y: auto;
+
+    .plugin-filter {
+      width: 100%;
+      max-width: 100%;
+      margin-bottom: 20px;
+      animation: slide-in ease 0.3s;
+      height: 40px;
+
+      display: flex;
+
+      .search-bar {
+        flex-grow: 1;
+        max-width: 100%;
+        min-width: 0;
+        outline: none;
+      }
+
+      .search-bar:focus {
+        border: 1px solid var(--primary);
+      }
+
+      .plugin-filter-lang {
+        height: 100%;
+        margin-left: 10px;
+      }
+
+      .plugin-create {
+        height: 100%;
+        margin-left: 10px;
+        box-sizing: border-box;
+        min-width: 80px;
+      }
+    }
+
+    .plugin-item {
+      display: flex;
+      justify-content: space-between;
+      padding: 20px;
+      border: var(--border-in-light);
+      animation: slide-in ease 0.3s;
+
+      &:not(:last-child) {
+        border-bottom: 0;
+      }
+
+      &:first-child {
+        border-top-left-radius: 10px;
+        border-top-right-radius: 10px;
+      }
+
+      &:last-child {
+        border-bottom-left-radius: 10px;
+        border-bottom-right-radius: 10px;
+      }
+
+      .plugin-header {
+        display: flex;
+        align-items: center;
+
+        .plugin-icon {
+          display: flex;
+          align-items: center;
+          justify-content: center;
+          margin-right: 10px;
+        }
+
+        .plugin-title {
+          .plugin-name {
+            font-size: 14px;
+            font-weight: bold;
+          }
+          .plugin-info {
+            font-size: 12px;
+          }
+          .plugin-runtime-warning {
+            font-size: 12px;
+            color: #f86c6c;
+          }
+        }
+      }
+
+      .plugin-actions {
+        display: flex;
+        flex-wrap: nowrap;
+        transition: all ease 0.3s;
+        justify-content: center;
+        align-items: center;
+      }
+
+      @media screen and (max-width: 600px) {
+        display: flex;
+        flex-direction: column;
+        padding-bottom: 10px;
+        border-radius: 10px;
+        margin-bottom: 20px;
+        box-shadow: var(--card-shadow);
+
+        &:not(:last-child) {
+          border-bottom: var(--border-in-light);
+        }
+
+        .plugin-actions {
+          width: 100%;
+          justify-content: space-between;
+          padding-top: 10px;
+        }
+      }
+    }
+  }
+}

diff --git a/app/constant.ts b/app/constant.ts
index f76cebbf0..0e5ecea9d 100644
--- a/app/constant.ts
+++ b/app/constant.ts
@@ -80,6 +80,7 @@ export enum ModelProvider {

 export const OpenaiPath = {
   ChatPath: "v1/chat/completions",
+  SpeechPath: "v1/audio/speech",
   UsagePath: "dashboard/billing/usage",
   SubsPath: "dashboard/billing/subscription",
   ListModelPath: "v1/models",
@@ -118,6 +119,18 @@ export const KnowledgeCutOffDate: Record<string, string> = {
   "gemini-pro": "2023-12",
 };

+export const DEFAULT_TTS_MODEL = "tts-1";
+export const DEFAULT_TTS_VOICE = "alloy";
+export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];
+export const DEFAULT_TTS_VOICES = [
+  "alloy",
+  "echo",
+  "fable",
+  "onyx",
+  "nova",
+  "shimmer",
+];
+
 export const DEFAULT_MODELS = [
   {
     name: "gpt-4",
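One observation on these constants: because `DEFAULT_TTS_MODELS` and `DEFAULT_TTS_VOICES` are declared without `as const`, the indexed-access types derived from them in `app/store/config.ts` below (`(typeof DEFAULT_TTS_MODELS)[number]`) widen to plain `string`. A sketch of the narrowing variant, should literal unions be wanted later:

```ts
// Sketch: with `as const`, the derived union is "tts-1" | "tts-1-hd"
// instead of string; the same applies to the voices list.
export const NARROW_TTS_MODELS = ["tts-1", "tts-1-hd"] as const;
export type NarrowTTSModel = (typeof NARROW_TTS_MODELS)[number]; // "tts-1" | "tts-1-hd"

// Without `as const`, the array is string[], so indexing by number yields string:
const loose = ["tts-1", "tts-1-hd"];
export type LooseTTSModel = (typeof loose)[number]; // string
```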
diff --git a/app/icons/speak-stop.svg b/app/icons/speak-stop.svg
new file mode 100644
index 000000000..926ae7bb3
--- /dev/null
+++ b/app/icons/speak-stop.svg
@@ -0,0 +1 @@
+ 
\ No newline at end of file
diff --git a/app/icons/speak.svg b/app/icons/speak.svg
new file mode 100644
index 000000000..e02212c9a
--- /dev/null
+++ b/app/icons/speak.svg
@@ -0,0 +1 @@
+ 
\ No newline at end of file
diff --git a/app/locales/cn.ts b/app/locales/cn.ts
index e8368f89e..90d4100f7 100644
--- a/app/locales/cn.ts
+++ b/app/locales/cn.ts
@@ -42,6 +42,8 @@ const cn = {
     PinToastAction: "查看",
     Delete: "删除",
     Edit: "编辑",
+    Speech: "播放",
+    StopSpeech: "停止",
   },
   Commands: {
     new: "新建聊天",
@@ -373,6 +375,25 @@ const cn = {
         SubTitle: "是否返回插件调用的中间步骤",
       },
     },
+    TTS: {
+      Enable: {
+        Title: "启用文本转语音",
+        SubTitle: "启用基于 OpenAI 的文本生成语音服务",
+      },
+      Autoplay: {
+        Title: "启用自动播放",
+        SubTitle: "自动生成语音并播放,需先开启文本转语音开关",
+      },
+      Model: "模型",
+      Voice: {
+        Title: "声音",
+        SubTitle: "生成语音时使用的声音",
+      },
+      Speed: {
+        Title: "速度",
+        SubTitle: "生成语音的速度",
+      },
+    },
   },
   Store: {
     DefaultTopic: "新的聊天",

diff --git a/app/locales/en.ts b/app/locales/en.ts
index 1ea425087..bd0eb8f8b 100644
--- a/app/locales/en.ts
+++ b/app/locales/en.ts
@@ -44,6 +44,8 @@ const en: LocaleType = {
     PinToastAction: "View",
     Delete: "Delete",
     Edit: "Edit",
+    Speech: "Play",
+    StopSpeech: "Stop",
   },
   Commands: {
     new: "Start a new chat",
@@ -379,6 +381,26 @@ const en: LocaleType = {
         SubTitle: "Return Intermediate Steps",
       },
     },
+    TTS: {
+      Enable: {
+        Title: "Enable TTS",
+        SubTitle: "Enable text-to-speech service based on OpenAI",
+      },
+      Autoplay: {
+        Title: "Enable Autoplay",
+        SubTitle:
+          "Automatically generate speech and play, you need to enable the text-to-speech switch first",
+      },
+      Model: "Model",
+      Voice: {
+        Title: "Voice",
+        SubTitle: "The voice to use when generating the audio",
+      },
+      Speed: {
+        Title: "Speed",
+        SubTitle: "The speed of the generated audio",
+      },
+    },
   },
   Store: {
     DefaultTopic: "New Conversation",

diff --git a/app/store/config.ts b/app/store/config.ts
index ee5100d32..1939642d2 100644
--- a/app/store/config.ts
+++ b/app/store/config.ts
@@ -5,11 +5,17 @@ import {
   DEFAULT_INPUT_TEMPLATE,
   DEFAULT_MODELS,
   DEFAULT_SIDEBAR_WIDTH,
+  DEFAULT_TTS_MODEL,
+  DEFAULT_TTS_MODELS,
+  DEFAULT_TTS_VOICE,
+  DEFAULT_TTS_VOICES,
   StoreKey,
 } from "../constant";
 import { createPersistStore } from "../utils/store";

 export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
+export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
+export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];

 export enum SubmitKey {
   Enter = "Enter",
@@ -64,12 +70,21 @@ export const DEFAULT_CONFIG = {
     maxIterations: 10,
     returnIntermediateSteps: true,
   },
+
+  ttsConfig: {
+    enable: false,
+    autoplay: false,
+    model: DEFAULT_TTS_MODEL,
+    voice: DEFAULT_TTS_VOICE,
+    speed: 1.0,
+  },
 };

 export type ChatConfig = typeof DEFAULT_CONFIG;

 export type ModelConfig = ChatConfig["modelConfig"];
 export type PluginConfig = ChatConfig["pluginConfig"];
+export type TTSConfig = ChatConfig["ttsConfig"];

 export function limitNumber(
   x: number,
@@ -84,6 +99,18 @@ export function limitNumber(
   return Math.min(max, Math.max(min, x));
 }

+export const TTSConfigValidator = {
+  model(x: string) {
+    return x as TTSModelType;
+  },
+  voice(x: string) {
+    return x as TTSVoiceType;
+  },
+  speed(x: number) {
+    return limitNumber(x, 0.25, 4.0, 1.0);
+  },
+};
+
 export const ModalConfigValidator = {
   model(x: string) {
     return x as ModelType;

diff --git a/app/utils/audio.ts b/app/utils/audio.ts
new file mode 100644
index 000000000..953f789c1
--- /dev/null
+++ b/app/utils/audio.ts
@@ -0,0 +1,37 @@
+type TTSPlayer = {
+  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  stop: () => void;
+};
+
+export function createTTSPlayer(): TTSPlayer {
+  let audioContext: AudioContext | null = null;
+  let audioBufferSourceNode: AudioBufferSourceNode | null = null;
+
+  const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => {
+    // stop any clip that is still playing before starting a new one
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+    }
+    audioContext = new AudioContext();
+    const buffer = await audioContext.decodeAudioData(audioBuffer);
+    audioBufferSourceNode = audioContext.createBufferSource();
+    audioBufferSourceNode.buffer = buffer;
+    audioBufferSourceNode.connect(audioContext.destination);
+    audioBufferSourceNode.start();
+    audioBufferSourceNode.onended = onended;
+  };
+
+  const stop = () => {
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+      audioBufferSourceNode = null;
+    }
+    if (audioContext) {
+      audioContext.close();
+      audioContext = null;
+    }
+  };
+
+  return { play, stop };
+}
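`createTTSPlayer` is a thin closure over the Web Audio API: each `play` call decodes the full buffer up front and resolves once playback has started (not when it ends), while `stop` tears down both the source node and the context. A driving sketch — the import path and the source of the audio bytes (here, `api.llm.speech` as wired in chat.tsx) are assumptions:

```ts
// Sketch: driving the player directly. decodeAudioData detaches the input
// buffer in some browsers, so each ArrayBuffer should be played only once.
import { createTTSPlayer } from "./app/utils/audio";

const player = createTTSPlayer();

// Start playback; the promise resolves once the clip has started.
async function startDemo(audioBytes: ArrayBuffer) {
  await player.play(audioBytes, () => {
    console.log("playback finished"); // onended fires on completion
  });
}

// e.g. wired to a stop button:
function stopDemo() {
  player.stop(); // stops the source node and closes the AudioContext
}
```

Note that Web Audio also fires `onended` when `stop()` is called; chat.tsx tolerates this, since both paths set the same `speechStatus` state.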
"https://registry.yarnpkg.com/openai/-/openai-4.28.4.tgz#d4bf1f53a89ef151bf066ef284489e12e7dd1657" + integrity sha512-RNIwx4MT/F0zyizGcwS+bXKLzJ8QE9IOyigDG/ttnwB220d58bYjYFp0qjvGwEFBO6+pvFVIDABZPGDl46RFsg== dependencies: "@types/node" "^18.11.18" "@types/node-fetch" "^2.6.4" @@ -6512,20 +6512,6 @@ openai@^4.26.0: node-fetch "^2.6.7" web-streams-polyfill "^3.2.1" -openai@^4.6.0: - version "4.12.1" - resolved "https://registry.yarnpkg.com/openai/-/openai-4.12.1.tgz#f1ef4283197cf2ef932abc55afeae8a2182d8fe6" - integrity sha512-EAoUwm4dtiWvFwBhOCK/VfF8sj1ZU8+aAIJnfT4NyeTfrt1DM/6Gdd6fOZWTjBYryTAqu9Vpb5+9Wu6JMtm/gA== - dependencies: - "@types/node" "^18.11.18" - "@types/node-fetch" "^2.6.4" - abort-controller "^3.0.0" - agentkeepalive "^4.2.1" - digest-fetch "^1.3.0" - form-data-encoder "1.7.2" - formdata-node "^4.3.2" - node-fetch "^2.6.7" - openapi-types@^12.1.3: version "12.1.3" resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3"
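The `resolutions` entry pins every copy of `openai` — including transitive copies pulled in by the LangChain packages — to 4.28.4, so the lockfile collapses to a single SDK version. This patch calls the REST endpoint directly rather than going through the SDK, but recent 4.x releases of the SDK expose the same endpoint; a sketch of the equivalent call, with the assumption that the pinned version provides `audio.speech.create` and reads `OPENAI_API_KEY` from the environment:

```ts
// Sketch: the same TTS request through the pinned openai SDK.
import OpenAI from "openai";

const client = new OpenAI(); // picks up OPENAI_API_KEY by default

async function sdkSpeech(input: string): Promise<ArrayBuffer> {
  const res = await client.audio.speech.create({
    model: "tts-1",
    voice: "alloy",
    input,
  });
  return res.arrayBuffer(); // the SDK returns a fetch Response for audio
}
```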