feat: support openai tts

Hk-Gosuto 2024-03-03 15:15:23 +08:00
parent d2733a9128
commit c3656609ab
17 changed files with 475 additions and 20 deletions

View File

@@ -34,6 +34,8 @@
- 除插件工具外,与原项目保持一致 [ChatGPT-Next-Web 主要功能](https://github.com/Yidadaa/ChatGPT-Next-Web#主要功能)
- 支持 OpenAI TTS(文本转语音)[#208](https://github.com/Hk-Gosuto/ChatGPT-Next-Web-LangChain/issues/208)
- 支持 GPT-4V(视觉) 模型
- 需要配置对象存储服务,请参考 [对象存储服务配置指南](./docs/s3-oss.md) 配置

View File

@@ -13,6 +13,7 @@ export const ROLES = ["system", "user", "assistant"] as const;
export type MessageRole = (typeof ROLES)[number];
export const Models = ["gpt-3.5-turbo", "gpt-4"] as const;
export const TTSModels = ["tts-1", "tts-1-hd"] as const;
export type ChatModel = ModelType;
export interface RequestMessage {
@@ -36,6 +37,15 @@ export interface LLMAgentConfig {
useTools?: (string | undefined)[];
}
export interface SpeechOptions {
model: string;
input: string;
voice: string;
response_format?: string;
speed?: number;
onController?: (controller: AbortController) => void;
}
export interface ChatOptions {
messages: RequestMessage[];
config: LLMConfig;
@@ -76,6 +86,7 @@ export interface LLMModelProvider {
export abstract class LLMApi {
abstract chat(options: ChatOptions): Promise<void>;
abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
abstract toolAgentChat(options: AgentChatOptions): Promise<void>;
abstract usage(): Promise<LLMUsage>;
abstract models(): Promise<LLMModel[]>;
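Taken together, SpeechOptions and the new abstract speech method fix the provider-facing TTS contract: implementations return raw encoded audio bytes and leave decoding and playback to the caller. A minimal usage sketch, assuming a ClientApi wired to the OpenAI provider as elsewhere in this commit (variable names are illustrative):

const api = new ClientApi(ModelProvider.GPT);
const audio: ArrayBuffer = await api.llm.speech({
  model: "tts-1",
  input: "Hello from the chat",
  voice: "alloy",
  speed: 1.0,
}); // encoded audio bytes, ready for decodeAudioData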

View File

@@ -11,6 +11,7 @@ import {
LLMApi,
LLMModel,
LLMUsage,
SpeechOptions,
} from "../api";
import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
import axios from "axios";
@@ -22,6 +23,9 @@ const getImageBase64Data = async (url: string) => {
};
export class GeminiProApi implements LLMApi {
speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
toolAgentChat(options: AgentChatOptions): Promise<void> {
throw new Error("Method not implemented.");
}

View File

@@ -16,6 +16,7 @@ import {
LLMApi,
LLMModel,
LLMUsage,
SpeechOptions,
} from "../api";
import Locale from "../../locales";
import {
@@ -80,6 +81,44 @@ export class ChatGPTApi implements LLMApi {
return res.choices?.at(0)?.message?.content ?? "";
}
async speech(options: SpeechOptions): Promise<ArrayBuffer> {
const requestPayload = {
model: options.model,
input: options.input,
voice: options.voice,
response_format: options.response_format,
speed: options.speed,
};
console.log("[Request] openai speech payload: ", requestPayload);
const controller = new AbortController();
options.onController?.(controller);
try {
const speechPath = this.path(OpenaiPath.SpeechPath, options.model);
const speechPayload = {
method: "POST",
body: JSON.stringify(requestPayload),
signal: controller.signal,
headers: getHeaders(),
};
// make a fetch request
const requestTimeoutId = setTimeout(
() => controller.abort(),
REQUEST_TIMEOUT_MS,
);
const res = await fetch(speechPath, speechPayload);
clearTimeout(requestTimeoutId);
return await res.arrayBuffer();
} catch (e) {
console.log("[Request] failed to make a speech request", e);
throw e;
}
}
async chat(options: ChatOptions) {
const messages: any[] = [];
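The speech() implementation above is a thin wrapper over OpenAI's v1/audio/speech endpoint. For reference, a standalone sketch of the equivalent direct request; the Authorization header is an assumption for illustration, since the client above delegates header construction to getHeaders():

const res = await fetch("https://api.openai.com/v1/audio/speech", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    // assumes a valid key in OPENAI_API_KEY
    Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
  },
  body: JSON.stringify({ model: "tts-1", input: "Hello", voice: "alloy" }),
});
const audio = await res.arrayBuffer(); // encoded audio; mp3 unless response_format says otherwise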

View File

@@ -14,6 +14,8 @@ import RenameIcon from "../icons/rename.svg";
import ExportIcon from "../icons/share.svg";
import ReturnIcon from "../icons/return.svg";
import CopyIcon from "../icons/copy.svg";
import SpeakIcon from "../icons/speak.svg";
import SpeakStopIcon from "../icons/speak-stop.svg";
import LoadingIcon from "../icons/three-dots.svg";
import PromptIcon from "../icons/prompt.svg";
import MaskIcon from "../icons/mask.svg";
@@ -83,6 +85,7 @@ import {
CHAT_PAGE_SIZE,
LAST_INPUT_IMAGE_KEY,
LAST_INPUT_KEY,
ModelProvider,
Path,
REQUEST_TIMEOUT_MS,
UNFINISHED_INPUT,
@@ -97,6 +100,9 @@ import { getClientConfig } from "../config/client";
import { useAllModels } from "../utils/hooks";
import Image from "next/image";
import { ClientApi } from "../client/api";
import { createTTSPlayer } from "../utils/audio";
const ttsPlayer = createTTSPlayer();
const Markdown = dynamic(async () => (await import("./markdown")).Markdown, {
loading: () => <LoadingIcon />,
@@ -1008,6 +1014,37 @@ function _Chat() {
});
};
const [speechStatus, setSpeechStatus] = useState(false);
const [speechLoading, setSpeechLoading] = useState(false);
async function openaiSpeech(text: string) {
if (speechStatus) {
ttsPlayer.stop();
setSpeechStatus(false);
} else {
const api = new ClientApi(ModelProvider.GPT);
const config = useAppConfig.getState();
setSpeechLoading(true);
try {
const audioBuffer = await api.llm.speech({
model: config.ttsConfig.model,
input: text,
voice: config.ttsConfig.voice,
speed: config.ttsConfig.speed,
});
setSpeechStatus(true);
await ttsPlayer.play(audioBuffer, () => {
setSpeechStatus(false);
});
} catch (e) {
// reset both flags so a failed request cannot leave the button stuck in loading
console.error("[OpenAI Speech]", e);
showToast(prettyObject(e));
setSpeechStatus(false);
} finally {
setSpeechLoading(false);
}
}
}
const context: RenderMessage[] = useMemo(() => {
return session.mask.hideContext ? [] : session.mask.context.slice();
}, [session.mask.context, session.mask.hideContext]);
@@ -1361,6 +1398,24 @@
icon={<CopyIcon />}
onClick={() => copyToClipboard(message.content)}
/>
{config.ttsConfig.enable && (
<ChatAction
text={
speechStatus
? Locale.Chat.Actions.StopSpeech
: Locale.Chat.Actions.Speech
}
loading={speechLoading}
icon={
speechStatus ? (
<SpeakStopIcon />
) : (
<SpeakIcon />
)
}
onClick={() => openaiSpeech(message.content)}
/>
)}
</>
)}
</div>

View File

@@ -72,6 +72,7 @@ import { nanoid } from "nanoid";
import { PluginConfigList } from "./plugin-config";
import { useMaskStore } from "../store/mask";
import { ProviderType } from "../utils/cloud";
import { TTSConfigList } from "./tts-config";
function EditPromptModal(props: { id: string; onClose: () => void }) {
const promptStore = usePromptStore();
@@ -1198,6 +1199,17 @@
/>
</List>
<List>
<TTSConfigList
ttsConfig={config.ttsConfig}
updateConfig={(updater) => {
const ttsConfig = { ...config.ttsConfig };
updater(ttsConfig);
config.update((config) => (config.ttsConfig = ttsConfig));
}}
/>
</List>
<DangerItems />
</div>
</ErrorBoundary>

View File

@@ -0,0 +1,104 @@
import { TTSConfig, TTSConfigValidator } from "../store";
import Locale from "../locales";
import { ListItem, Select } from "./ui-lib";
import { DEFAULT_TTS_MODELS, DEFAULT_TTS_VOICES } from "../constant";
import { InputRange } from "./input-range";
export function TTSConfigList(props: {
ttsConfig: TTSConfig;
updateConfig: (updater: (config: TTSConfig) => void) => void;
}) {
return (
<>
<ListItem
title={Locale.Settings.TTS.Enable.Title}
subTitle={Locale.Settings.TTS.Enable.SubTitle}
>
<input
type="checkbox"
checked={props.ttsConfig.enable}
onChange={(e) =>
props.updateConfig(
(config) => (config.enable = e.currentTarget.checked),
)
}
></input>
</ListItem>
{/* <ListItem
title={Locale.Settings.TTS.Autoplay.Title}
subTitle={Locale.Settings.TTS.Autoplay.SubTitle}
>
<input
type="checkbox"
checked={props.ttsConfig.autoplay}
onChange={(e) =>
props.updateConfig(
(config) => (config.autoplay = e.currentTarget.checked),
)
}
></input>
</ListItem> */}
<ListItem title={Locale.Settings.TTS.Model}>
<Select
value={props.ttsConfig.model}
onChange={(e) => {
props.updateConfig(
(config) =>
(config.model = TTSConfigValidator.model(
e.currentTarget.value,
)),
);
}}
>
{DEFAULT_TTS_MODELS.map((v, i) => (
<option value={v} key={i}>
{v}
</option>
))}
</Select>
</ListItem>
<ListItem
title={Locale.Settings.TTS.Voice.Title}
subTitle={Locale.Settings.TTS.Voice.SubTitle}
>
<Select
value={props.ttsConfig.voice}
onChange={(e) => {
props.updateConfig(
(config) =>
(config.voice = TTSConfigValidator.voice(
e.currentTarget.value,
)),
);
}}
>
{DEFAULT_TTS_VOICES.map((v, i) => (
<option value={v} key={i}>
{v}
</option>
))}
</Select>
</ListItem>
<ListItem
title={Locale.Settings.TTS.Speed.Title}
subTitle={Locale.Settings.TTS.Speed.SubTitle}
>
<InputRange
value={props.ttsConfig.speed?.toFixed(1)}
min="0.3"
max="4.0"
step="0.1"
onChange={(e) => {
props.updateConfig(
(config) =>
(config.speed = TTSConfigValidator.speed(
e.currentTarget.valueAsNumber,
)),
);
}}
></InputRange>
</ListItem>
</>
);
}

View File

@@ -0,0 +1,119 @@
@import "../styles/animation.scss";
.plugin-page {
height: 100%;
display: flex;
flex-direction: column;
.plugin-page-body {
padding: 20px;
overflow-y: auto;
.plugin-filter {
width: 100%;
max-width: 100%;
margin-bottom: 20px;
animation: slide-in ease 0.3s;
height: 40px;
display: flex;
.search-bar {
flex-grow: 1;
max-width: 100%;
min-width: 0;
outline: none;
}
.search-bar:focus {
border: 1px solid var(--primary);
}
.plugin-filter-lang {
height: 100%;
margin-left: 10px;
}
.plugin-create {
height: 100%;
margin-left: 10px;
box-sizing: border-box;
min-width: 80px;
}
}
.plugin-item {
display: flex;
justify-content: space-between;
padding: 20px;
border: var(--border-in-light);
animation: slide-in ease 0.3s;
&:not(:last-child) {
border-bottom: 0;
}
&:first-child {
border-top-left-radius: 10px;
border-top-right-radius: 10px;
}
&:last-child {
border-bottom-left-radius: 10px;
border-bottom-right-radius: 10px;
}
.plugin-header {
display: flex;
align-items: center;
.plugin-icon {
display: flex;
align-items: center;
justify-content: center;
margin-right: 10px;
}
.plugin-title {
.plugin-name {
font-size: 14px;
font-weight: bold;
}
.plugin-info {
font-size: 12px;
}
.plugin-runtime-warning {
font-size: 12px;
color: #f86c6c;
}
}
}
.plugin-actions {
display: flex;
flex-wrap: nowrap;
transition: all ease 0.3s;
justify-content: center;
align-items: center;
}
@media screen and (max-width: 600px) {
display: flex;
flex-direction: column;
padding-bottom: 10px;
border-radius: 10px;
margin-bottom: 20px;
box-shadow: var(--card-shadow);
&:not(:last-child) {
border-bottom: var(--border-in-light);
}
.plugin-actions {
width: 100%;
justify-content: space-between;
padding-top: 10px;
}
}
}
}
}

View File

@@ -80,6 +80,7 @@ export enum ModelProvider {
export const OpenaiPath = {
ChatPath: "v1/chat/completions",
SpeechPath: "v1/audio/speech",
UsagePath: "dashboard/billing/usage",
SubsPath: "dashboard/billing/subscription",
ListModelPath: "v1/models",
@@ -118,6 +119,18 @@ export const KnowledgeCutOffDate: Record<string, string> = {
"gemini-pro": "2023-12",
};
export const DEFAULT_TTS_MODEL = "tts-1";
export const DEFAULT_TTS_VOICE = "alloy";
export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];
export const DEFAULT_TTS_VOICES = [
"alloy",
"echo",
"fable",
"onyx",
"nova",
"shimmer",
];
export const DEFAULT_MODELS = [
{
name: "gpt-4",

app/icons/speak-stop.svg Normal file
View File

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" width="16" height="16" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="w-4 h-4"><path stroke-linecap="round" stroke-linejoin="round" d="M17.25 9.75 19.5 12m0 0 2.25 2.25M19.5 12l2.25-2.25M19.5 12l-2.25 2.25m-10.5-6 4.72-4.72a.75.75 0 0 1 1.28.53v15.88a.75.75 0 0 1-1.28.53l-4.72-4.72H4.51c-.88 0-1.704-.507-1.938-1.354A9.009 9.009 0 0 1 2.25 12c0-.83.112-1.633.322-2.396C2.806 8.756 3.63 8.25 4.51 8.25H6.75Z"></path></svg>


app/icons/speak.svg Normal file
View File

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" width="16" height="16" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="w-4 h-4"><path stroke-linecap="round" stroke-linejoin="round" d="M19.114 5.636a9 9 0 010 12.728M16.463 8.288a5.25 5.25 0 010 7.424M6.75 8.25l4.72-4.72a.75.75 0 011.28.53v15.88a.75.75 0 01-1.28.53l-4.72-4.72H4.51c-.88 0-1.704-.507-1.938-1.354A9.01 9.01 0 012.25 12c0-.83.112-1.633.322-2.396C2.806 8.756 3.63 8.25 4.51 8.25H6.75z"></path></svg>


View File

@@ -42,6 +42,8 @@ const cn = {
PinToastAction: "查看",
Delete: "删除",
Edit: "编辑",
Speech: "播放",
StopSpeech: "停止",
},
Commands: {
new: "新建聊天",
@@ -373,6 +375,25 @@
SubTitle: "是否返回插件调用的中间步骤",
},
},
TTS: {
Enable: {
Title: "启用文本转语音",
SubTitle: "启用基于 OpenAI 的文本生成语音服务",
},
Autoplay: {
Title: "启用自动播放",
SubTitle: "自动生成语音并播放,需先开启文本转语音开关",
},
Model: "模型",
Voice: {
Title: "声音",
SubTitle: "生成语音时使用的声音",
},
Speed: {
Title: "速度",
SubTitle: "生成语音的速度",
},
},
},
Store: {
DefaultTopic: "新的聊天",

View File

@@ -44,6 +44,8 @@ const en: LocaleType = {
PinToastAction: "View",
Delete: "Delete",
Edit: "Edit",
Speech: "Play",
StopSpeech: "Stop",
},
Commands: {
new: "Start a new chat",
@@ -379,6 +381,26 @@
SubTitle: "Return Intermediate Steps",
},
},
TTS: {
Enable: {
Title: "Enable TTS",
SubTitle: "Enable text-to-speech service based on OpenAI",
},
Autoplay: {
Title: "Enable Autoplay",
SubTitle:
"Automatically generate and play speech; requires text-to-speech to be enabled first",
},
Model: "Model",
Voice: {
Title: "Voice",
SubTitle: "The voice to use when generating the audio",
},
Speed: {
Title: "Speed",
SubTitle: "The speed of the generated audio",
},
},
},
Store: {
DefaultTopic: "New Conversation",

View File

@@ -5,11 +5,17 @@ import {
DEFAULT_INPUT_TEMPLATE,
DEFAULT_MODELS,
DEFAULT_SIDEBAR_WIDTH,
DEFAULT_TTS_MODEL,
DEFAULT_TTS_MODELS,
DEFAULT_TTS_VOICE,
DEFAULT_TTS_VOICES,
StoreKey,
} from "../constant";
import { createPersistStore } from "../utils/store";
export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
export enum SubmitKey {
Enter = "Enter",
@@ -64,12 +70,21 @@ export const DEFAULT_CONFIG = {
maxIterations: 10,
returnIntermediateSteps: true,
},
ttsConfig: {
enable: false,
autoplay: false,
model: DEFAULT_TTS_MODEL,
voice: DEFAULT_TTS_VOICE,
speed: 1.0,
},
};
export type ChatConfig = typeof DEFAULT_CONFIG;
export type ModelConfig = ChatConfig["modelConfig"];
export type PluginConfig = ChatConfig["pluginConfig"];
export type TTSConfig = ChatConfig["ttsConfig"];
export function limitNumber(
x: number,
@@ -84,6 +99,18 @@
return Math.min(max, Math.max(min, x));
}
export const TTSConfigValidator = {
model(x: string) {
return x as TTSModelType;
},
voice(x: string) {
return x as TTSVoiceType;
},
speed(x: number) {
return limitNumber(x, 0.25, 4.0, 1.0);
},
};
export const ModalConfigValidator = {
model(x: string) {
return x as ModelType;
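A quick sketch of how the pieces above combine at runtime: TTSConfig is derived from DEFAULT_CONFIG, chat.tsx reads it through the store, and TTSConfigValidator.speed clamps writes into OpenAI's supported 0.25-4.0 range (example values are illustrative):

// same store accessor chat.tsx uses
const { enable, model, voice, speed } = useAppConfig.getState().ttsConfig;

// out-of-range slider or persisted values are clamped on write
TTSConfigValidator.speed(9.0); // -> 4.0
TTSConfigValidator.speed(0.1); // -> 0.25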

app/utils/audio.ts Normal file
View File

@@ -0,0 +1,37 @@
type TTSPlayer = {
play: (audioBuffer: ArrayBuffer, onended: (() => void) | null) => Promise<void>;
stop: () => void;
};
export function createTTSPlayer(): TTSPlayer {
let audioContext: AudioContext | null = null;
let audioBufferSourceNode: AudioBufferSourceNode | null = null;
const play = async (audioBuffer: ArrayBuffer, onended: (() => void) | null) => {
if (audioBufferSourceNode) {
audioBufferSourceNode.stop();
audioBufferSourceNode.disconnect();
}
// close any context left over from a previous playback before creating a new one
if (audioContext) {
await audioContext.close();
}
audioContext = new AudioContext();
const buffer = await audioContext.decodeAudioData(audioBuffer);
audioBufferSourceNode = audioContext.createBufferSource();
audioBufferSourceNode.buffer = buffer;
audioBufferSourceNode.connect(audioContext.destination);
audioBufferSourceNode.start();
audioBufferSourceNode.onended = onended;
};
const stop = () => {
if (audioBufferSourceNode) {
audioBufferSourceNode.stop();
audioBufferSourceNode.disconnect();
audioBufferSourceNode = null;
}
if (audioContext) {
audioContext.close();
audioContext = null;
}
};
return { play, stop };
}
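A usage sketch pairing the player with the speech client, assuming buffer holds the ArrayBuffer returned by api.llm.speech:

const player = createTTSPlayer();
await player.play(buffer, () => {
  // fires when the source finishes draining
  console.log("playback finished");
});
// ... later, e.g. when the user clicks stop:
player.stop(); // interrupts playback and closes the AudioContext

Note that play resolves once playback has started, not when it ends; completion is signalled only through the onended callback, which is why the chat component flips speechStatus there.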

View File

@@ -41,7 +41,7 @@
"nanoid": "^5.0.3",
"next": "^13.4.9",
"node-fetch": "^3.3.1",
"openai": "^4.6.0",
"openai": "^4.28.4",
"pdf-parse": "^1.1.1",
"react": "^18.2.0",
"react-dom": "^18.2.0",
@@ -78,6 +78,7 @@
},
"resolutions": {
"lint-staged/yaml": "^2.2.2",
"@langchain/core": "0.1.30"
"@langchain/core": "0.1.30",
"openai": "4.28.4"
}
}

View File

@@ -6497,10 +6497,10 @@ open@^8.4.0:
is-docker "^2.1.1"
is-wsl "^2.2.0"
openai@^4.26.0:
version "4.28.0"
resolved "https://registry.yarnpkg.com/openai/-/openai-4.28.0.tgz#ded00e3d98c25758b5406c9675ec27a957e00930"
integrity sha512-JM8fhcpmpGN0vrUwGquYIzdcEQHtFuom6sRCbbCM6CfzZXNuRk33G7KfeRAIfnaCxSpzrP5iHtwJzIm6biUZ2Q==
openai@4.28.4, openai@^4.26.0, openai@^4.28.4:
version "4.28.4"
resolved "https://registry.yarnpkg.com/openai/-/openai-4.28.4.tgz#d4bf1f53a89ef151bf066ef284489e12e7dd1657"
integrity sha512-RNIwx4MT/F0zyizGcwS+bXKLzJ8QE9IOyigDG/ttnwB220d58bYjYFp0qjvGwEFBO6+pvFVIDABZPGDl46RFsg==
dependencies:
"@types/node" "^18.11.18"
"@types/node-fetch" "^2.6.4"
@@ -6512,20 +6512,6 @@ openai@^4.26.0:
node-fetch "^2.6.7"
web-streams-polyfill "^3.2.1"
openai@^4.6.0:
version "4.12.1"
resolved "https://registry.yarnpkg.com/openai/-/openai-4.12.1.tgz#f1ef4283197cf2ef932abc55afeae8a2182d8fe6"
integrity sha512-EAoUwm4dtiWvFwBhOCK/VfF8sj1ZU8+aAIJnfT4NyeTfrt1DM/6Gdd6fOZWTjBYryTAqu9Vpb5+9Wu6JMtm/gA==
dependencies:
"@types/node" "^18.11.18"
"@types/node-fetch" "^2.6.4"
abort-controller "^3.0.0"
agentkeepalive "^4.2.1"
digest-fetch "^1.3.0"
form-data-encoder "1.7.2"
formdata-node "^4.3.2"
node-fetch "^2.6.7"
openapi-types@^12.1.3:
version "12.1.3"
resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3"