diff --git a/app/client/api.ts b/app/client/api.ts index 64ac82b2a..f5288593d 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -40,6 +40,11 @@ export interface MultimodalContent { }; } +export interface MultimodalContentForAlibaba { + text?: string; + image?: string; +} + export interface RequestMessage { role: MessageRole; content: string | MultimodalContent[]; diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index 88511768c..4875e5c02 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -7,7 +7,10 @@ import { ChatMessageTool, usePluginStore, } from "@/app/store"; -import { streamWithThink } from "@/app/utils/chat"; +import { + preProcessImageContentForAlibabaDashScope, + streamWithThink, +} from "@/app/utils/chat"; import { ChatOptions, getHeaders, @@ -15,12 +18,14 @@ import { LLMModel, SpeechOptions, MultimodalContent, + MultimodalContentForAlibaba, } from "../api"; import { getClientConfig } from "@/app/config/client"; import { getMessageTextContent, getMessageTextContentWithoutThinking, getTimeoutMSByModel, + isVisionModel, } from "@/app/utils"; import { fetch } from "@/app/utils/stream"; @@ -89,14 +94,6 @@ export class QwenApi implements LLMApi { } async chat(options: ChatOptions) { - const messages = options.messages.map((v) => ({ - role: v.role, - content: - v.role === "assistant" - ? getMessageTextContentWithoutThinking(v) - : getMessageTextContent(v), - })); - const modelConfig = { ...useAppConfig.getState().modelConfig, ...useChatStore.getState().currentSession().mask.modelConfig, @@ -105,6 +102,21 @@ export class QwenApi implements LLMApi { }, }; + const visionModel = isVisionModel(options.config.model); + + const messages: ChatOptions["messages"] = []; + for (const v of options.messages) { + const content = ( + visionModel + ? await preProcessImageContentForAlibabaDashScope(v.content) + : v.role === "assistant" + ? 
getMessageTextContentWithoutThinking(v) + : getMessageTextContent(v) + ) as any; + + messages.push({ role: v.role, content }); + } + const shouldStream = !!options.config.stream; const requestPayload: RequestPayload = { model: modelConfig.model, @@ -129,7 +141,7 @@ export class QwenApi implements LLMApi { "X-DashScope-SSE": shouldStream ? "enable" : "disable", }; - const chatPath = this.path(Alibaba.ChatPath); + const chatPath = this.path(Alibaba.ChatPath(modelConfig.model)); const chatPayload = { method: "POST", body: JSON.stringify(requestPayload), @@ -162,7 +174,7 @@ export class QwenApi implements LLMApi { const json = JSON.parse(text); const choices = json.output.choices as Array<{ message: { - content: string | null; + content: string | null | MultimodalContentForAlibaba[]; tool_calls: ChatMessageTool[]; reasoning_content: string | null; }; @@ -212,7 +224,9 @@ export class QwenApi implements LLMApi { } else if (content && content.length > 0) { return { isThinking: false, - content: content, + content: Array.isArray(content) + ? 
content.map((item) => item.text).join(",") : content, }; } diff --git a/app/constant.ts b/app/constant.ts index 50aaf7921..358467c63 100644 --- a/app/constant.ts +++ b/app/constant.ts @@ -221,7 +221,12 @@ export const ByteDance = { export const Alibaba = { ExampleEndpoint: ALIBABA_BASE_URL, - ChatPath: "v1/services/aigc/text-generation/generation", + ChatPath: (modelName: string) => { + if (modelName.includes("vl") || modelName.includes("omni")) { + return "v1/services/aigc/multimodal-generation/generation"; + } + return `v1/services/aigc/text-generation/generation`; + }, }; export const Tencent = { @@ -568,6 +573,9 @@ const alibabaModes = [ "qwen-max-0403", "qwen-max-0107", "qwen-max-longcontext", + "qwen-omni-turbo", + "qwen-vl-plus", + "qwen-vl-max", ]; const tencentModels = [ diff --git a/app/utils/chat.ts b/app/utils/chat.ts index efc496f2c..ecb2fa468 100644 --- a/app/utils/chat.ts +++ b/app/utils/chat.ts @@ -92,6 +92,28 @@ export async function preProcessImageContent( return result; } +export async function preProcessImageContentForAlibabaDashScope( + content: RequestMessage["content"], +) { + if (typeof content === "string") { + return content; + } + const result = []; + for (const part of content) { + if (part?.type == "image_url" && part?.image_url?.url) { + try { + const url = await cacheImageToBase64Image(part?.image_url?.url); + result.push({ image: url }); + } catch (error) { + console.error("Error processing image URL:", error); + } + } else { + result.push({ ...part }); + } + } + return result; +} + const imageCaches: Record<string, string> = {}; export function cacheImageToBase64Image(imageUrl: string) { if (imageUrl.includes(CACHE_URL_PREFIX)) {