diff --git a/app/client/api.ts b/app/client/api.ts
index 64ac82b2a..f5288593d 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -40,6 +40,11 @@ export interface MultimodalContent {
   };
 }
 
+export interface MultimodalContentForAlibaba {
+  text?: string;
+  image?: string;
+}
+
 export interface RequestMessage {
   role: MessageRole;
   content: string | MultimodalContent[];
diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
index 88511768c..4875e5c02 100644
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -7,7 +7,10 @@ import {
   ChatMessageTool,
   usePluginStore,
 } from "@/app/store";
-import { streamWithThink } from "@/app/utils/chat";
+import {
+  preProcessImageContentForAlibabaDashScope,
+  streamWithThink,
+} from "@/app/utils/chat";
 import {
   ChatOptions,
   getHeaders,
@@ -15,12 +18,14 @@ import {
   LLMModel,
   SpeechOptions,
   MultimodalContent,
+  MultimodalContentForAlibaba,
 } from "../api";
 import { getClientConfig } from "@/app/config/client";
 import {
   getMessageTextContent,
   getMessageTextContentWithoutThinking,
   getTimeoutMSByModel,
+  isVisionModel,
 } from "@/app/utils";
 import { fetch } from "@/app/utils/stream";
@@ -89,14 +94,6 @@ export class QwenApi implements LLMApi {
   }
 
   async chat(options: ChatOptions) {
-    const messages = options.messages.map((v) => ({
-      role: v.role,
-      content:
-        v.role === "assistant"
-          ? getMessageTextContentWithoutThinking(v)
-          : getMessageTextContent(v),
-    }));
-
     const modelConfig = {
       ...useAppConfig.getState().modelConfig,
       ...useChatStore.getState().currentSession().mask.modelConfig,
@@ -105,6 +102,21 @@
       },
     };
 
+    const visionModel = isVisionModel(options.config.model);
+
+    const messages: ChatOptions["messages"] = [];
+    for (const v of options.messages) {
+      const content = (
+        visionModel
+          ? await preProcessImageContentForAlibabaDashScope(v.content)
+          : v.role === "assistant"
+            ? getMessageTextContentWithoutThinking(v)
+            : getMessageTextContent(v)
+      ) as any;
+
+      messages.push({ role: v.role, content });
+    }
+
     const shouldStream = !!options.config.stream;
     const requestPayload: RequestPayload = {
       model: modelConfig.model,
@@ -129,7 +141,7 @@
       "X-DashScope-SSE": shouldStream ? "enable" : "disable",
     };
 
-    const chatPath = this.path(Alibaba.ChatPath);
+    const chatPath = this.path(Alibaba.ChatPath(modelConfig.model));
     const chatPayload = {
       method: "POST",
       body: JSON.stringify(requestPayload),
@@ -162,7 +174,7 @@
            const json = JSON.parse(text);
            const choices = json.output.choices as Array<{
              message: {
-                content: string | null;
+                content: string | null | MultimodalContentForAlibaba[];
                tool_calls: ChatMessageTool[];
                reasoning_content: string | null;
              };
@@ -212,7 +224,9 @@
            } else if (content && content.length > 0) {
              return {
                isThinking: false,
-                content: content,
+                content: Array.isArray(content)
+                  ? content.map((item) => item.text).join(",")
+                  : content,
              };
            }
diff --git a/app/constant.ts b/app/constant.ts
index 02ba8dc81..c1b135485 100644
--- a/app/constant.ts
+++ b/app/constant.ts
@@ -221,7 +221,12 @@ export const ByteDance = {
 
 export const Alibaba = {
   ExampleEndpoint: ALIBABA_BASE_URL,
-  ChatPath: "v1/services/aigc/text-generation/generation",
+  ChatPath: (modelName: string) => {
+    if (modelName.includes("vl") || modelName.includes("omni")) {
+      return "v1/services/aigc/multimodal-generation/generation";
+    }
+    return `v1/services/aigc/text-generation/generation`;
+  },
 };
 
 export const Tencent = {
@@ -570,6 +575,9 @@ const alibabaModes = [
   "qwen-max-0403",
   "qwen-max-0107",
   "qwen-max-longcontext",
+  "qwen-omni-turbo",
+  "qwen-vl-plus",
+  "qwen-vl-max",
 ];
 
 const tencentModels = [
diff --git a/app/utils/chat.ts b/app/utils/chat.ts
index efc496f2c..cae775512 100644
--- a/app/utils/chat.ts
+++ b/app/utils/chat.ts
@@ -3,7 +3,7 @@ import {
   UPLOAD_URL,
   REQUEST_TIMEOUT_MS,
 } from "@/app/constant";
-import { RequestMessage } from "@/app/client/api";
+import { MultimodalContent, RequestMessage } from "@/app/client/api";
 import Locale from "@/app/locales";
 import {
   EventStreamContentType,
@@ -70,8 +70,9 @@ export function compressImage(file: Blob, maxSize: number): Promise<string> {
   });
 }
 
-export async function preProcessImageContent(
+export async function preProcessImageContentBase(
   content: RequestMessage["content"],
+  transformImageUrl: (url: string) => Promise<{ [key: string]: any }>,
 ) {
   if (typeof content === "string") {
     return content;
@@ -81,7 +82,7 @@
     if (part?.type == "image_url" && part?.image_url?.url) {
       try {
         const url = await cacheImageToBase64Image(part?.image_url?.url);
-        result.push({ type: part.type, image_url: { url } });
+        result.push(await transformImageUrl(url));
       } catch (error) {
         console.error("Error processing image URL:", error);
       }
@@ -92,6 +93,23 @@
   return result;
 }
 
+export async function preProcessImageContent(
+  content: RequestMessage["content"],
+) {
+  return preProcessImageContentBase(content, async (url) => ({
+    type: "image_url",
+    image_url: { url },
+  })) as Promise<MultimodalContent[]>;
+}
+
+export async function preProcessImageContentForAlibabaDashScope(
+  content: RequestMessage["content"],
+) {
+  return preProcessImageContentBase(content, async (url) => ({
+    image: url,
+  }));
+}
+
 const imageCaches: Record<string, string> = {};
 export function cacheImageToBase64Image(imageUrl: string) {
   if (imageUrl.includes(CACHE_URL_PREFIX)) {
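
A minimal sketch (not part of the patch) of the behavior this diff introduces, for review. It uses only names the diff adds — Alibaba.ChatPath, preProcessImageContentForAlibabaDashScope, MultimodalContentForAlibaba — plus the existing cacheImageToBase64Image; the sample image URL and text values are illustrative, and the calls are assumed to run inside an async function:

// Vision ("vl") and omni models are routed to DashScope's multimodal
// endpoint; every other model keeps the existing text-generation path.
Alibaba.ChatPath("qwen-vl-max");     // "v1/services/aigc/multimodal-generation/generation"
Alibaba.ChatPath("qwen-omni-turbo"); // "v1/services/aigc/multimodal-generation/generation"
Alibaba.ChatPath("qwen-max");        // "v1/services/aigc/text-generation/generation"

// For vision models, image parts are rewritten into DashScope's { image }
// shape; the url is the base64 data URL cached by cacheImageToBase64Image.
const parts = await preProcessImageContentForAlibabaDashScope([
  { type: "image_url", image_url: { url: "https://example.com/cat.png" } }, // illustrative URL
]);
// parts ~ [{ image: "data:image/png;base64,..." }]

// On the response side, array-valued message content is flattened back to
// plain text, mirroring the streaming branch in alibaba.ts above:
const received: MultimodalContentForAlibaba[] = [{ text: "A cat" }, { text: "on a mat." }];
const flattened = received.map((item) => item.text).join(","); // "A cat,on a mat."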