feat(alibaba): Added alibaba vision model and omni model support

EvanWu 2025-02-24 20:18:07 +08:00
parent f5f3ce94f6
commit b709ee3983
4 changed files with 62 additions and 13 deletions

View File

@@ -40,6 +40,11 @@ export interface MultimodalContent {
   };
 }
+export interface MultimodalContentForAlibaba {
+  text?: string;
+  image?: string;
+}
 export interface RequestMessage {
   role: MessageRole;
   content: string | MultimodalContent[];
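For reference (not part of the diff): the existing MultimodalContent parts in this file are OpenAI-style, carrying a `type` discriminator and an `image_url` object, while the new interface models the bare `{ text }` / `{ image }` parts that, judging from this commit, the DashScope multimodal endpoint expects. A minimal sketch with an invented image payload:

// Illustrative only: content in the shape the new interface describes.
const dashScopeContent: MultimodalContentForAlibaba[] = [
  { image: "data:image/png;base64,iVBORw0KGgo..." },
  { text: "What is shown in this picture?" },
];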

View File

@@ -7,7 +7,10 @@ import {
   ChatMessageTool,
   usePluginStore,
 } from "@/app/store";
-import { streamWithThink } from "@/app/utils/chat";
+import {
+  preProcessImageContentForAlibabaDashScope,
+  streamWithThink,
+} from "@/app/utils/chat";
 import {
   ChatOptions,
   getHeaders,
@@ -15,12 +18,14 @@ import {
   LLMModel,
   SpeechOptions,
   MultimodalContent,
+  MultimodalContentForAlibaba,
 } from "../api";
 import { getClientConfig } from "@/app/config/client";
 import {
   getMessageTextContent,
   getMessageTextContentWithoutThinking,
   getTimeoutMSByModel,
+  isVisionModel,
 } from "@/app/utils";
 import { fetch } from "@/app/utils/stream";
@@ -89,14 +94,6 @@ export class QwenApi implements LLMApi {
   }
   async chat(options: ChatOptions) {
-    const messages = options.messages.map((v) => ({
-      role: v.role,
-      content:
-        v.role === "assistant"
-          ? getMessageTextContentWithoutThinking(v)
-          : getMessageTextContent(v),
-    }));
     const modelConfig = {
       ...useAppConfig.getState().modelConfig,
       ...useChatStore.getState().currentSession().mask.modelConfig,
@@ -105,6 +102,21 @@ export class QwenApi implements LLMApi {
       },
     };
+    const visionModel = isVisionModel(options.config.model);
+    const messages: ChatOptions["messages"] = [];
+    for (const v of options.messages) {
+      const content = (
+        visionModel
+          ? await preProcessImageContentForAlibabaDashScope(v.content)
+          : v.role === "assistant"
+            ? getMessageTextContentWithoutThinking(v)
+            : getMessageTextContent(v)
+      ) as any;
+      messages.push({ role: v.role, content });
+    }
     const shouldStream = !!options.config.stream;
     const requestPayload: RequestPayload = {
       model: modelConfig.model,
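To make the new branching concrete, here is a sketch of the two message shapes the rebuilt loop can push. The literals are invented, and it assumes `v.content` carries OpenAI-style parts when an image is attached:

// Vision model: content stays an array of DashScope-style parts
// produced by preProcessImageContentForAlibabaDashScope.
const visionMessage = {
  role: "user",
  content: [{ image: "data:image/jpeg;base64,..." }, { text: "Describe this photo." }],
};

// Text model: content is flattened to a plain string
// (assistant turns go through getMessageTextContentWithoutThinking).
const textMessage = {
  role: "assistant",
  content: "The photo shows a lighthouse at dusk.",
};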
@@ -129,7 +141,7 @@
       "X-DashScope-SSE": shouldStream ? "enable" : "disable",
     };
-    const chatPath = this.path(Alibaba.ChatPath);
+    const chatPath = this.path(Alibaba.ChatPath(modelConfig.model));
     const chatPayload = {
       method: "POST",
       body: JSON.stringify(requestPayload),
@@ -162,7 +174,7 @@
             const json = JSON.parse(text);
             const choices = json.output.choices as Array<{
              message: {
-                content: string | null;
+                content: string | null | MultimodalContentForAlibaba[];
                tool_calls: ChatMessageTool[];
                reasoning_content: string | null;
              };
@@ -212,7 +224,9 @@
            } else if (content && content.length > 0) {
              return {
                isThinking: false,
-                content: content,
+                content: Array.isArray(content)
+                  ? content.map((item) => item.text).join(",")
+                  : content,
              };
            }
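Because the multimodal endpoint can stream array-valued content, the parser above flattens it by joining the `text` fields. A tiny sketch of that expression in isolation; the chunk value is invented, inferred from the MultimodalContentForAlibaba type rather than from the DashScope docs:

// Flattening an array-valued delta the same way the stream handler does.
const chunk: MultimodalContentForAlibaba[] = [{ text: "A" }, { text: "B" }];
const flattened = Array.isArray(chunk)
  ? chunk.map((item) => item.text).join(",")
  : chunk;
// flattened === "A,B"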

View File

@@ -221,7 +221,12 @@ export const ByteDance = {
 export const Alibaba = {
   ExampleEndpoint: ALIBABA_BASE_URL,
-  ChatPath: "v1/services/aigc/text-generation/generation",
+  ChatPath: (modelName: string) => {
+    if (modelName.includes("vl") || modelName.includes("omni")) {
+      return "v1/services/aigc/multimodal-generation/generation";
+    }
+    return `v1/services/aigc/text-generation/generation`;
+  },
 };
 export const Tencent = {
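The routing is driven purely by the model name. With the definition above:

// Illustrative calls against the new ChatPath helper:
Alibaba.ChatPath("qwen-vl-max");      // "v1/services/aigc/multimodal-generation/generation"
Alibaba.ChatPath("qwen-omni-turbo");  // "v1/services/aigc/multimodal-generation/generation"
Alibaba.ChatPath("qwen-max-0403");    // "v1/services/aigc/text-generation/generation"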
@@ -568,6 +573,9 @@ const alibabaModes = [
   "qwen-max-0403",
   "qwen-max-0107",
   "qwen-max-longcontext",
+  "qwen-omni-turbo",
+  "qwen-vl-plus",
+  "qwen-vl-max",
 ];
 const tencentModels = [

View File

@@ -92,6 +92,28 @@ export async function preProcessImageContent(
   return result;
 }
+export async function preProcessImageContentForAlibabaDashScope(
+  content: RequestMessage["content"],
+) {
+  if (typeof content === "string") {
+    return content;
+  }
+  const result = [];
+  for (const part of content) {
+    if (part?.type == "image_url" && part?.image_url?.url) {
+      try {
+        const url = await cacheImageToBase64Image(part?.image_url?.url);
+        result.push({ image: url });
+      } catch (error) {
+        console.error("Error processing image URL:", error);
+      }
+    } else {
+      result.push({ ...part });
+    }
+  }
+  return result;
+}
 const imageCaches: Record<string, string> = {};
 export function cacheImageToBase64Image(imageUrl: string) {
   if (imageUrl.includes(CACHE_URL_PREFIX)) {
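Putting the new helper to use: a sketch (the input array and URL are invented) of how OpenAI-style parts are converted. Only image_url parts are rewritten; other parts are shallow-copied unchanged, and the image URL is first turned into a base64 data URL by cacheImageToBase64Image:

async function demo() {
  const openAiStyleParts: MultimodalContent[] = [
    { type: "text", text: "Describe this image." },
    { type: "image_url", image_url: { url: "https://example.com/cat.png" } },
  ];
  const dashScopeParts =
    await preProcessImageContentForAlibabaDashScope(openAiStyleParts);
  // -> [
  //      { type: "text", text: "Describe this image." },
  //      { image: "data:image/png;base64,..." },
  //    ]
}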