feat: API supports image input

2025-05-25 07:00:23 +09:00 · 2023-12-05 13:44:15 +08:00 · 2023-12-05 13:44:15 +08:00 · 7f3d261fb2
commit 7f3d261fb2
parent 4c46de7d1d
4 changed files with 40 additions and 11 deletions
--- a/app/client/platforms/openai.ts
+++ b/app/client/platforms/openai.ts
@ -74,10 +74,28 @@ export class ChatGPTApi implements LLMApi {
  }

  async chat(options: ChatOptions) {
-    const messages = options.messages.map((v) => ({
-      role: v.role,
-      content: v.content,
-    }));
+    const messages = options.messages.map((v) => {
+      let message: {
+        role: string;
+        content: { type: string; text?: string; image_url?: { url: string } }[];
+      } = {
+        role: v.role,
+        content: [],
+      };
+      message.content.push({
+        type: "text",
+        text: v.content,
+      });
+      if (v.image_url) {
+        message.content.push({
+          type: "image_url",
+          image_url: {
+            url: v.image_url,
+          },
+        });
+      }
+      return message;
+    });

    const modelConfig = {
      ...useAppConfig.getState().modelConfig,
@ -95,6 +113,10 @@ export class ChatGPTApi implements LLMApi {
      presence_penalty: modelConfig.presence_penalty,
      frequency_penalty: modelConfig.frequency_penalty,
      top_p: modelConfig.top_p,
+      max_tokens:
+        modelConfig.model == "gpt-4-vision-preview"
+          ? modelConfig.max_tokens
+          : null,
      // max_tokens: Math.max(modelConfig.max_tokens, 1024),
      // Please do not ask me why max_tokens is not sent: there is no reason, this param is just shit, and I don't want to explain anymore.
    };
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@ -80,6 +80,7 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
  CHAT_PAGE_SIZE,
+  LAST_INPUT_IMAGE_KEY,
  LAST_INPUT_KEY,
  Path,
  REQUEST_TIMEOUT_MS,
@ -554,7 +555,8 @@ export function ChatActions(props: {
        />

        {config.pluginConfig.enable &&
-          /^gpt(?!.*03\d{2}$).*$/.test(currentModel) && (
+          /^gpt(?!.*03\d{2}$).*$/.test(currentModel) &&
+          currentModel != "gpt-4-vision-preview" && (
            <ChatAction
              onClick={switchUsePlugins}
              text={
@ -778,8 +780,11 @@ function _Chat() {
      return;
    }
    setIsLoading(true);
-    chatStore.onUserInput(userInput).then(() => setIsLoading(false));
+    chatStore
+      .onUserInput(userInput, userImage.base64)
+      .then(() => setIsLoading(false));
    localStorage.setItem(LAST_INPUT_KEY, userInput);
+    localStorage.setItem(LAST_INPUT_IMAGE_KEY, userImage);
    setUserInput("");
    setPromptHints([]);
    setUserImage(null);
@ -847,6 +852,7 @@ function _Chat() {
      !(e.metaKey || e.altKey || e.ctrlKey)
    ) {
      setUserInput(localStorage.getItem(LAST_INPUT_KEY) ?? "");
+      setUserImage(localStorage.getItem(LAST_INPUT_IMAGE_KEY));
      e.preventDefault();
      return;
    }
@ -1331,7 +1337,7 @@ function _Chat() {
                  )}
                  <div className={styles["chat-message-item"]}>
                    <Markdown
-                      imageBase64={isUser && userImage && userImage.base64}
+                      imageBase64={message.image_url}
                      content={message.content}
                      loading={
                        (message.preview || message.streaming) &&
--- a/app/constant.ts
+++ b/app/constant.ts
@ -57,6 +57,7 @@ export const NARROW_SIDEBAR_WIDTH = 100;
 export const ACCESS_CODE_PREFIX = "nk-";

 export const LAST_INPUT_KEY = "last-input";
+export const LAST_INPUT_IMAGE_KEY = "last-input-image";
 export const UNFINISHED_INPUT = (id: string) => "unfinished-input-" + id;

 export const STORAGE_KEY = "chatgpt-next-web";
--- a/app/store/chat.ts
+++ b/app/store/chat.ts
@ -274,7 +274,7 @@ export const useChatStore = createPersistStore(
        get().summarizeSession();
      },

-      async onUserInput(content: string) {
+      async onUserInput(content: string, image_url?: string) {
        const session = get().currentSession();
        const modelConfig = session.mask.modelConfig;

@ -284,8 +284,8 @@ export const useChatStore = createPersistStore(
        const userMessage: ChatMessage = createMessage({
          role: "user",
          content: userContent,
+          image_url: image_url,
        });
-
        const botMessage: ChatMessage = createMessage({
          role: "assistant",
          streaming: true,
@ -319,11 +319,11 @@ export const useChatStore = createPersistStore(
          session.messages.push(savedUserMessage);
          session.messages.push(botMessage);
        });
-
        if (
          config.pluginConfig.enable &&
          session.mask.usePlugins &&
-          allPlugins.length > 0
+          allPlugins.length > 0 &&
+          modelConfig.model != "gpt-4-vision-preview"
        ) {
          console.log("[ToolAgent] start");
          const pluginToolNames = allPlugins.map((m) => m.toolName);