mirror of
https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web.git
synced 2025-05-22 21:50:16 +09:00
146 lines
4.1 KiB
TypeScript
146 lines
4.1 KiB
TypeScript
import axiosMod, { AxiosStatic } from "axios";
|
|
import { WebPDFLoader } from "langchain/document_loaders/web/pdf";
|
|
import { MemoryVectorStore } from "langchain/vectorstores/memory";
|
|
import { Tool } from "@langchain/core/tools";
|
|
import {
|
|
RecursiveCharacterTextSplitter,
|
|
TextSplitter,
|
|
} from "langchain/text_splitter";
|
|
|
|
import { CallbackManagerForToolRun } from "@langchain/core/callbacks/manager";
|
|
import { BaseLanguageModel } from "langchain/dist/base_language";
|
|
import { formatDocumentsAsString } from "langchain/util/document";
|
|
import { Embeddings } from "langchain/dist/embeddings/base.js";
|
|
import { RunnableSequence } from "@langchain/core/runnables";
|
|
import { StringOutputParser } from "@langchain/core/output_parsers";
|
|
|
|
/**
 * Splits the raw tool input into a `[url, task]` pair.
 *
 * The input is expected to look like `"<url>","<task>"`. Only the FIRST comma
 * is treated as the separator, so a task that itself contains commas is kept
 * intact. When there is no comma the task defaults to the empty string.
 *
 * Each part is trimmed, then stripped of one leading `"`, one trailing `"`,
 * and one trailing `/`, and trimmed again.
 *
 * @param inputs - Raw comma separated tool input.
 * @returns A tuple of the cleaned URL and the cleaned task (possibly `""`).
 */
export const parseInputs = (inputs: string): [string, string] => {
  const clean = (raw: string): string => {
    let t = raw.trim();
    t = t.startsWith('"') ? t.slice(1) : t;
    t = t.endsWith('"') ? t.slice(0, -1) : t;
    t = t.endsWith("/") ? t.slice(0, -1) : t;
    return t.trim();
  };

  const separatorIndex = inputs.indexOf(",");
  if (separatorIndex === -1) {
    // No task supplied: return an empty task instead of `undefined` so the
    // declared tuple type holds at runtime.
    return [clean(inputs), ""];
  }

  return [
    clean(inputs.slice(0, separatorIndex)),
    // Everything after the first comma belongs to the task, commas included.
    clean(inputs.slice(separatorIndex + 1)),
  ];
};
|
|
|
|
/**
 * Downloads the resource at `baseUrl` and wraps its bytes in a PDF `Blob`.
 *
 * @param baseUrl - URL handed to `axios.get`.
 * @returns A `Blob` typed `application/pdf` containing the response body.
 * @throws Error with message `http response <status>` when axios reports an
 *   error that carries an HTTP status; any other request failure is rethrown
 *   unchanged.
 * @throws Error with message `returned page was not pdf file` when the
 *   response declares a content type other than `application/pdf`.
 */
const getPdfBlob = async (baseUrl: string): Promise<Blob> => {
  // The imported module may expose the client under `.default` or be the
  // client itself (presumably a CJS/ESM interop difference), so accept both.
  const axios = (
    "default" in axiosMod ? axiosMod.default : axiosMod
  ) as AxiosStatic;

  let response;
  try {
    response = await axios.get(baseUrl, {
      responseType: "arraybuffer",
    });
  } catch (e) {
    if (axios.isAxiosError(e) && e.response && e.response.status) {
      throw new Error(`http response ${e.response.status}`);
    }
    throw e;
  }

  const allowedContentTypes = ["application/pdf"];

  // The header can be missing or non-string; guard before calling `split`.
  const rawContentType: unknown = response.headers["content-type"];
  // Keep only the media type (drop `; charset=...` style parameters) and
  // normalize case/whitespace so e.g. "Application/PDF " is still accepted.
  const mediaType =
    typeof rawContentType === "string"
      ? (rawContentType.split(";")[0] ?? "").trim().toLowerCase()
      : "";

  // An absent/empty media type is let through, matching how an empty value
  // was already treated; only an explicit non-PDF type is rejected.
  if (mediaType && !allowedContentTypes.includes(mediaType)) {
    throw new Error("returned page was not pdf file");
  }

  return new Blob([response.data], { type: "application/pdf" });
};
|
|
|
|
/**
 * Concatenates the `pageContent` of every document, in order, with no
 * separator between them.
 *
 * @param docs - Objects exposing a `pageContent` property.
 * @returns The joined text, or `""` for an empty list.
 */
const getDocsText = (docs: any[]) => {
  return docs.reduce(
    (combined: string, doc) => combined + doc.pageContent,
    "",
  );
};
|
|
|
|
/**
 * Tool that downloads a PDF from a URL and asks the language model either to
 * summarize it or to answer a specific request about it.
 *
 * Input is a single string parsed by `parseInputs` into a URL and an optional
 * task. With no task the first four text chunks are summarized; with a task
 * the four most similar documents from an in-memory vector store are used as
 * context.
 */
export class PDFBrowser extends Tool {
  static lc_name() {
    return "PDFBrowser";
  }

  get lc_namespace() {
    return [...super.lc_namespace, "pdfbrowser"];
  }

  private readonly model: BaseLanguageModel;

  private readonly embeddings: Embeddings;

  private readonly textSplitter: TextSplitter;

  /**
   * @param model - Language model that produces the final answer.
   * @param embeddings - Embeddings used to build the vector store for
   *   task-specific lookups.
   */
  constructor(model: BaseLanguageModel, embeddings: Embeddings) {
    super();
    this.model = model;
    this.embeddings = embeddings;
    this.textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 2000,
      chunkOverlap: 200,
    });
  }

  /**
   * Fetches the PDF, selects context and queries the model.
   *
   * @param inputs - Raw tool input, see `description`.
   * @param runManager - Optional callback manager for child runs.
   * @returns The model's answer, or an error description string when the PDF
   *   could not be downloaded.
   * @ignore
   */
  async _call(
    inputs: string,
    runManager?: CallbackManagerForToolRun,
  ): Promise<string> {
    const [baseUrl, task] = parseInputs(inputs);
    // An empty task means the caller wants a summary.
    const doSummary = !task;

    let pdfBlob: Blob;
    try {
      pdfBlob = await getPdfBlob(baseUrl);
    } catch (e: unknown) {
      // Download problems are reported back as the tool's output rather than
      // thrown.
      if (e instanceof Error) {
        return e.toString();
      }
      return e ? String(e) : "There was a problem connecting to the site";
    }

    const loader = new WebPDFLoader(pdfBlob);
    const docs = await loader.load();

    let context: string;
    if (doSummary) {
      // Summary: take the first 4 chunks of the document text. The vector
      // store is deliberately not built here, since nothing would query it.
      const texts = await this.textSplitter.splitText(getDocsText(docs));
      context = texts.slice(0, 4).join("\n");
    } else {
      // Search: embed the loaded documents and grab the top 4 matches for the
      // task. Built only on this path so summaries do not pay for embeddings.
      const vectorStore = await MemoryVectorStore.fromDocuments(
        docs,
        this.embeddings,
      );
      const results = await vectorStore.similaritySearch(
        task,
        4,
        undefined,
        runManager?.getChild("vectorstore"),
      );
      context = formatDocumentsAsString(results);
    }

    const input = `Text:${context}\n\nI need ${
      doSummary ? "a summary" : task
    } from the above text.`;

    console.log("[pdf-browser]", input);

    const chain = RunnableSequence.from([this.model, new StringOutputParser()]);
    return chain.invoke(input, runManager?.getChild());
  }

  name = "pdf-browser";

  description = `useful for when you need to find something on or summarize a pdf file. input should be a comma separated list of "ONE valid http URL including protocol","what you want to find on the pdf page or empty string for a summary".`;
}
|