// ChatGPT-Next-Web/app/api/langchain-tools/pdf_browser.ts

import axiosMod, { AxiosStatic } from "axios";
import { WebPDFLoader } from "langchain/document_loaders/web/pdf";
import { MemoryVectorStore } from "langchain/vectorstores/memory";
import { Tool } from "@langchain/core/tools";
import {
  RecursiveCharacterTextSplitter,
  TextSplitter,
} from "langchain/text_splitter";

import { CallbackManagerForToolRun } from "@langchain/core/callbacks/manager";
import { BaseLanguageModel } from "langchain/dist/base_language";
import { formatDocumentsAsString } from "langchain/util/document";
import { Embeddings } from "langchain/dist/embeddings/base.js";
import { RunnableSequence } from "@langchain/core/runnables";
import { StringOutputParser } from "@langchain/core/output_parsers";

/**
 * Splits the raw tool input into a `[url, task]` pair.
 *
 * The expected input is `"<url>","<task>"` (quotes optional). Only the FIRST
 * separator splits the input, so a task that itself contains commas is kept
 * intact. When the URL is wrapped in double quotes and followed by a comma,
 * the closing quote marks the end of the URL, so a quoted URL may contain
 * commas as well.
 *
 * Each part is trimmed, stripped of one wrapping double quote at either end
 * and of one trailing `/`.
 *
 * @param inputs - Raw comma separated tool input.
 * @returns `[url, task]`; `task` is an empty string when none was supplied.
 */
export const parseInputs = (inputs: string): [string, string] => {
  // Normalizes one part: trim, drop one leading/trailing quote, drop one
  // trailing slash, then trim again.
  const clean = (part: string): string => {
    let t = part.trim();
    t = t.startsWith('"') ? t.slice(1) : t;
    t = t.endsWith('"') ? t.slice(0, -1) : t;
    t = t.endsWith("/") ? t.slice(0, -1) : t;
    return t.trim();
  };

  // `"<url>",<rest>` — only trusted when a task part actually follows, so an
  // input that is quoted as a whole (`"url,task"`) still takes the plain
  // first-comma path below.
  const quoted = /^\s*"([^"]*)"\s*(?:,([\s\S]*))?$/.exec(inputs);
  if (quoted && quoted[2] !== undefined) {
    return [clean(quoted[1] ?? ""), clean(quoted[2])];
  }

  const separatorIndex = inputs.indexOf(",");
  if (separatorIndex === -1) {
    // No task given: return "" so the declared tuple type holds at runtime.
    return [clean(inputs), ""];
  }

  return [
    clean(inputs.slice(0, separatorIndex)),
    clean(inputs.slice(separatorIndex + 1)),
  ];
};

/**
 * Downloads the resource at `baseUrl` and wraps its bytes in a `Blob` typed
 * as `application/pdf`.
 *
 * @param baseUrl - URL to fetch.
 * @returns A `Blob` holding the response body.
 * @throws Error `http response <status>` when axios reports an error that
 *   carries a response status; any other request error is rethrown as-is.
 * @throws Error `returned page was not pdf file` when the response declares a
 *   non-empty media type other than `application/pdf`.
 */
const getPdfBlob = async (baseUrl: string) => {
  // axios may be exposed either directly or under a `default` property
  // depending on how the module was loaded.
  const axios = (
    "default" in axiosMod ? axiosMod.default : axiosMod
  ) as AxiosStatic;
  let response;
  try {
    response = await axios.get(baseUrl, {
      responseType: "arraybuffer",
    });
  } catch (e) {
    if (axios.isAxiosError(e) && e.response && e.response.status) {
      throw new Error(`http response ${e.response.status}`);
    }
    throw e;
  }

  const allowedContentTypes = ["application/pdf"];

  // The header may be missing entirely; treat that like an empty media type
  // (accepted, as before) instead of crashing on `.split`.
  const rawContentType: unknown = response.headers["content-type"];
  // Keep only the media type (drop parameters such as `charset`) and
  // normalize it, since media types are case-insensitive.
  const mediaType =
    typeof rawContentType === "string"
      ? (rawContentType.split(";")[0] ?? "").trim().toLowerCase()
      : "";
  if (mediaType && !allowedContentTypes.includes(mediaType)) {
    throw new Error("returned page was not pdf file");
  }

  return new Blob([response.data], { type: "application/pdf" });
};

/**
 * Concatenates the `pageContent` of every document, in order and without any
 * separator between them.
 *
 * @param docs - Documents exposing a `pageContent` property.
 * @returns The joined text; an empty string for an empty list.
 */
const getDocsText = (docs: any[]) =>
  docs.reduce<string>((combined, doc) => combined + doc.pageContent, "");

/**
 * Tool that downloads a PDF from a URL and asks the model either to summarize
 * it or to answer a task/question about its content.
 */
export class PDFBrowser extends Tool {
  static lc_name() {
    return "PDFBrowser";
  }

  get lc_namespace() {
    return [...super.lc_namespace, "pdfbrowser"];
  }

  /** Model that produces the final answer from the selected context. */
  private model: BaseLanguageModel;

  /** Embeddings used to build the vector store for task searches. */
  private embeddings: Embeddings;

  /** Splitter used to chunk the PDF text when building a summary context. */
  private textSplitter: TextSplitter;

  /**
   * @param model - Language model invoked with the assembled prompt.
   * @param embeddings - Embeddings backing the similarity search.
   */
  constructor(model: BaseLanguageModel, embeddings: Embeddings) {
    super();
    this.model = model;
    this.embeddings = embeddings;
    this.textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 2000,
      chunkOverlap: 200,
    });
  }

  /**
   * Runs the tool.
   *
   * @param inputs - Comma separated `"<url>","<task>"`; an empty or missing
   *   task requests a summary.
   * @param runManager - Optional callback manager; child managers are passed
   *   to the similarity search and the model invocation.
   * @returns The model's answer, or the stringified error when the PDF could
   *   not be downloaded.
   * @ignore
   */
  async _call(inputs: string, runManager?: CallbackManagerForToolRun) {
    const [baseUrl, task] = parseInputs(inputs);
    const doSummary = !task;

    let pdfBlob;
    try {
      pdfBlob = await getPdfBlob(baseUrl);
    } catch (e) {
      // Download failures are reported back as text rather than thrown.
      if (e) {
        return e.toString();
      }
      return "There was a problem connecting to the site";
    }

    const loader = new WebPDFLoader(pdfBlob);
    const docs = await loader.load();

    let context;
    // if we want a summary, grab the first 4 chunks
    if (doSummary) {
      // Splitting is only needed for the summary context.
      const texts = await this.textSplitter.splitText(getDocsText(docs));
      context = texts.slice(0, 4).join("\n");
    }
    // for a search term, embed the documents and grab the top 4 matches
    else {
      // Build the vector store only here: embedding the whole PDF is wasted
      // work (and an extra failure point) when only a summary is requested.
      // NOTE(review): the documents are embedded as returned by the loader,
      // not re-split with `textSplitter` — confirm that is intended.
      const vectorStore = await MemoryVectorStore.fromDocuments(
        docs,
        this.embeddings,
      );
      const results = await vectorStore.similaritySearch(
        task,
        4,
        undefined,
        runManager?.getChild("vectorstore"),
      );
      context = formatDocumentsAsString(results);
    }

    const input = `Text:${context}\n\nI need ${
      doSummary ? "a summary" : task
    } from the above text.`;

    console.log("[pdf-browser]", input);

    const chain = RunnableSequence.from([this.model, new StringOutputParser()]);
    return chain.invoke(input, runManager?.getChild());
  }

  name = "pdf-browser";

  description = `useful for when you need to find something on or summarize a pdf file. input should be a comma separated list of "ONE valid http URL including protocol","what you want to find on the pdf page or empty string for a summary".`;
}