Modify ArxivAPIWrapper

1. Replace the raw XML input with parsed data. 2. Modify the prompts and default values of the zod request schema.
2025-05-28 16:40:20 +09:00 · 2025-01-14 17:04:49 +08:00 · 2025-01-14 17:04:49 +08:00 · 3f153fe8bd
commit 3f153fe8bd
parent 7618eaf20d
1 changed files with 105 additions and 15 deletions
--- a/app/api/langchain-tools/arxiv.ts
+++ b/app/api/langchain-tools/arxiv.ts
@ -1,5 +1,84 @@
 import { StructuredTool } from "@langchain/core/tools";
 import { z } from "zod";
+import { XMLParser } from "fast-xml-parser";
+
+// Credit: ArxivRetriever from Langchain.js
+interface ArxivEntry { // normalized form of one <entry> from the arxiv Atom feed
+  id: string; // the entry's id value, copied as-is
+  title: string; // title with whitespace runs collapsed to single spaces
+  summary: string; // summary with whitespace runs collapsed to single spaces
+  published: string; // the entry's published value, copied as-is
+  authors: string[]; // name of each author element
+  pdfUrl: string; // href of the link titled "pdf", else derived from id
+  links: any[]; // intended to hold the entry's link elements
+}
+/**
+ * Converts one parsed Atom `<entry>` object into an ArxivEntry.
+ *
+ * @param entry - entry object as produced by the XML parser; element
+ *   attributes are read under the "@_" prefix ("@_title", "@_href").
+ * @returns the normalized entry, with whitespace-collapsed title and summary.
+ */
+function parseArxivEntry(entry: any): ArxivEntry {
+  // Collapse newlines and indentation inside the text to single spaces.
+  // String() keeps this from throwing if the parser produced a non-string value.
+  const collapse = (text: unknown): string =>
+    text == null ? "" : String(text).replace(/\s+/g, " ").trim();
+  // A single child element arrives as an object, repeated ones as an array.
+  const toArray = (value: any): any[] => {
+    if (Array.isArray(value)) {
+      return value;
+    }
+    return value ? [value] : [];
+  };
+  const authors: string[] = toArray(entry.author).map(
+    (author: any) => author.name,
+  );
+  const links = toArray(entry.link);
+  // Prefer the explicit PDF link; otherwise derive it from the abstract URL.
+  let pdfUrl = entry.id.replace("/abs/", "/pdf/") + ".pdf";
+  const pdfLinkObj = links.find((link: any) => link["@_title"] === "pdf");
+  if (pdfLinkObj && pdfLinkObj["@_href"]) {
+    pdfUrl = pdfLinkObj["@_href"];
+  }
+  return {
+    id: entry.id,
+    title: collapse(entry.title),
+    summary: collapse(entry.summary),
+    published: entry.published,
+    authors,
+    pdfUrl,
+    // Return the normalized array; the parsed field is `entry.link`, so
+    // reading `entry.links` here always lost the links.
+    links,
+  };
+}
+
+/**
+ * Parses the Atom XML text returned by the arxiv API into a list of entries.
+ *
+ * @param response - raw XML text of the API response.
+ * @returns one ArxivEntry per `<entry>` element; empty when the feed has none.
+ * @throws Error when the parsed document has no `feed` root element.
+ */
+function parseArxivResponse(response: string): ArxivEntry[] {
+  // NOTE(review): `parseNodeValue` and `ignoreNameSpace` appear to be legacy
+  // fast-xml-parser option names; confirm against the installed version
+  // whether `parseTagValue` / `removeNSPrefix` are the intended options.
+  const options = {
+    attributeNamePrefix: "@_",
+    ignoreAttributes: false,
+    parseNodeValue: true,
+    parseAttributeValue: true,
+    trimValues: true,
+    ignoreNameSpace: true,
+  };
+  const parser = new XMLParser(options);
+  const results = parser.parse(response);
+  // Fail with a descriptive error instead of a TypeError on `undefined.entry`.
+  if (results == null || results.feed == null) {
+    throw new Error("Unexpected arxiv response: missing <feed> element");
+  }
+  const entries = results.feed.entry;
+  if (!entries) {
+    return [];
+  }
+  // A feed with exactly one <entry> is parsed as a plain object, not an array.
+  const entryList: any[] = Array.isArray(entries) ? entries : [entries];
+  return entryList.map((entry: any) => parseArxivEntry(entry));
+}
+/**
+ * Turns the XML text of an arxiv API response into a plain-text article list.
+ *
+ * @param query - despite its name, the raw XML response text (it is passed
+ *   straight to parseArxivResponse), not the user's search query.
+ * @returns a readable listing of the articles, or a "no article" notice.
+ */
+async function buildArxivResponse(query: string): Promise<string> {
+  // Parsing is synchronous; the function stays async only to keep its
+  // Promise-returning signature for existing callers.
+  const articles = parseArxivResponse(query);
+  if (articles.length === 0) {
+    return `Found no article in arxiv database`;
+  }
+  // Build each article from explicit lines so that source-code indentation
+  // and stray blank lines do not leak into the generated text.
+  const articleBlocks = articles.map((article) =>
+    [
+      `Title: ${article.title}`,
+      `Authors: ${article.authors.join(", ")}`,
+      `Summary: ${article.summary}`,
+      `Published: ${article.published}`,
+      `PDF: ${article.pdfUrl}`,
+    ].join("\n"),
+  );
+  return `Found these articles in arxiv database\n\n${articleBlocks.join("\n\n")}`;
+}

 export class ArxivAPIWrapper extends StructuredTool {
  get lc_namespace() {
@ -7,7 +86,8 @@ export class ArxivAPIWrapper extends StructuredTool {
  }

  name = "arxiv";
-  description = "Run Arxiv search and get the article information.";
+  description =
+    "Useful if you need to look for academical papers on arxiv. You can search by title, author, abstract, etc.";

  SORT_BY = {
    RELEVANCE: "relevance",
@ -21,23 +101,32 @@ export class ArxivAPIWrapper extends StructuredTool {
  };

  schema = z.object({
-    searchQuery: z
-      .string()
-      .describe("same as the search_query parameter rules of the arxiv API."),
+    searchQuery: z.string().describe("topic of your query"),
    sortBy: z
      .string()
-      .describe('can be "relevance", "lastUpdatedDate", "submittedDate".'),
+      .optional()
+      .default(this.SORT_BY.RELEVANCE)
+      .describe(
+        'sort rules, can be "relevance", "lastUpdatedDate", "submittedDate". Default by relevance if no' +
+          "additional request is made.",
+      ),
    sortOrder: z
      .string()
-      .describe('can be either "ascending" or "descending".'),
+      .optional()
+      .default(this.SORT_ORDER.DESCENDING)
+      .describe(
+        'order of sort, can be either "ascending" or "descending". Default by descending.',
+      ),
    start: z
      .number()
+      .optional()
      .default(0)
-      .describe("the index of the first returned result."),
+      .describe("the index of the first returned result. Default 0."),
    maxResults: z
      .number()
-      .default(10)
-      .describe("the number of results returned by the query."),
+      .optional()
+      .default(20)
+      .describe("the number of returned items. Default 20."),
  });

  async _call({
@ -62,17 +151,18 @@ export class ArxivAPIWrapper extends StructuredTool {
      );
    }
    try {
-      let url = `https://export.arxiv.org/api/query?search_query=${searchQuery}&start=${start}&max_results=${maxResults}${
+      let url = `https://export.arxiv.org/api/query?search_query=all:${searchQuery}&start=${start}&max_results=${maxResults}${
        sortBy ? `&sortBy=${sortBy}` : ""
      }${sortOrder ? `&sortOrder=${sortOrder}` : ""}`;
      console.log("[arxiv]", url);
-      const response = await fetch(url);
-      const data = await response.text();
-      console.log("[arxiv]", data);
-      return data;
+      const api_response = await fetch(url);
+      const response_text = await api_response.text();
+      const arxiv_data = await buildArxivResponse(response_text);
+      console.log("[arxiv]", arxiv_data);
+      return arxiv_data;
    } catch (e) {
      console.error("[arxiv]", e);
    }
-    return "not found";
+    return `Invalid request ${searchQuery}`;
  }
 }