From 3f153fe8bdbb2162066e523c780c813906597a38 Mon Sep 17 00:00:00 2001
From: bili
Date: Tue, 14 Jan 2025 17:04:49 +0800
Subject: [PATCH] Modify ArxivAPIWrapper

1. Replace raw xml input to parsed data
2. Modify prompts and default values of zod request
---
Review note: the hunks below were revised in review (see the inline
comments), so the post-image blob id on the "index" line predates this
revision. Context-line indentation was reconstructed assuming the file's
2-space Prettier layout; verify it against the target file before applying.

 app/api/langchain-tools/arxiv.ts | 154 ++++++++++++++++++++++++++++---
 1 file changed, 139 insertions(+), 15 deletions(-)

diff --git a/app/api/langchain-tools/arxiv.ts b/app/api/langchain-tools/arxiv.ts
index 7cca00eee..c953ec7d9 100644
--- a/app/api/langchain-tools/arxiv.ts
+++ b/app/api/langchain-tools/arxiv.ts
@@ -1,5 +1,116 @@
 import { StructuredTool } from "@langchain/core/tools";
 import { z } from "zod";
+import { XMLParser } from "fast-xml-parser";
+
+// Credit: ArxivRetriever from Langchain.js
+/** One article parsed from an arXiv Atom feed. */
+interface ArxivEntry {
+  id: string;
+  title: string;
+  summary: string;
+  published: string;
+  authors: string[];
+  pdfUrl: string;
+  links: any[];
+}
+
+/**
+ * Converts one parsed feed entry node into an ArxivEntry.
+ *
+ * `author` and `link` may each arrive as a single object or as an array, so
+ * both are normalised to arrays before use.
+ */
+function parseArxivEntry(entry: any): ArxivEntry {
+  // Collapse every run of whitespace to a single space.
+  const title = entry.title.replace(/\s+/g, " ").trim();
+  const summary = entry.summary.replace(/\s+/g, " ").trim();
+  const published = entry.published;
+
+  // Extract authors
+  let authors: string[] = [];
+  if (Array.isArray(entry.author)) {
+    authors = entry.author.map((author: any) => author.name);
+  } else if (entry.author) {
+    authors = [entry.author.name];
+  }
+  // Extract links
+  let links: any[] = [];
+  if (Array.isArray(entry.link)) {
+    links = entry.link;
+  } else if (entry.link) {
+    links = [entry.link];
+  }
+  // Extract PDF link: start from the URL derived from the entry id, then prefer
+  // the link titled "pdf" when the entry carries one.
+  let pdfUrl = entry.id.replace("/abs/", "/pdf/") + ".pdf";
+  const pdfLinkObj = links.find((link: any) => link["@_title"] === "pdf");
+  if (pdfLinkObj && pdfLinkObj["@_href"]) {
+    pdfUrl = pdfLinkObj["@_href"];
+  }
+  return {
+    id: entry.id,
+    title: title,
+    summary: summary,
+    published: published,
+    authors,
+    pdfUrl,
+    // Use the normalised array built above; the parsed node's key is `link`,
+    // so the previous `entry.links` was never populated.
+    links,
+  };
+}
+
+/**
+ * Parses an arXiv Atom response body into a list of entries.
+ *
+ * @param response Raw XML text of the feed.
+ * @returns The parsed entries, or an empty array when the feed has none.
+ */
+function parseArxivResponse(response: string): ArxivEntry[] {
+  // Option names follow the fast-xml-parser v4 XMLParser API; the v3 names
+  // `parseNodeValue` and `ignoreNameSpace` are not recognised by XMLParser.
+  const options = {
+    attributeNamePrefix: "@_",
+    ignoreAttributes: false,
+    parseTagValue: true,
+    parseAttributeValue: true,
+    trimValues: true,
+    removeNSPrefix: true,
+  };
+  const parser = new XMLParser(options);
+  const results = parser.parse(response);
+  const entries = results && results.feed ? results.feed.entry : undefined;
+  if (!entries) {
+    return [];
+  }
+  // A feed with exactly one <entry> parses to an object rather than an array.
+  const entryList = Array.isArray(entries) ? entries : [entries];
+  return entryList.map(parseArxivEntry);
+}
+
+/**
+ * Renders an arXiv Atom response as plain text.
+ *
+ * @param query Raw XML text of the feed.
+ * @returns One block per article, or a "no article" message for an empty feed.
+ */
+async function buildArxivResponse(query: string): Promise<string> {
+  const articleList = parseArxivResponse(query);
+  if (articleList.length === 0) {
+    return `Found no article in arxiv database`;
+  }
+  // One field per line; joining keeps source indentation out of the output.
+  const articleBlocks = articleList.map((article) =>
+    [
+      `Title: ${article.title}`,
+      `Authors: ${article.authors.join(", ")}`,
+      `Summary: ${article.summary}`,
+      `Published: ${article.published}`,
+      `PDF: ${article.pdfUrl}`,
+    ].join("\n"),
+  );
+  return `Found these articles in arxiv database\n\n${articleBlocks.join("\n\n")}`;
+}
 
 export class ArxivAPIWrapper extends StructuredTool {
   get lc_namespace() {
@@ -7,7 +118,8 @@ export class ArxivAPIWrapper extends StructuredTool {
   }
 
   name = "arxiv";
-  description = "Run Arxiv search and get the article information.";
+  description =
+    "Useful if you need to look for academical papers on arxiv. You can search by title, author, abstract, etc.";
 
   SORT_BY = {
     RELEVANCE: "relevance",
@@ -21,23 +133,32 @@ export class ArxivAPIWrapper extends StructuredTool {
   };
 
   schema = z.object({
-    searchQuery: z
-      .string()
-      .describe("same as the search_query parameter rules of the arxiv API."),
+    searchQuery: z.string().describe("topic of your query"),
     sortBy: z
       .string()
-      .describe('can be "relevance", "lastUpdatedDate", "submittedDate".'),
+      .optional()
+      .default(this.SORT_BY.RELEVANCE)
+      .describe(
+        'sort rules, can be "relevance", "lastUpdatedDate", "submittedDate". Default by relevance if no ' +
+          "additional request is made.",
+      ),
     sortOrder: z
       .string()
-      .describe('can be either "ascending" or "descending".'),
+      .optional()
+      .default(this.SORT_ORDER.DESCENDING)
+      .describe(
+        'order of sort, can be either "ascending" or "descending". Default by descending.',
+      ),
     start: z
       .number()
+      .optional()
       .default(0)
-      .describe("the index of the first returned result."),
+      .describe("the index of the first returned result. Default 0."),
     maxResults: z
       .number()
-      .default(10)
-      .describe("the number of results returned by the query."),
+      .optional()
+      .default(20)
+      .describe("the number of returned items. Default 20."),
   });
 
   async _call({
@@ -62,17 +183,20 @@ export class ArxivAPIWrapper extends StructuredTool {
       );
     }
     try {
-      let url = `https://export.arxiv.org/api/query?search_query=${searchQuery}&start=${start}&max_results=${maxResults}${
+      // Percent-encode the free-text query so characters such as "&" or "#"
+      // cannot change the structure of the request URL.
+      let url = `https://export.arxiv.org/api/query?search_query=all:${encodeURIComponent(searchQuery)}&start=${start}&max_results=${maxResults}${
         sortBy ? `&sortBy=${sortBy}` : ""
       }${sortOrder ? `&sortOrder=${sortOrder}` : ""}`;
       console.log("[arxiv]", url);
-      const response = await fetch(url);
-      const data = await response.text();
-      console.log("[arxiv]", data);
-      return data;
+      const api_response = await fetch(url);
+      const response_text = await api_response.text();
+      const arxiv_data = await buildArxivResponse(response_text);
+      console.log("[arxiv]", arxiv_data);
+      return arxiv_data;
     } catch (e) {
       console.error("[arxiv]", e);
     }
-    return "not found";
+    return `Invalid request ${searchQuery}`;
   }
 }