Implement lazy_load() for ArxivLoader
cbornet committed Mar 6, 2024
1 parent 81985b3 commit df896de
Showing 2 changed files with 22 additions and 10 deletions.
6 changes: 3 additions & 3 deletions libs/community/langchain_community/document_loaders/arxiv.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Optional
+from typing import Any, Iterator, List, Optional
 
 from langchain_core.documents import Document
 
@@ -23,8 +23,8 @@ def __init__(
             doc_content_chars_max=doc_content_chars_max, **kwargs
         )
 
-    def load(self) -> List[Document]:
-        return self.client.load(self.query)
+    def lazy_load(self) -> Iterator[Document]:
+        yield from self.client.lazy_load(self.query)
 
     def get_summaries_as_docs(self) -> List[Document]:
         return self.client.get_summaries_as_docs(self.query)
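With this change, callers of ArxivLoader can stream results instead of materializing the whole list up front. A minimal usage sketch, assuming the new method is available; the query string and load_max_docs value are only illustrative:

```python
from langchain_community.document_loaders import ArxivLoader

# lazy_load() yields Documents one at a time as each PDF is downloaded
# and parsed, instead of building the full list in memory like load().
loader = ArxivLoader(query="quantum computing", load_max_docs=2)
for doc in loader.lazy_load():
    print(doc.metadata["Title"], len(doc.page_content))
```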
26 changes: 19 additions & 7 deletions libs/community/langchain_community/utilities/arxiv.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import re
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterator, List, Optional
 
 from langchain_core.documents import Document
 from langchain_core.pydantic_v1 import BaseModel, root_validator
@@ -177,7 +177,22 @@ def load(self, query: str) -> List[Document]:
 
         Args:
             query: a plaintext search query
-        """  # noqa: E501
+        """
+        return list(self.lazy_load(query))
+
+    def lazy_load(self, query: str) -> Iterator[Document]:
+        """
+        Run Arxiv search and get the article texts plus the article meta information.
+        See https://lukasschwab.me/arxiv.py/index.html#Search
+
+        Returns: documents with the document.page_content in text format
+
+        Performs an arxiv search, downloads the top k results as PDFs, loads
+        them as Documents, and returns them.
+
+        Args:
+            query: a plaintext search query
+        """
         try:
             import fitz
         except ImportError:
@@ -200,9 +215,8 @@ def load(self, query: str) -> List[Document]:
             ).results()
         except self.arxiv_exceptions as ex:
             logger.debug("Error on arxiv: %s", ex)
-            return []
+            return
 
-        docs: List[Document] = []
         for result in results:
             try:
                 doc_file_name: str = result.download_pdf()
@@ -231,9 +245,7 @@ def load(self, query: str) -> List[Document]:
                 "Summary": result.summary,
                 **extra_metadata,
             }
-            doc = Document(
+            yield Document(
                 page_content=text[: self.doc_content_chars_max], metadata=metadata
             )
-            docs.append(doc)
             os.remove(doc_file_name)
-        return docs
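On the utility side, load() is now just list(self.lazy_load(query)), and an arXiv error makes the generator return without yielding anything, so eager callers still end up with an empty list. A hedged usage sketch of the wrapper itself; the query string and the top_k_results / doc_content_chars_max values are only examples:

```python
from langchain_community.utilities.arxiv import ArxivAPIWrapper

wrapper = ArxivAPIWrapper(top_k_results=2, doc_content_chars_max=4000)

# Lazy path: each Document is yielded right after its PDF is parsed
# and the temporary file has been removed.
for doc in wrapper.lazy_load("tree of thoughts"):
    print(doc.metadata["Summary"][:80])

# Eager path: load() simply drains the same generator into a list.
docs = wrapper.load("tree of thoughts")
assert isinstance(docs, list)
```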
