Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

community: Implement lazy_load() for ArxivLoader #18664

Merged
merged 1 commit on Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 3 additions & 3 deletions libs/community/langchain_community/document_loaders/arxiv.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
from typing import Any, List, Optional
from typing import Any, Iterator, List, Optional

from langchain_core.documents import Document

@@ -23,8 +23,8 @@ def __init__(
doc_content_chars_max=doc_content_chars_max, **kwargs
)

def load(self) -> List[Document]:
return self.client.load(self.query)
def lazy_load(self) -> Iterator[Document]:
yield from self.client.lazy_load(self.query)

def get_summaries_as_docs(self) -> List[Document]:
return self.client.get_summaries_as_docs(self.query)
26 changes: 19 additions & 7 deletions libs/community/langchain_community/utilities/arxiv.py
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
import logging
import os
import re
from typing import Any, Dict, List, Optional
from typing import Any, Dict, Iterator, List, Optional

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
@@ -177,7 +177,22 @@ def load(self, query: str) -> List[Document]:
Args:
query: a plaintext search query
""" # noqa: E501
"""
return list(self.lazy_load(query))

def lazy_load(self, query: str) -> Iterator[Document]:
"""
Run Arxiv search and get the article texts plus the article meta information.
See https://lukasschwab.me/arxiv.py/index.html#Search
Returns: documents with the document.page_content in text format
Performs an arxiv search, downloads the top k results as PDFs, loads
them as Documents, and returns them.
Args:
query: a plaintext search query
"""
try:
import fitz
except ImportError:
@@ -200,9 +215,8 @@ def load(self, query: str) -> List[Document]:
).results()
except self.arxiv_exceptions as ex:
logger.debug("Error on arxiv: %s", ex)
return []
return

docs: List[Document] = []
for result in results:
try:
doc_file_name: str = result.download_pdf()
@@ -231,9 +245,7 @@ def load(self, query: str) -> List[Document]:
"Summary": result.summary,
**extra_metadata,
}
doc = Document(
yield Document(
page_content=text[: self.doc_content_chars_max], metadata=metadata
)
docs.append(doc)
os.remove(doc_file_name)
return docs