Skip to content

Commit

Permalink
community[minor]: Implement lazy_load() for GitbookLoader (langchain-ai#18670)
Browse files Browse the repository at this point in the history

Integration test:
`tests/integration_tests/document_loaders/test_gitbook.py`
  • Loading branch information
cbornet authored and Dave Bechberger committed Mar 29, 2024
1 parent 847f24f commit 2fb3f7f
Showing 1 changed file with 10 additions and 10 deletions.
20 changes: 10 additions & 10 deletions libs/community/langchain_community/document_loaders/gitbook.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterator, List, Optional
from urllib.parse import urljoin, urlparse

from langchain_core.documents import Document
Expand Down Expand Up @@ -47,23 +47,23 @@ def __init__(
self.load_all_paths = load_all_paths
self.content_selector = content_selector

def load(self) -> List[Document]:
def lazy_load(self) -> Iterator[Document]:
"""Fetch text from one single GitBook page."""
if self.load_all_paths:
soup_info = self.scrape()
relative_paths = self._get_paths(soup_info)
urls = [urljoin(self.base_url, path) for path in relative_paths]
soup_infos = self.scrape_all(urls)
_documents = [
self._get_document(soup_info, url)
for soup_info, url in zip(soup_infos, urls)
]
for soup_info, url in zip(soup_infos, urls):
doc = self._get_document(soup_info, url)
if doc:
yield doc

else:
soup_info = self.scrape()
_documents = [self._get_document(soup_info, self.web_path)]
documents = [d for d in _documents if d]

return documents
doc = self._get_document(soup_info, self.web_path)
if doc:
yield doc

def _get_document(
self, soup: Any, custom_url: Optional[str] = None
Expand Down

0 comments on commit 2fb3f7f

Please sign in to comment.