Skip to content

Commit

Permalink
community[minor]: Implement lazy_load() for GitbookLoader (langchain-ai#18670)
Browse files Browse the repository at this point in the history

Integration test:
`tests/integration_tests/document_loaders/test_gitbook.py`
  • Loading branch information
cbornet authored and Dave Bechberger committed Mar 29, 2024
1 parent 847f24f commit 2fb3f7f
Showing 1 changed file with 10 additions and 10 deletions.
20 changes: 10 additions & 10 deletions libs/community/langchain_community/document_loaders/gitbook.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterator, List, Optional
from urllib.parse import urljoin, urlparse

from langchain_core.documents import Document
Expand Down Expand Up @@ -47,23 +47,23 @@ def __init__(
self.load_all_paths = load_all_paths
self.content_selector = content_selector

def load(self) -> List[Document]:
def lazy_load(self) -> Iterator[Document]:
"""Fetch text from one single GitBook page."""
if self.load_all_paths:
soup_info = self.scrape()
relative_paths = self._get_paths(soup_info)
urls = [urljoin(self.base_url, path) for path in relative_paths]
soup_infos = self.scrape_all(urls)
_documents = [
self._get_document(soup_info, url)
for soup_info, url in zip(soup_infos, urls)
]
for soup_info, url in zip(soup_infos, urls):
doc = self._get_document(soup_info, url)
if doc:
yield doc

else:
soup_info = self.scrape()
_documents = [self._get_document(soup_info, self.web_path)]
documents = [d for d in _documents if d]

return documents
doc = self._get_document(soup_info, self.web_path)
if doc:
yield doc

def _get_document(
self, soup: Any, custom_url: Optional[str] = None
Expand Down

0 comments on commit 2fb3f7f

Please sign in to comment.