langchain-ai · baskaryan · Apr 2, 2024 · Nov 12, 2023 · Nov 13, 2023 · Nov 13, 2023
diff --git a/libs/langchain/langchain/document_transformers/beautiful_soup_transformer.py b/libs/langchain/langchain/document_transformers/beautiful_soup_transformer.py
@@ -34,6 +34,7 @@ def transform_documents(
         unwanted_tags: List[str] = ["script", "style"],
         tags_to_extract: List[str] = ["p", "li", "div", "a"],
         remove_lines: bool = True,
+        remove_comments: bool = True,
         **kwargs: Any,
     ) -> Sequence[Document]:
         """
@@ -43,8 +44,8 @@ def transform_documents(
             documents: A sequence of Document objects containing HTML content.
             unwanted_tags: A list of tags to be removed from the HTML.
             tags_to_extract: A list of tags whose content will be extracted.
-            remove_lines: If set to True, unnecessary lines will be
-            removed from the HTML content.
+            remove_lines: If set to True, unnecessary lines will be removed.
+            remove_comments: If set to True, comments will be removed.
 
         Returns:
             A sequence of Document objects with transformed content.
@@ -54,7 +55,9 @@ def transform_documents(
 
             cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)
 
-            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)
+            cleaned_content = self.extract_tags(
+                cleaned_content, tags_to_extract, remove_comments
+            )
 
             if remove_lines:
                 cleaned_content = self.remove_unnecessary_lines(cleaned_content)
@@ -84,7 +87,7 @@ def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
         return str(soup)
 
     @staticmethod
-    def extract_tags(html_content: str, tags: List[str]) -> str:
+    def extract_tags(html_content: str, tags: List[str], remove_comments: bool) -> str:
         """
         Extract specific tags from a given HTML content.
 
@@ -102,7 +105,7 @@ def extract_tags(html_content: str, tags: List[str]) -> str:
         for element in soup.find_all():
             if element.name in tags:
                 # Extract all navigable strings recursively from this element.
-                text_parts += get_navigable_strings(element)
+                text_parts += get_navigable_strings(element, remove_comments)
 
                 # To avoid duplicate text, remove all descendants from the soup.
                 element.decompose()
@@ -134,12 +137,14 @@ async def atransform_documents(
         raise NotImplementedError
 
 
-def get_navigable_strings(element: Any) -> Iterator[str]:
-    from bs4 import NavigableString, Tag
+def get_navigable_strings(element: Any, remove_comments: bool) -> Iterator[str]:
+    from bs4 import Comment, NavigableString, Tag
 
     for child in cast(Tag, element).children:
+        if isinstance(child, Comment) and remove_comments:
+            continue
         if isinstance(child, Tag):
-            yield from get_navigable_strings(child)
+            yield from get_navigable_strings(child, remove_comments)
         elif isinstance(child, NavigableString):
             if (element.name == "a") and (href := element.get("href")):
                 yield f"{child.strip()} ({href})"

diff --git a/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py b/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py
@@ -230,3 +230,37 @@ def test_invalid_html() -> None:
     )
     assert docs_transformed[0].page_content == "First heading."
     assert docs_transformed[1].page_content == ""
+
+
+@pytest.mark.requires("bs4")
+def test_remove_comments_by_default() -> None:
+    """HTML comments are dropped when `remove_comments` is left at its default."""
+    bs_transformer = BeautifulSoupTransformer()
+    # Well-formed fixture: the comment sits directly inside the extracted tag.
+    html_with_comments = (
+        "<html><!-- Google tag (gtag.js) --><p>First paragraph.</p></html>"
+    )
+    documents = [Document(page_content=html_with_comments)]
+
+    docs_transformed = bs_transformer.transform_documents(
+        documents, tags_to_extract=["html"]
+    )
+    assert docs_transformed[0].page_content == "First paragraph."
+
+
+@pytest.mark.requires("bs4")
+def test_optionally_do_not_remove_comments() -> None:
+    """Comment text is kept in the output when `remove_comments=False`."""
+    bs_transformer = BeautifulSoupTransformer()
+    # The comment precedes the paragraph, so its text comes first in the output.
+    html_with_comments = (
+        "<html><!-- Google tag (gtag.js) --><p>First paragraph.</p></html>"
+    )
+    documents = [Document(page_content=html_with_comments)]
+
+    docs_transformed = bs_transformer.transform_documents(
+        documents,
+        tags_to_extract=["html"],
+        remove_comments=False,
+    )
+    assert docs_transformed[0].page_content == "Google tag (gtag.js) First paragraph."