Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add remove_comments option (default True): do not extract html comments #13259

Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def transform_documents(
unwanted_tags: List[str] = ["script", "style"],
tags_to_extract: List[str] = ["p", "li", "div", "a"],
remove_lines: bool = True,
remove_comments: bool = True,
**kwargs: Any,
) -> Sequence[Document]:
"""
Expand All @@ -43,8 +44,8 @@ def transform_documents(
documents: A sequence of Document objects containing HTML content.
unwanted_tags: A list of tags to be removed from the HTML.
tags_to_extract: A list of tags whose content will be extracted.
remove_lines: If set to True, unnecessary lines will be
removed from the HTML content.
remove_lines: If set to True, unnecessary lines will be removed.
remove_comments: If set to True, comments will be removed.

Returns:
A sequence of Document objects with transformed content.
Expand All @@ -54,7 +55,9 @@ def transform_documents(

cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)
cleaned_content = self.extract_tags(
cleaned_content, tags_to_extract, remove_comments
)

if remove_lines:
cleaned_content = self.remove_unnecessary_lines(cleaned_content)
Expand Down Expand Up @@ -84,7 +87,7 @@ def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
return str(soup)

@staticmethod
def extract_tags(html_content: str, tags: List[str]) -> str:
def extract_tags(html_content: str, tags: List[str], remove_comments: bool) -> str:
"""
Extract specific tags from a given HTML content.

Expand All @@ -102,7 +105,7 @@ def extract_tags(html_content: str, tags: List[str]) -> str:
for element in soup.find_all():
if element.name in tags:
# Extract all navigable strings recursively from this element.
text_parts += get_navigable_strings(element)
text_parts += get_navigable_strings(element, remove_comments)

# To avoid duplicate text, remove all descendants from the soup.
element.decompose()
Expand Down Expand Up @@ -134,12 +137,14 @@ async def atransform_documents(
raise NotImplementedError


def get_navigable_strings(element: Any) -> Iterator[str]:
from bs4 import NavigableString, Tag
def get_navigable_strings(element: Any, remove_comments: bool) -> Iterator[str]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: since this is technically a public method could we avoid breaking changes and give remove_comments a default value?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a public method we introduced only a few days ago, so probably fine to leave as is, but up to you

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the fast review. Good remark. For me this method get_navigable_strings should not really be public (it seems only intended to be used deep inside this code, not generally ?).

Could we make it more "private" (not really private in Python, but anyway), and bring inside the BeautifulSoupTransformer class? Maybe as a static method inside the class, like the other methods?

Then we could also do the import only once in the init, or re-use the import bs4 command that is already present there?

If you think that is better, I could try to adapt the code in that direction (to a class_method inside the class).

My more fundamental question on "breaking interface" is the question if the "remove_comments" should be default True , like in my current proposal (but I am open to the alternatives as well ...).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

id minorly prefer to make it default to False (to your point about backwards compat) and also to add a default here (seems easy)

from bs4 import Comment, NavigableString, Tag

for child in cast(Tag, element).children:
if isinstance(child, Comment) and remove_comments:
continue
if isinstance(child, Tag):
yield from get_navigable_strings(child)
yield from get_navigable_strings(child, remove_comments)
elif isinstance(child, NavigableString):
if (element.name == "a") and (href := element.get("href")):
yield f"{child.strip()} ({href})"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,37 @@ def test_invalid_html() -> None:
)
assert docs_transformed[0].page_content == "First heading."
assert docs_transformed[1].page_content == ""


@pytest.mark.requires("bs4")
def test_remove_comments_by_default() -> None:
bs_transformer = BeautifulSoupTransformer()
html_with_comments = (
"<html><!-- Google tag (gtag.js) --><p>First paragraph.</p</html>"
)
documents = [
Document(page_content=html_with_comments),
]

docs_transformed = bs_transformer.transform_documents(
documents, tags_to_extract=["html"]
)
assert docs_transformed[0].page_content == "First paragraph."


@pytest.mark.requires("bs4")
def test_optionally_do_not_remove_comments() -> None:
bs_transformer = BeautifulSoupTransformer()
html_with_comments = (
"<html><!-- Google tag (gtag.js) --><p>First paragraph.</p</html>"
)
documents = [
Document(page_content=html_with_comments),
]

docs_transformed = bs_transformer.transform_documents(
documents,
tags_to_extract=["html"],
remove_comments=False,
)
assert docs_transformed[0].page_content == "Google tag (gtag.js) First paragraph."