Skip to content

Commit

Permalink
community[patch]: Add remove_comments option (default True): do not e…
Browse files Browse the repository at this point in the history
…xtract html comments (langchain-ai#13259)

- **Description:** add `remove_comments` option (default: True): do not
extract html _comments_,
  - **Issue:** None,
  - **Dependencies:** None,
  - **Tag maintainer:** @nfcampos ,
  - **Twitter handle:** peter_v

I ran `make format`, `make lint` and `make test`.

Discussion: In my use case, I prefer not to have the comments in the
extracted text:
* e.g. from a Google tag that is added in the html as comment
* e.g. content that the authors have temporarily hidden to make it non
visible to the regular reader

Removing the comments makes the extracted text more like the text
intended to be seen by the reader.


**Choice to make:** do we prefer to make the default for this
`remove_comments` option to be True or False?
I have changed it to True in a second commit, since that is how I would
prefer to use it by default. Have the
cleaned text (without technical Google tags etc.) and also closer to the
actually visible and intended content.
I am not sure what is best aligned with the conventions of langchain in
general ...


INITIAL VERSION (new version above):
~**Choice to make:** do we prefer to make the default for this
`ignore_comments` option to be True or False?
I have set it to False now to be backwards compatible. On the other
hand, I would use it mostly with True.
I am not sure what is best aligned with the conventions of langchain in
general ...~

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
  • Loading branch information
2 people authored and marlenezw committed Apr 2, 2024
1 parent c95a280 commit 1f33504
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def transform_documents(
unwanted_tags: List[str] = ["script", "style"],
tags_to_extract: List[str] = ["p", "li", "div", "a"],
remove_lines: bool = True,
*,
remove_comments: bool = False,
**kwargs: Any,
) -> Sequence[Document]:
"""
Expand All @@ -45,8 +47,8 @@ def transform_documents(
documents: A sequence of Document objects containing HTML content.
unwanted_tags: A list of tags to be removed from the HTML.
tags_to_extract: A list of tags whose content will be extracted.
remove_lines: If set to True, unnecessary lines will be
removed from the HTML content.
remove_lines: If set to True, unnecessary lines will be removed.
remove_comments: If set to True, comments will be removed.
Returns:
A sequence of Document objects with transformed content.
Expand All @@ -56,7 +58,9 @@ def transform_documents(

cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)
cleaned_content = self.extract_tags(
cleaned_content, tags_to_extract, remove_comments=remove_comments
)

if remove_lines:
cleaned_content = self.remove_unnecessary_lines(cleaned_content)
Expand Down Expand Up @@ -86,7 +90,9 @@ def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
return str(soup)

@staticmethod
def extract_tags(html_content: str, tags: List[str]) -> str:
def extract_tags(
html_content: str, tags: List[str], *, remove_comments: bool = False
) -> str:
"""
Extract specific tags from a given HTML content.
Expand All @@ -104,7 +110,9 @@ def extract_tags(html_content: str, tags: List[str]) -> str:
for element in soup.find_all():
if element.name in tags:
# Extract all navigable strings recursively from this element.
text_parts += get_navigable_strings(element)
text_parts += get_navigable_strings(
element, remove_comments=remove_comments
)

# To avoid duplicate text, remove all descendants from the soup.
element.decompose()
Expand Down Expand Up @@ -136,7 +144,9 @@ async def atransform_documents(
raise NotImplementedError


def get_navigable_strings(element: Any) -> Iterator[str]:
def get_navigable_strings(
element: Any, *, remove_comments: bool = False
) -> Iterator[str]:
"""Get all navigable strings from a BeautifulSoup element.
Args:
Expand All @@ -146,11 +156,13 @@ def get_navigable_strings(element: Any) -> Iterator[str]:
A generator of strings.
"""

from bs4 import NavigableString, Tag
from bs4 import Comment, NavigableString, Tag

for child in cast(Tag, element).children:
if isinstance(child, Comment) and remove_comments:
continue
if isinstance(child, Tag):
yield from get_navigable_strings(child)
yield from get_navigable_strings(child, remove_comments=remove_comments)
elif isinstance(child, NavigableString):
if (element.name == "a") and (href := element.get("href")):
yield f"{child.strip()} ({href})"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,36 @@ def test_invalid_html() -> None:
)
assert docs_transformed[0].page_content == "First heading."
assert docs_transformed[1].page_content == ""


@pytest.mark.requires("bs4")
def test_remove_comments() -> None:
    """With remove_comments=True, HTML comment text is excluded from output."""
    bs_transformer = BeautifulSoupTransformer()
    # Fixture fix: original had a malformed closing tag "</p" (missing ">"),
    # which made the test depend on bs4's error recovery rather than the
    # comment-removal behavior under test.
    html_with_comments = (
        "<html><!-- Google tag (gtag.js) --><p>First paragraph.</p></html>"
    )
    documents = [
        Document(page_content=html_with_comments),
    ]

    docs_transformed = bs_transformer.transform_documents(
        documents, tags_to_extract=["html"], remove_comments=True
    )
    # The comment's text must not leak into the extracted content.
    assert docs_transformed[0].page_content == "First paragraph."


@pytest.mark.requires("bs4")
def test_do_not_remove_comments() -> None:
    """Without remove_comments (default False), comment text is kept in output."""
    bs_transformer = BeautifulSoupTransformer()
    # Fixture fix: original had a malformed closing tag "</p" (missing ">"),
    # which made the test depend on bs4's error recovery rather than the
    # default comment-handling behavior under test.
    html_with_comments = (
        "<html><!-- Google tag (gtag.js) --><p>First paragraph.</p></html>"
    )
    documents = [
        Document(page_content=html_with_comments),
    ]

    docs_transformed = bs_transformer.transform_documents(
        documents,
        tags_to_extract=["html"],
    )
    # Default behavior: the comment's text appears alongside the tag text.
    assert docs_transformed[0].page_content == "Google tag (gtag.js) First paragraph."

0 comments on commit 1f33504

Please sign in to comment.