Skip to content

Commit

Permalink
community[minor]: use jq schema for content_key in json_loader (langc…
Browse files Browse the repository at this point in the history
…hain-ai#18003)

### Description
Changed the value specified for `content_key` in JSONLoader from a
single key to a value based on jq schema.
I opened a [similar
PR](langchain-ai#11255) before, but it
accumulated several conflicts because of the architectural changes that
came with the stable release, so I re-created it here to fit the new architecture.

### Why
For JSON data like the following, where `page_content` should come from
`.data[].attributes.message` and the metadata from `.data[].id` or
`.data[].attributes.tags`, etc., the `content_key` must also be able to
traverse the nested JSON structure.

<details>
<summary>sample json data</summary>

```json
{
  "data": [
    {
      "attributes": {
        "message": "message1",
        "tags": [
          "tag1"
        ]
      },
      "id": "1"
    },
    {
      "attributes": {
        "message": "message2",
        "tags": [
          "tag2"
        ]
      },
      "id": "2"
    }
  ]
}
```

</details>

<details>
<summary>sample code</summary>

```python
def metadata_func(record: dict, metadata: dict) -> dict:
    """Fill ``metadata`` in place from one extracted record and return it.

    ``source`` is reset to ``None``; ``id`` is copied from the record's top
    level and ``tags`` from its nested ``attributes`` object. Missing ``id``
    or ``tags`` become ``None``; a missing ``attributes`` raises ``KeyError``.
    """
    metadata["source"] = None
    metadata["id"] = record.get("id")
    attributes = record["attributes"]
    metadata["tags"] = attributes.get("tags")
    return metadata

sample_file = "sample1.json"
loader = JSONLoader(
    file_path=sample_file,
    jq_schema=".data[]",
    content_key=".attributes.message", ## content_key is parsable into jq schema
    is_content_key_jq_parsable=True, ## this is added parameter
    metadata_func=metadata_func
)

data = loader.load()
data
```

</details>

### Dependencies
none

### Twitter handle
[kzk_maeda](https://twitter.com/kzk_maeda)
  • Loading branch information
kzk-maeda authored and Dave Bechberger committed Mar 29, 2024
1 parent 16419aa commit 4ac4195
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 4 deletions.
52 changes: 52 additions & 0 deletions docs/docs/modules/data_connection/document_loaders/json.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,58 @@ pprint(data)

</CodeOutputBlock>

### JSON file with jq schema `content_key`

To extract the document content with a jq expression instead of a plain key, set `is_content_key_jq_parsable=True`.
In that case `content_key` must be a valid jq expression; it is evaluated against each object produced by `jq_schema`.

```python
file_path = './sample.json'
pprint(Path(file_path).read_text())
```

<CodeOutputBlock lang="python">

```json
{"data": [
{"attributes": {
"message": "message1",
"tags": [
"tag1"]},
"id": "1"},
{"attributes": {
"message": "message2",
"tags": [
"tag2"]},
"id": "2"}]}
```

</CodeOutputBlock>


```python
loader = JSONLoader(
file_path=file_path,
jq_schema=".data[]",
content_key=".attributes.message",
is_content_key_jq_parsable=True,
)

data = loader.load()
```

```python
pprint(data)
```

<CodeOutputBlock lang="python">

```
[Document(page_content='message1', metadata={'source': '/path/to/sample.json', 'seq_num': 1}),
Document(page_content='message2', metadata={'source': '/path/to/sample.json', 'seq_num': 2})]
```

</CodeOutputBlock>

## Extracting metadata

Expand Down
36 changes: 32 additions & 4 deletions libs/community/langchain_community/document_loaders/json_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def __init__(
file_path: Union[str, Path],
jq_schema: str,
content_key: Optional[str] = None,
is_content_key_jq_parsable: Optional[bool] = False,
metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
text_content: bool = True,
json_lines: bool = False,
Expand All @@ -31,8 +32,16 @@ def __init__(
file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
jq_schema (str): The jq schema to use to extract the data or text from
the JSON.
content_key (str): The key to use to extract the content from the JSON if
the jq_schema results to a list of objects (dict).
content_key (str): The key to use to extract the content from
the JSON if the jq_schema results to a list of objects (dict).
If is_content_key_jq_parsable is True, this has to be a jq compatible
schema. If is_content_key_jq_parsable is False, this should be a simple
string key.
is_content_key_jq_parsable (bool): A flag to determine if
content_key is parsable by jq or not. If True, content_key is
treated as a jq schema and compiled accordingly. If False or if
content_key is None, content_key is used as a simple string.
Default is False.
metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
object extracted by the jq_schema and the default metadata and returns
a dict of the updated metadata.
Expand All @@ -43,13 +52,16 @@ def __init__(
"""
try:
import jq # noqa:F401

self.jq = jq
except ImportError:
raise ImportError(
"jq package not found, please install it with `pip install jq`"
)

self.file_path = Path(file_path).resolve()
self._jq_schema = jq.compile(jq_schema)
self._is_content_key_jq_parsable = is_content_key_jq_parsable
self._content_key = content_key
self._metadata_func = metadata_func
self._text_content = text_content
Expand Down Expand Up @@ -90,7 +102,11 @@ def _parse(self, content: str, docs: List[Document]) -> None:
def _get_text(self, sample: Any) -> str:
"""Convert sample to string format"""
if self._content_key is not None:
content = sample.get(self._content_key)
if self._is_content_key_jq_parsable:
compiled_content_key = self.jq.compile(self._content_key)
content = compiled_content_key.input(sample).first()
else:
content = sample[self._content_key]
else:
content = sample

Expand Down Expand Up @@ -125,18 +141,30 @@ def _get_metadata(

def _validate_content_key(self, data: Any) -> None:
    """Check that the content key can be resolved on the first record.

    Args:
        data: Result of applying the jq schema to the input; only its first
            output (``data.first()``) is inspected.

    Raises:
        ValueError: If the first record is not a dict, or if the content key
            (a plain dict key, or a jq expression when
            ``is_content_key_jq_parsable`` is set) yields ``None`` for it.
    """
    sample = data.first()
    if not isinstance(sample, dict):
        raise ValueError(
            "Expected the jq schema to result in a list of objects (dict), "
            f"so sample must be a dict but got `{type(sample)}`"
        )

    if self._is_content_key_jq_parsable:
        # Inspect the decoded first output rather than ``.text()``: the text
        # form is a serialized string, so an ``is None`` test on it can never
        # be true and a missing key would slip through validation.
        extracted = self.jq.compile(self._content_key).input(sample).first()
        if extracted is None:
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict) "
                f"with the key `{self._content_key}` which should be parsable by jq"
            )
    elif sample.get(self._content_key) is None:
        raise ValueError(
            "Expected the jq schema to result in a list of objects (dict) "
            f"with the key `{self._content_key}`"
        )

def _validate_metadata_func(self, data: Any) -> None:
"""Check if the metadata_func output is valid"""
Expand Down
120 changes: 120 additions & 0 deletions libs/community/tests/unit_tests/document_loaders/test_json_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,123 @@ def metadata_func(record: Dict, metadata: Dict) -> Dict:
result = loader.load()

assert result == expected_docs


@pytest.mark.parametrize(
    "params",
    (
        {"jq_schema": ".[].text"},
        {"jq_schema": ".[]", "content_key": "text"},
        {
            "jq_schema": ".[]",
            "content_key": ".text",
            "is_content_key_jq_parsable": True,
        },
    ),
)
def test_load_json_with_jq_parsable_content_key(
    params: Dict, mocker: MockerFixture
) -> None:
    """Each parametrized way of selecting ``text`` must load the same documents."""
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(page_content=text, metadata={"source": file_path, "seq_num": seq})
        for seq, text in enumerate(("value1", "value2"), start=1)
    ]

    # Stand-in for the file contents returned by the patched ``Path.open``.
    fake_file = io.StringIO(
        """
[{"text": "value1"}, {"text": "value2"}]
"""
    )
    mocker.patch("pathlib.Path.open", return_value=fake_file)

    result = JSONLoader(file_path=file_path, json_lines=True, **params).load()

    assert result == expected_docs


def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None:
    """A nested jq ``content_key`` yields one document per ``.data[]`` entry."""
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(page_content=message, metadata={"source": file_path, "seq_num": seq})
        for seq, message in enumerate(("message1", "message2"), start=1)
    ]

    # Stand-in for the file contents returned by the patched ``Path.open``.
    fake_file = io.StringIO(
        """
{"data": [
{"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
{"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
"""
    )
    mocker.patch("pathlib.Path.open", return_value=fake_file)

    loader_kwargs = {
        "file_path": file_path,
        "jq_schema": ".data[]",
        "content_key": ".attributes.message",
        "is_content_key_jq_parsable": True,
    }
    result = JSONLoader(**loader_kwargs).load()

    assert result == expected_docs


def test_load_json_with_nested_jq_parsable_content_key_with_metadata(
    mocker: MockerFixture,
) -> None:
    """A nested jq ``content_key`` combined with ``metadata_func`` keeps both."""
    file_path = "/workspaces/langchain/test.json"
    records = (
        ("message1", "1", ["tag1"]),
        ("message2", "2", ["tag2"]),
    )
    expected_docs = [
        Document(
            page_content=message,
            metadata={
                "source": file_path,
                "seq_num": seq,
                "id": record_id,
                "tags": tags,
            },
        )
        for seq, (message, record_id, tags) in enumerate(records, start=1)
    ]

    # Stand-in for the file contents returned by the patched ``Path.open``.
    fake_file = io.StringIO(
        """
{"data": [
{"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
{"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
"""
    )
    mocker.patch("pathlib.Path.open", return_value=fake_file)

    def _metadata_func(record: dict, metadata: dict) -> dict:
        # Copy the record's id and nested tags into the default metadata.
        metadata["id"] = record.get("id")
        metadata["tags"] = record["attributes"].get("tags")
        return metadata

    loader_kwargs = {
        "file_path": file_path,
        "jq_schema": ".data[]",
        "content_key": ".attributes.message",
        "is_content_key_jq_parsable": True,
        "metadata_func": _metadata_func,
    }
    result = JSONLoader(**loader_kwargs).load()

    assert result == expected_docs

0 comments on commit 4ac4195

Please sign in to comment.