Skip to content

Commit

Permalink
community[minor]: add Yuque document loader (langchain-ai#17924)
Browse files Browse the repository at this point in the history
This pull request adds support for loading documents from Yuque with LangChain.

Yuque is a professional cloud-based knowledge base for team
collaboration in documentation.

Website: https://www.yuque.com
OpenAPI: https://www.yuque.com/yuque/developer/openapi
  • Loading branch information
Dounx authored and thebhulawat committed Mar 6, 2024
1 parent ae81024 commit fe9a8d4
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 0 deletions.
77 changes: 77 additions & 0 deletions docs/docs/integrations/document_loaders/yuque.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "66a7777e",
"metadata": {},
"source": [
"# Yuque\n",
"\n",
">[Yuque](https://www.yuque.com/) is a professional cloud-based knowledge base for team collaboration in documentation.\n",
"\n",
"This notebook covers how to load documents from `Yuque`.\n",
"\n",
"You can obtain the personal access token by clicking on your personal avatar in the [Personal Settings](https://www.yuque.com/settings/tokens) page."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ec8a3b3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_community.document_loaders import YuqueLoader"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"loader = YuqueLoader(access_token=\"<your_personal_access_token>\")"
],
"metadata": {
"collapsed": false
},
"id": "2ea958f0327ed6e8"
},
{
"cell_type": "code",
"execution_count": null,
"id": "3470dadf",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@
GoogleApiYoutubeLoader,
YoutubeLoader,
)
from langchain_community.document_loaders.yuque import YuqueLoader

# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader
Expand Down Expand Up @@ -421,4 +422,5 @@
"XorbitsLoader",
"YoutubeAudioLoader",
"YoutubeLoader",
"YuqueLoader",
]
92 changes: 92 additions & 0 deletions libs/community/langchain_community/document_loaders/yuque.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import re
from typing import Dict, Iterator, List

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class YuqueLoader(BaseLoader):
    """Load documents from `Yuque`.

    The loader looks up the user that owns ``access_token``, lists that
    user's repositories ("books"), then fetches and parses every document of
    every book via the ``/api/v2`` endpoints under ``api_url``.
    """

    # Empty named-anchor tags removed from document bodies. The attribute
    # value is matched with ``[^"]*`` rather than a greedy ``.*`` so that two
    # anchors on the same line are removed individually instead of swallowing
    # the text between them.
    _ANCHOR_TAG = re.compile(r'<a name="[^"]*"></a>')
    # ``<br>``, ``<br/>`` and ``<br />`` tags removed from document bodies.
    _LINE_BREAK_TAG = re.compile(r"<br\s*/?>")

    def __init__(self, access_token: str, api_url: str = "https://www.yuque.com"):
        """Initialize with Yuque access_token and api_url.

        Args:
            access_token: Personal access token - see
                https://www.yuque.com/settings/tokens.
            api_url: Yuque API url.
        """
        self.access_token = access_token
        self.api_url = api_url

    @property
    def headers(self) -> Dict[str, str]:
        """HTTP headers sent with every request, including the auth token."""
        return {
            "Content-Type": "application/json",
            "X-Auth-Token": self.access_token,
        }

    def get_user_id(self) -> int:
        """Return the ``id`` of the user the access token belongs to."""
        url = f"{self.api_url}/api/v2/user"
        response = self.http_get(url=url)

        return response["data"]["id"]

    def get_books(self, user_id: int) -> List[Dict]:
        """Return the repositories listed for ``user_id``.

        NOTE(review): only the single response returned by the endpoint is
        read; confirm whether the API paginates this listing.
        """
        url = f"{self.api_url}/api/v2/users/{user_id}/repos"
        response = self.http_get(url=url)

        return response["data"]

    def get_document_ids(self, book_id: int) -> List[int]:
        """Return the ids of the documents listed for ``book_id``.

        NOTE(review): only the single response returned by the endpoint is
        read; confirm whether the API paginates this listing.
        """
        url = f"{self.api_url}/api/v2/repos/{book_id}/docs"
        response = self.http_get(url=url)

        return [document["id"] for document in response["data"]]

    def get_document(self, book_id: int, document_id: int) -> Dict:
        """Return the ``data`` payload of one document of ``book_id``."""
        url = f"{self.api_url}/api/v2/repos/{book_id}/docs/{document_id}"
        response = self.http_get(url=url)

        return response["data"]

    def parse_document(self, document: Dict) -> Document:
        """Convert a raw document payload into a ``Document``.

        The cleaned ``body`` becomes the page content; ``title``,
        ``description``, ``created_at`` and ``updated_at`` are copied into
        the metadata.

        Raises:
            KeyError: If any of those keys is missing from ``document``.
        """
        content = self.parse_document_body(document["body"])
        metadata = {
            "title": document["title"],
            "description": document["description"],
            "created_at": document["created_at"],
            "updated_at": document["updated_at"],
        }

        return Document(page_content=content, metadata=metadata)

    @staticmethod
    def parse_document_body(body: str) -> str:
        """Strip empty named anchors and ``<br>`` tags from ``body``.

        Args:
            body: Raw document body.

        Returns:
            The body with every ``<a name="..."></a>`` and ``<br>`` variant
            removed. A falsy body (empty string or ``None``) yields ``""``
            instead of raising inside ``re``.
        """
        if not body:
            return ""

        result = YuqueLoader._ANCHOR_TAG.sub("", body)
        result = YuqueLoader._LINE_BREAK_TAG.sub("", result)

        return result

    def http_get(self, url: str) -> Dict:
        """GET ``url`` with the auth headers and return the decoded JSON.

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx.
        """
        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # indefinitely; consider adding one.
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()

        return response.json()

    def get_documents(self) -> Iterator[Document]:
        """Yield a parsed ``Document`` for every document of every book.

        Documents are fetched one request at a time as the generator is
        consumed.
        """
        user_id = self.get_user_id()
        books = self.get_books(user_id)

        for book in books:
            book_id = book["id"]
            for document_id in self.get_document_ids(book_id):
                document = self.get_document(book_id, document_id)
                yield self.parse_document(document)

    def load(self) -> List[Document]:
        """Load documents from `Yuque`."""
        return list(self.get_documents())
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@
"XorbitsLoader",
"YoutubeAudioLoader",
"YoutubeLoader",
"YuqueLoader",
]


Expand Down
1 change: 1 addition & 0 deletions libs/langchain/langchain/document_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,4 +220,5 @@ def __getattr__(name: str) -> Any:
"XorbitsLoader",
"YoutubeAudioLoader",
"YoutubeLoader",
"YuqueLoader",
]
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@
"XorbitsLoader",
"YoutubeAudioLoader",
"YoutubeLoader",
"YuqueLoader",
]


Expand Down

0 comments on commit fe9a8d4

Please sign in to comment.