Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

community[minor]: Add DuckDB as a vectorstore #18916

Merged
merged 38 commits into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
e29f80a
first take on integrating DuckDB
Hugoberry Mar 6, 2024
efa94cb
add integration test
Hugoberry Mar 6, 2024
270946e
Merge branch 'langchain-ai:master' into master
Hugoberry Mar 8, 2024
4f14875
fix dev container
Hugoberry Mar 8, 2024
84834fd
Merge branch 'langchain-ai:master' into master
Hugoberry Mar 11, 2024
09e691c
fix devcontainer
Hugoberry Mar 11, 2024
a7d7e9b
fix test integration
Hugoberry Mar 11, 2024
2c09905
fixing implementation
Hugoberry Mar 11, 2024
69d0119
adding more tests
Hugoberry Mar 11, 2024
f620c0a
revert dev container fix
Hugoberry Mar 11, 2024
8ebf1d1
remove extra space
Hugoberry Mar 11, 2024
840096d
Adding notebook example
Hugoberry Mar 11, 2024
76f48eb
Merge branch 'langchain-ai:master' into master
Hugoberry Mar 11, 2024
740db57
too many prints
Hugoberry Mar 11, 2024
c11fa06
Merge branch 'master' of github.com:Hugoberry/langchain
Hugoberry Mar 11, 2024
effc24f
Merge branch 'master' into master
Hugoberry Mar 11, 2024
f592dc6
fix method signatures
Hugoberry Mar 11, 2024
be8bd1b
Add security notice
Hugoberry Mar 11, 2024
76f32df
drop pandas intermediary step
Hugoberry Mar 11, 2024
ab60db0
Merge branch 'master' into master
Hugoberry Mar 11, 2024
108a3e9
fix linting errors
Hugoberry Mar 12, 2024
a5a8877
Merge branch 'master' of github.com:Hugoberry/langchain
Hugoberry Mar 12, 2024
8e65eda
Merge branch 'master' into master
Hugoberry Mar 12, 2024
88be27b
Update __init__.py
Hugoberry Mar 13, 2024
64c0abe
Merge branch 'master' into master
Hugoberry Mar 13, 2024
100b077
Merge branch 'master' into master
Hugoberry Mar 13, 2024
4310af1
Fix test for public API
Hugoberry Mar 14, 2024
de4545c
fixing linting issues
Hugoberry Mar 14, 2024
0934cbe
Merge branch 'master' into master
Hugoberry Mar 17, 2024
5b1b6d0
x
eyurtsev Mar 19, 2024
63ee953
x
eyurtsev Mar 19, 2024
ffb5e6a
resolving mypy linting errors
Hugoberry Mar 19, 2024
774123a
Merge branch 'master' into master
Hugoberry Mar 20, 2024
ef327aa
fixing mypy linting erros in tests
Hugoberry Mar 20, 2024
89e68a4
Merge branch 'master' into master
Hugoberry Mar 20, 2024
bf6e0c5
fixing more linting errors in tests
Hugoberry Mar 20, 2024
0daee72
fix formatting lint errors for test
Hugoberry Mar 22, 2024
688ef26
Merge branch 'master' into master
baskaryan Mar 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
108 changes: 108 additions & 0 deletions docs/docs/integrations/vectorstores/duckdb.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DuckDB\n",
"This notebook shows how to use `DuckDB` as a vector store."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install duckdb"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We want to use OpenAIEmbeddings, so we need to provide an OpenAI API key."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import DuckDB"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TextLoader\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"\n",
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n",
"\n",
"documents = CharacterTextSplitter().split_documents(documents)\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docsearch = DuckDB.from_documents(documents, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(docs[0].page_content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 8 additions & 0 deletions libs/community/langchain_community/vectorstores/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,11 @@ def _import_documentdb() -> Any:

return DocumentDBVectorSearch

def _import_duckdb() -> Any:
    """Import the DuckDB vector store on demand and return the class.

    The import happens inside the function so that the ``duckdb`` submodule
    is only loaded when this helper is actually called.
    """
    from langchain_community.vectorstores.duckdb import DuckDB as duckdb_store

    return duckdb_store


def _import_elasticsearch() -> Any:
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
Expand Down Expand Up @@ -581,6 +586,8 @@ def __getattr__(name: str) -> Any:
return _import_documentdb()
elif name == "DocArrayHnswSearch":
return _import_docarray_hnsw()
elif name == "DuckDB":
return _import_duckdb()
elif name == "ElasticsearchStore":
return _import_elasticsearch()
elif name == "Epsilla":
Expand Down Expand Up @@ -715,6 +722,7 @@ def __getattr__(name: str) -> Any:
"Dingo",
"DocArrayHnswSearch",
"DocArrayInMemorySearch",
"DuckDB",
"ElasticKnnSearch",
"ElasticVectorSearch",
"ElasticsearchStore",
Expand Down
192 changes: 192 additions & 0 deletions libs/community/langchain_community/vectorstores/duckdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
from __future__ import annotations

import uuid
import pandas as pd
import json
from typing import Any, Iterable, List, Optional

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


class DuckDB(VectorStore):
    """`DuckDB` vector store.

    This integration requires the `duckdb` Python package.
    You can install it with `pip install duckdb`. `pandas` is used to move
    rows in and out of DuckDB.

    Every text is stored as one row of a single table with four columns:
    the id, the raw text, the embedding (a ``FLOAT[]`` list) and the
    metadata serialized as a JSON string. Similarity search ranks rows with
    DuckDB's ``list_cosine_similarity`` function.

    Note:
        ``table_name``, ``id_key``, ``text_key`` and ``vector_key`` are
        interpolated verbatim into SQL statements, so only pass trusted
        identifiers.

    Args:
        connection: Optional DuckDB connection. If not provided, a new
            in-memory connection will be created.
        embedding: The embedding function or model to use for generating
            embeddings.
        vector_key: The column name for storing vectors.
            Defaults to `embedding`.
        id_key: The column name for storing unique identifiers.
            Defaults to `id`.
        text_key: The column name for storing text. Defaults to `text`.
        table_name: The name of the table to use for storing embeddings.
            Defaults to `vectorstore`.

    Example:
        .. code-block:: python

            import duckdb
            conn = duckdb.connect(database=':memory:')
            embedding_function = ...  # Define or import your embedding function here
            vector_store = DuckDB(conn, embedding_function)
            vector_store.add_texts(['text1', 'text2'])
            result = vector_store.similarity_search('text1')
    """

    def __init__(
        self,
        connection: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        vector_key: Optional[str] = "embedding",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        table_name: Optional[str] = "vectorstore",
    ) -> None:
        """Initialize with DuckDB connection and setup for vector storage.

        Raises:
            ImportError: If the `duckdb` package is not installed.
            ValueError: If no embedding function or model is provided.
        """
        try:
            import duckdb
        except ImportError as exc:
            raise ImportError(
                "Could not import duckdb package. "
                "Please install it with `pip install duckdb`."
            ) from exc

        if embedding is None:
            raise ValueError("An embedding function or model must be provided.")

        # Keep the module around: the expression helpers used by
        # `similarity_search` live on it.
        self.duckdb = duckdb
        self._embedding = embedding
        self._vector_key = vector_key
        self._id_key = id_key
        self._text_key = text_key
        self._table_name = table_name

        if connection is None:
            connection = duckdb.connect(database=":memory:")
        self._connection = connection

        # The table must exist before a relation over it can be created.
        self._ensure_table()
        self._table = self._connection.table(self._table_name)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Returns the embedding object used by the vector store."""
        return self._embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and insert them into the table via a pandas DataFrame.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
                Texts without a corresponding entry are stored with NULL
                metadata.
            ids: Optional list of ids to associate with the texts. Random
                UUIDs are generated when omitted or empty.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            List of ids of the added texts.
        """
        # Materialize once: `texts` may be a one-shot iterator and is needed
        # for id generation, embedding and row construction.
        texts = list(texts)
        if not texts:
            return []

        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_documents(texts)

        rows = []
        for idx, text in enumerate(texts):
            has_metadata = bool(metadatas) and idx < len(metadatas)
            # Key order matters: `SELECT *` below maps DataFrame columns to
            # table columns by position (id, text, vector, metadata).
            rows.append(
                {
                    self._id_key: ids[idx],
                    self._text_key: text,
                    self._vector_key: embeddings[idx],
                    "metadata": json.dumps(metadatas[idx]) if has_metadata else None,
                }
            )

        # DuckDB looks up the local variable `df` by name when it appears in
        # the SQL text, so the name must stay in sync with the query below.
        df = pd.DataFrame(rows)  # noqa: F841
        self._connection.sql(f"INSERT INTO {self._table_name} SELECT * FROM df")
        return ids

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Performs a similarity search for a given query string.

        Args:
            query: The query string to search for.
            k: The number of similar texts to return.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A list of Documents most similar to the query, ordered by
            descending cosine similarity.
        """
        duckdb = self.duckdb
        query_vector = self._embedding.embed_query(query)
        similarity = duckdb.FunctionExpression(
            "list_cosine_similarity",
            duckdb.ColumnExpression(self._vector_key),
            duckdb.ConstantExpression(query_vector),
        )
        frame = (
            self._table.select(duckdb.StarExpression(), similarity.alias("similarity"))
            .order("similarity desc")
            .limit(k)
            # Drop the score and the raw vectors; only text and metadata
            # are needed to build the Documents.
            .select(duckdb.StarExpression(exclude=["similarity", self._vector_key]))
            .fetchdf()
        )

        documents = []
        for text, raw_metadata in zip(frame[self._text_key], frame["metadata"]):
            # Only a non-empty JSON string is decoded; anything else (NULL or
            # a missing-value marker) yields empty metadata.
            if isinstance(raw_metadata, str) and raw_metadata:
                metadata = json.loads(raw_metadata)
            else:
                metadata = {}
            documents.append(Document(page_content=text, metadata=metadata))
        return documents

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection: Any = None,
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        table_name: Optional[str] = "embeddings",
        **kwargs: Any,
    ) -> DuckDB:
        """Creates an instance of DuckDB and populates it with texts.

        Args:
            texts: List of strings to add to the vector store.
            embedding: The embedding function or model to use for generating
                embeddings.
            metadatas: Optional list of metadata dictionaries associated with
                the texts.
            connection: DuckDB connection. If not provided, a new connection
                will be created.
            vector_key: The column name for storing vectors.
                Defaults to "vector".
            id_key: The column name for storing unique identifiers.
                Defaults to "id".
            text_key: The column name for storing text. Defaults to "text".
            table_name: The name of the table to use for storing embeddings.
                Defaults to "embeddings".
            **kwargs: Forwarded to `add_texts` (for example `ids`).

        Returns:
            An instance of DuckDB with the provided texts and their
            embeddings added.
        """
        # Build through `cls` so subclasses get an instance of their own type.
        instance = cls(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            table_name=table_name,
        )
        instance.add_texts(texts, metadatas=metadatas, **kwargs)
        return instance

    def _ensure_table(self) -> None:
        """Ensures the table for storing embeddings exists.

        Identifiers are interpolated as-is into the statement; they are
        expected to come from trusted configuration.
        """
        create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {self._table_name} (
            {self._id_key} VARCHAR PRIMARY KEY,
            {self._text_key} VARCHAR,
            {self._vector_key} FLOAT[],
            metadata VARCHAR
        )
        """
        self._connection.execute(create_table_sql)