From de3322609edd1ebbd0cd793e86083852544c5d4a Mon Sep 17 00:00:00 2001 From: IlyaKIS1 <63134180+IlyaKIS1@users.noreply.github.com> Date: Mon, 4 Sep 2023 03:16:18 -0400 Subject: [PATCH] Implemented Milvus translator for self-querying (#10162) - Implemented the MilvusTranslator for self-querying using Milvus vector store - Made unit tests to test its functionality - Documented the Milvus self-querying --- .../self_query/milvus_self_query.ipynb | 375 ++++++++++++++++++ .../langchain/retrievers/self_query/base.py | 3 + .../langchain/retrievers/self_query/milvus.py | 83 ++++ .../retrievers/self_query/test_milvus.py | 116 ++++++ 4 files changed, 577 insertions(+) create mode 100644 docs/extras/modules/data_connection/retrievers/self_query/milvus_self_query.ipynb create mode 100644 libs/langchain/langchain/retrievers/self_query/milvus.py create mode 100644 libs/langchain/tests/unit_tests/retrievers/self_query/test_milvus.py diff --git a/docs/extras/modules/data_connection/retrievers/self_query/milvus_self_query.ipynb b/docs/extras/modules/data_connection/retrievers/self_query/milvus_self_query.ipynb new file mode 100644 index 000000000..068495eef --- /dev/null +++ b/docs/extras/modules/data_connection/retrievers/self_query/milvus_self_query.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Self-querying with Milvus\n", + "\n", + "In the walkthrough we'll demo the `SelfQueryRetriever` with a `Milvus` vector store." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a Milvus vectorstore\n", + "First we'll want to create a Milvus VectorStore and seed it with some data. We've created a small demo set of documents that contain summaries of movies.\n", + "\n", + "I have used the cloud version of Milvus, thus I need `uri` and `token` as well.\n", + "\n", + "NOTE: The self-query retriever requires you to have `lark` installed (`pip install lark`). We also need the `pymilvus` package." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install lark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install pymilvus" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "OPENAI_API_KEY = \"Use your OpenAI key:)\"\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import Milvus\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs = [\n", + " Document(page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\", metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"action\"}),\n", + " Document(page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\", metadata={\"year\": 2010,\"genre\": \"thriller\", \"rating\": 8.2}),\n", + " Document(page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\", metadata={\"year\": 2019, \"rating\": 8.3, \"genre\": \"drama\"}),\n", + " Document(page_content=\"Three men walk into the Zone, three men walk out of the Zone\", metadata={\"year\": 1979, \"rating\": 9.9, \"genre\": \"science fiction\"}),\n", + " Document(\n", + " page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea',\n", + " metadata={\"year\": 
2006, \"genre\": \"thriller\", 'rating': 9.0},\n", + " ),\n", + " Document(page_content=\"Toys come alive and have a blast doing so\", metadata={\"year\": 1995, \"genre\": \"animated\", \"rating\": 9.3 }),\n", + "]\n", + "\n", + "vector_store = Milvus.from_documents(\n", + " docs,\n", + " embedding=embeddings,\n", + " connection_args={\"uri\": 'Use your uri:)', \"token\":'Use your token:)'}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating our self-querying retriever\n", + "Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " ),\n", + "]\n", + "document_content_description = \"Brief summary of a movie\"\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, vector_store, document_content_description, metadata_field_info, verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing it out\n", + "And now we can try actually using our retriever!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'action'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'rating': 9.3, 'genre': 'animated'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'genre': 'science fiction'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 9.0, 'genre': 'thriller'})]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=9) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'rating': 9.3, 'genre': 'animated'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'genre': 'science fiction'})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a filter\n", + "retriever.get_relevant_documents(\"What are some highly rated movies (above 9)?\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='toys' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=9) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'rating': 9.3, 'genre': 'animated'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'genre': 'science fiction'})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a filter\n", + "retriever.get_relevant_documents(\"I want to watch a movie about toys rated higher than 9\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='thriller'), Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='rating', value=9)]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 9.0, 'genre': 'thriller'})]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a composite filter\n", + "retriever.get_relevant_documents(\"What's a highly rated (above or equal 9) thriller film?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='year', value=1990), Comparison(comparator=<Comparator.LT: 'lt'>, attribute='year', 
value=2005), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='action')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'action'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a movie after 1990 but before 2005 that's all about dinosaurs, \\\n", + " and preferably has a lot of action\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filter k\n", + "\n", + "We can also use the self query retriever to specify `k`: the number of documents to fetch.\n", + "\n", + "We can do this by passing `enable_limit=True` to the constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, \n", + " vector_store, \n", + " document_content_description, \n", + " metadata_field_info, \n", + " verbose=True,\n", + " enable_limit=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'action'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'rating': 9.3, 'genre': 'animated'})]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a limit\n", + "retriever.get_relevant_documents(\"What are two movies about 
dinosaurs?\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/langchain/langchain/retrievers/self_query/base.py b/libs/langchain/langchain/retrievers/self_query/base.py index 9c64b3c44..0251bff52 100644 --- a/libs/langchain/langchain/retrievers/self_query/base.py +++ b/libs/langchain/langchain/retrievers/self_query/base.py @@ -12,6 +12,7 @@ from langchain.retrievers.self_query.chroma import ChromaTranslator from langchain.retrievers.self_query.dashvector import DashvectorTranslator from langchain.retrievers.self_query.deeplake import DeepLakeTranslator from langchain.retrievers.self_query.elasticsearch import ElasticsearchTranslator +from langchain.retrievers.self_query.milvus import MilvusTranslator from langchain.retrievers.self_query.myscale import MyScaleTranslator from langchain.retrievers.self_query.pinecone import PineconeTranslator from langchain.retrievers.self_query.qdrant import QdrantTranslator @@ -23,6 +24,7 @@ from langchain.vectorstores import ( DashVector, DeepLake, ElasticsearchStore, + Milvus, MyScale, Pinecone, Qdrant, @@ -43,6 +45,7 @@ def _get_builtin_translator(vectorstore: VectorStore) -> Visitor: MyScale: MyScaleTranslator, DeepLake: DeepLakeTranslator, ElasticsearchStore: ElasticsearchTranslator, + Milvus: MilvusTranslator, } if vectorstore_cls not in BUILTIN_TRANSLATORS: raise ValueError( diff --git a/libs/langchain/langchain/retrievers/self_query/milvus.py b/libs/langchain/langchain/retrievers/self_query/milvus.py new file mode 100644 index 000000000..2b4af4500 --- /dev/null +++ 
b/libs/langchain/langchain/retrievers/self_query/milvus.py @@ -0,0 +1,83 @@ +"""Logic for converting internal query language to a valid Milvus query.""" +from typing import Tuple, Union + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, + Visitor, +) + +COMPARATOR_TO_BER = { + Comparator.EQ: "==", + Comparator.GT: ">", + Comparator.GTE: ">=", + Comparator.LT: "<", + Comparator.LTE: "<=", +} + +UNARY_OPERATORS = [Operator.NOT] + + +def process_value(value: Union[int, float, str]) -> str: + # required for comparators involving strings + if isinstance(value, str): + # If the value is already a string, add double quotes + return f'"{value}"' + else: + # If the value is not a string, convert it to a string without double quotes + return str(value) + + +class MilvusTranslator(Visitor): + """Translate Milvus internal query language elements to valid filters.""" + + """Subset of allowed logical operators.""" + allowed_operators = [Operator.AND, Operator.NOT, Operator.OR] + + """Subset of allowed logical comparators.""" + allowed_comparators = [ + Comparator.EQ, + Comparator.GT, + Comparator.GTE, + Comparator.LT, + Comparator.LTE, + ] + + def _format_func(self, func: Union[Operator, Comparator]) -> str: + self._validate_func(func) + value = func.value + if isinstance(func, Comparator): + value = COMPARATOR_TO_BER[func] + return f"{value}" + + def visit_operation(self, operation: Operation) -> str: + if operation.operator in UNARY_OPERATORS and len(operation.arguments) == 1: + operator = self._format_func(operation.operator) + return operator + "(" + operation.arguments[0].accept(self) + ")" + elif operation.operator in UNARY_OPERATORS: + raise ValueError( + f'"{operation.operator.value}" can have only one argument in Milvus' + ) + else: + args = [arg.accept(self) for arg in operation.arguments] + operator = self._format_func(operation.operator) + return "(" + (" " + operator + " ").join(args) + ")" + 
+ def visit_comparison(self, comparison: Comparison) -> str: + comparator = self._format_func(comparison.comparator) + processed_value = process_value(comparison.value) + attribute = comparison.attribute + + return "( " + attribute + " " + comparator + " " + processed_value + " )" + + def visit_structured_query( + self, structured_query: StructuredQuery + ) -> Tuple[str, dict]: + if structured_query.filter is None: + kwargs = {} + else: + kwargs = {"expr": structured_query.filter.accept(self)} + return structured_query.query, kwargs diff --git a/libs/langchain/tests/unit_tests/retrievers/self_query/test_milvus.py b/libs/langchain/tests/unit_tests/retrievers/self_query/test_milvus.py new file mode 100644 index 000000000..d44497440 --- /dev/null +++ b/libs/langchain/tests/unit_tests/retrievers/self_query/test_milvus.py @@ -0,0 +1,116 @@ +from typing import Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, +) +from langchain.retrievers.self_query.milvus import MilvusTranslator + +DEFAULT_TRANSLATOR = MilvusTranslator() + + +def test_visit_comparison() -> None: + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=4) + expected = "( foo < 4 )" + actual = DEFAULT_TRANSLATOR.visit_comparison(comp) + + assert expected == actual + + +def test_visit_operation() -> None: + # Non-Unary operator + + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value="4"), + ], + ) + + expected = '(( foo < 2 ) and ( bar == "baz" ) ' 'and ( abc < "4" ))' + actual = DEFAULT_TRANSLATOR.visit_operation(op) + + assert expected == actual + + # Unary operator: normal execution + op = Operation( + operator=Operator.NOT, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + 
], + ) + + expected = "not(( foo < 2 ))" + actual = DEFAULT_TRANSLATOR.visit_operation(op) + + assert expected == actual + + # Unary operator: error + op = Operation( + operator=Operator.NOT, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value="4"), + ], + ) + + try: + DEFAULT_TRANSLATOR.visit_operation(op) + except ValueError as e: + assert str(e) == '"not" can have only one argument in Milvus' + else: + assert False, "Expected exception not raised" # No exception -> test failed + + +def test_visit_structured_query() -> None: + query = "What is the capital of France?" + structured_query = StructuredQuery( + query=query, + filter=None, + ) + expected: Tuple[str, Dict] = (query, {}) + + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=454) + structured_query = StructuredQuery( + query=query, + filter=comp, + ) + + expected = ( + query, + {"expr": "( foo < 454 )"}, + ) + + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=50), + ], + ) + + structured_query = StructuredQuery( + query=query, + filter=op, + ) + + expected = ( + query, + {"expr": "(( foo < 2 ) " 'and ( bar == "baz" ) ' "and ( abc < 50 ))"}, + ) + + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual