From 9efc29e3d18eb324ea45026295272be9515e6a98 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Tue, 29 Aug 2023 11:13:42 -0400 Subject: [PATCH 01/18] x --- libs/langchain/langchain/indexes/_api.py | 4 +-- .../langchain/indexes/_sql_record_manager.py | 25 ++++++++++++++++--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain/indexes/_api.py b/libs/langchain/langchain/indexes/_api.py index 47b9d33ea..130a5c685 100644 --- a/libs/langchain/langchain/indexes/_api.py +++ b/libs/langchain/langchain/indexes/_api.py @@ -332,9 +332,9 @@ def index( uids_to_delete = record_manager.list_keys(before=index_start_dt) if uids_to_delete: - # Then delete from vector store. - vector_store.delete(uids_to_delete) # First delete from record store. + vector_store.delete(uids_to_delete) + # Then delete from record manager. record_manager.delete_keys(uids_to_delete) num_deleted = len(uids_to_delete) diff --git a/libs/langchain/langchain/indexes/_sql_record_manager.py b/libs/langchain/langchain/indexes/_sql_record_manager.py index 9cad02ef9..be793dcf5 100644 --- a/libs/langchain/langchain/indexes/_sql_record_manager.py +++ b/libs/langchain/langchain/indexes/_sql_record_manager.py @@ -15,8 +15,10 @@ allow it to work with a variety of SQL as a backend. 
""" import contextlib import uuid -from typing import Any, Dict, Generator, List, Optional, Sequence +from typing import Any, Dict, Generator, List, Optional, Sequence, Union +import decimal +from sqlalchemy import URL from sqlalchemy import ( Column, Engine, @@ -28,7 +30,6 @@ from sqlalchemy import ( create_engine, text, ) -from sqlalchemy.dialects.sqlite import insert from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session, sessionmaker @@ -77,7 +78,7 @@ class SQLRecordManager(RecordManager): namespace: str, *, engine: Optional[Engine] = None, - db_url: Optional[str] = None, + db_url: Union[None, str, URL] = None, engine_kwargs: Optional[Dict[str, Any]] = None, ) -> None: """Initialize the SQLRecordManager. @@ -114,6 +115,7 @@ class SQLRecordManager(RecordManager): raise AssertionError("Something went wrong with configuration of engine.") self.engine = _engine + self.dialect = _engine.dialect.name self.session_factory = sessionmaker(bind=self.engine) def create_schema(self) -> None: @@ -145,8 +147,16 @@ class SQLRecordManager(RecordManager): # 2440587.5 - constant represents the Julian day number for January 1, 1970 # 86400.0 - constant represents the number of seconds # in a day (24 hours * 60 minutes * 60 seconds) - query = text("SELECT (julianday('now') - 2440587.5) * 86400.0;") + if self.dialect == "sqlite": + query = text("SELECT (julianday('now') - 2440587.5) * 86400.0;") + elif self.dialect == "postgresql": + query = text("SELECT EXTRACT (EPOCH FROM CURRENT_TIMESTAMP);") + else: + raise NotImplementedError(f"Not implemented for dialect {self.dialect}") + dt = session.execute(query).scalar() + if isinstance(dt, decimal.Decimal): + dt = float(dt) if not isinstance(dt, float): raise AssertionError(f"Unexpected type for datetime: {type(dt)}") return dt @@ -191,6 +201,13 @@ class SQLRecordManager(RecordManager): for key, group_id in zip(keys, group_ids) ] + if self.dialect == "sqlite": + from sqlalchemy.dialects.sqlite import 
insert + elif self.dialect == "postgresql": + from sqlalchemy.dialects.sqlite import insert + else: + raise NotImplementedError(f"Unsupported dialect {self.dialect}") + with self._make_session() as session: # Note: uses SQLite insert to make on_conflict_do_update work. # This code needs to be generalized a bit to work with more dialects. From 880bf062901889d504c7f827da244a7041faccef Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Tue, 29 Aug 2023 11:15:41 -0400 Subject: [PATCH 02/18] x --- libs/langchain/langchain/indexes/_sql_record_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/indexes/_sql_record_manager.py b/libs/langchain/langchain/indexes/_sql_record_manager.py index be793dcf5..ac7cc6a76 100644 --- a/libs/langchain/langchain/indexes/_sql_record_manager.py +++ b/libs/langchain/langchain/indexes/_sql_record_manager.py @@ -14,12 +14,12 @@ allow it to work with a variety of SQL as a backend. * Keys can be deleted. """ import contextlib +import decimal import uuid from typing import Any, Dict, Generator, List, Optional, Sequence, Union -import decimal -from sqlalchemy import URL from sqlalchemy import ( + URL, Column, Engine, Float, From 3c1547925a7b013ff3570b4988d0a89634d14dbf Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 29 Aug 2023 14:02:13 -0700 Subject: [PATCH 03/18] fix --- libs/langchain/langchain/chains/summarize/refine_prompts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/chains/summarize/refine_prompts.py b/libs/langchain/langchain/chains/summarize/refine_prompts.py index 5c67db481..a70d19661 100644 --- a/libs/langchain/langchain/chains/summarize/refine_prompts.py +++ b/libs/langchain/langchain/chains/summarize/refine_prompts.py @@ -4,12 +4,12 @@ from langchain.prompts import PromptTemplate REFINE_PROMPT_TMPL = ( "Your job is to produce a final summary\n" "We have provided an existing summary up to a certain point: {existing_answer}\n" - 
"We have the opportunity to refine the existing summary" + "We have the opportunity to refine the existing summary " "(only if needed) with some more context below.\n" "------------\n" "{text}\n" "------------\n" - "Given the new context, refine the original summary\n" + "Given the new context, refine the original summary.\n" "If the context isn't useful, return the original summary." ) REFINE_PROMPT = PromptTemplate( From 9f2d908316ba702d9fc8d56647590d143af69707 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 29 Aug 2023 14:16:48 -0700 Subject: [PATCH 04/18] cr --- .../question_answering/refine_prompts.py | 46 ++++++++----------- .../chains/summarize/refine_prompts.py | 29 +++++------- 2 files changed, 31 insertions(+), 44 deletions(-) diff --git a/libs/langchain/langchain/chains/question_answering/refine_prompts.py b/libs/langchain/langchain/chains/question_answering/refine_prompts.py index 87b4863d4..bec5fa4f5 100644 --- a/libs/langchain/langchain/chains/question_answering/refine_prompts.py +++ b/libs/langchain/langchain/chains/question_answering/refine_prompts.py @@ -11,7 +11,7 @@ from langchain.prompts.prompt import PromptTemplate DEFAULT_REFINE_PROMPT_TMPL = ( "The original question is as follows: {question}\n" "We have provided an existing answer: {existing_answer}\n" - "We have the opportunity to refine the existing answer" + "We have the opportunity to refine the existing answer " "(only if needed) with some more context below.\n" "------------\n" "{context_str}\n" @@ -20,12 +20,10 @@ DEFAULT_REFINE_PROMPT_TMPL = ( "answer the question. " "If the context isn't useful, return the original answer." 
) -DEFAULT_REFINE_PROMPT = PromptTemplate( - input_variables=["question", "existing_answer", "context_str"], - template=DEFAULT_REFINE_PROMPT_TMPL, -) +DEFAULT_REFINE_PROMPT = PromptTemplate.from_template(DEFAULT_REFINE_PROMPT_TMPL) + refine_template = ( - "We have the opportunity to refine the existing answer" + "We have the opportunity to refine the existing answer " "(only if needed) with some more context below.\n" "------------\n" "{context_str}\n" @@ -34,12 +32,9 @@ refine_template = ( "answer the question. " "If the context isn't useful, return the original answer." ) -messages = [ - HumanMessagePromptTemplate.from_template("{question}"), - AIMessagePromptTemplate.from_template("{existing_answer}"), - HumanMessagePromptTemplate.from_template(refine_template), -] -CHAT_REFINE_PROMPT = ChatPromptTemplate.from_messages(messages) +CHAT_REFINE_PROMPT = ChatPromptTemplate.from_messages( + [("human", "{question}"), ("ai", "{existing_answer}"), ("human", "refine_template")] +) REFINE_PROMPT_SELECTOR = ConditionalPromptSelector( default_prompt=DEFAULT_REFINE_PROMPT, conditionals=[(is_chat_model, CHAT_REFINE_PROMPT)], @@ -48,28 +43,25 @@ REFINE_PROMPT_SELECTOR = ConditionalPromptSelector( DEFAULT_TEXT_QA_PROMPT_TMPL = ( "Context information is below. \n" - "---------------------\n" - "{context_str}" - "\n---------------------\n" + "------------\n" + "{context_str}\n" + "------------\n" "Given the context information and not prior knowledge, " "answer the question: {question}\n" ) -DEFAULT_TEXT_QA_PROMPT = PromptTemplate( - input_variables=["context_str", "question"], template=DEFAULT_TEXT_QA_PROMPT_TMPL -) +DEFAULT_TEXT_QA_PROMPT = PromptTemplate.from_template(DEFAULT_TEXT_QA_PROMPT_TMPL) + chat_qa_prompt_template = ( - "Context information is below. 
\n" - "---------------------\n" - "{context_str}" - "\n---------------------\n" + "Context information is below.\n" + "------------\n" + "{context_str}\n" + "------------\n" "Given the context information and not prior knowledge, " "answer any questions" ) -messages = [ - SystemMessagePromptTemplate.from_template(chat_qa_prompt_template), - HumanMessagePromptTemplate.from_template("{question}"), -] -CHAT_QUESTION_PROMPT = ChatPromptTemplate.from_messages(messages) +CHAT_QUESTION_PROMPT = ChatPromptTemplate.from_messages( + [("system", chat_qa_prompt_template), ("human", "{question}")] +) QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector( default_prompt=DEFAULT_TEXT_QA_PROMPT, conditionals=[(is_chat_model, CHAT_QUESTION_PROMPT)], diff --git a/libs/langchain/langchain/chains/summarize/refine_prompts.py b/libs/langchain/langchain/chains/summarize/refine_prompts.py index a70d19661..013d0919f 100644 --- a/libs/langchain/langchain/chains/summarize/refine_prompts.py +++ b/libs/langchain/langchain/chains/summarize/refine_prompts.py @@ -1,21 +1,16 @@ -# flake8: noqa from langchain.prompts import PromptTemplate -REFINE_PROMPT_TMPL = ( - "Your job is to produce a final summary\n" - "We have provided an existing summary up to a certain point: {existing_answer}\n" - "We have the opportunity to refine the existing summary " - "(only if needed) with some more context below.\n" - "------------\n" - "{text}\n" - "------------\n" - "Given the new context, refine the original summary.\n" - "If the context isn't useful, return the original summary." -) -REFINE_PROMPT = PromptTemplate( - input_variables=["existing_answer", "text"], - template=REFINE_PROMPT_TMPL, -) +REFINE_PROMPT_TMPL = """\ +Your job is to produce a final summary. +We have provided an existing summary up to a certain point: {existing_answer} +We have the opportunity to refine the existing summary (only if needed) with some more context below. 
+------------ +{text} +------------ +Given the new context, refine the original summary. +If the context isn't useful, return the original summary.\ +""" # noqa: E501 +REFINE_PROMPT = PromptTemplate.from_template(REFINE_PROMPT_TMPL) prompt_template = """Write a concise summary of the following: @@ -25,4 +20,4 @@ prompt_template = """Write a concise summary of the following: CONCISE SUMMARY:""" -PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"]) +PROMPT = PromptTemplate.from_template(prompt_template) From cafce9ed23fa1b7d2efbf14331e8722c277c6973 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 30 Aug 2023 09:35:00 -0400 Subject: [PATCH 05/18] x --- .../langchain/indexes/_sql_record_manager.py | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/libs/langchain/langchain/indexes/_sql_record_manager.py b/libs/langchain/langchain/indexes/_sql_record_manager.py index ac7cc6a76..ab0b487ee 100644 --- a/libs/langchain/langchain/indexes/_sql_record_manager.py +++ b/libs/langchain/langchain/indexes/_sql_record_manager.py @@ -14,12 +14,12 @@ allow it to work with a variety of SQL as a backend. * Keys can be deleted. """ import contextlib -import decimal import uuid from typing import Any, Dict, Generator, List, Optional, Sequence, Union +import decimal +from sqlalchemy import URL from sqlalchemy import ( - URL, Column, Engine, Float, @@ -201,25 +201,38 @@ class SQLRecordManager(RecordManager): for key, group_id in zip(keys, group_ids) ] - if self.dialect == "sqlite": - from sqlalchemy.dialects.sqlite import insert - elif self.dialect == "postgresql": - from sqlalchemy.dialects.sqlite import insert - else: - raise NotImplementedError(f"Unsupported dialect {self.dialect}") - with self._make_session() as session: - # Note: uses SQLite insert to make on_conflict_do_update work. - # This code needs to be generalized a bit to work with more dialects. 
- insert_stmt = insert(UpsertionRecord).values(records_to_upsert) - stmt = insert_stmt.on_conflict_do_update( # type: ignore[attr-defined] - [UpsertionRecord.key, UpsertionRecord.namespace], - set_=dict( - # attr-defined type ignore - updated_at=insert_stmt.excluded.updated_at, # type: ignore - group_id=insert_stmt.excluded.group_id, # type: ignore - ), - ) + if self.dialect == "sqlite": + from sqlalchemy.dialects.sqlite import insert + + # Note: uses SQLite insert to make on_conflict_do_update work. + # This code needs to be generalized a bit to work with more dialects. + insert_stmt = insert(UpsertionRecord).values(records_to_upsert) + stmt = insert_stmt.on_conflict_do_update( # type: ignore[attr-defined] + [UpsertionRecord.key, UpsertionRecord.namespace], + set_=dict( + # attr-defined type ignore + updated_at=insert_stmt.excluded.updated_at, # type: ignore + group_id=insert_stmt.excluded.group_id, # type: ignore + ), + ) + elif self.dialect == "postgresql": + from sqlalchemy.dialects.postgresql import insert + + # Note: uses SQLite insert to make on_conflict_do_update work. + # This code needs to be generalized a bit to work with more dialects. 
+ insert_stmt = insert(UpsertionRecord).values(records_to_upsert) + stmt = insert_stmt.on_conflict_do_update( # type: ignore[attr-defined] + "uix_key_namespace", # Name of constraint + set_=dict( + # attr-defined type ignore + updated_at=insert_stmt.excluded.updated_at, # type: ignore + group_id=insert_stmt.excluded.group_id, # type: ignore + ), + ) + else: + raise NotImplementedError(f"Unsupported dialect {self.dialect}") + session.execute(stmt) session.commit() From e8f29be350827cbcb3f7979502160d7637091f82 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 30 Aug 2023 09:36:27 -0400 Subject: [PATCH 06/18] x --- .../langchain/indexes/_sql_record_manager.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain/indexes/_sql_record_manager.py b/libs/langchain/langchain/indexes/_sql_record_manager.py index ab0b487ee..f47f4e923 100644 --- a/libs/langchain/langchain/indexes/_sql_record_manager.py +++ b/libs/langchain/langchain/indexes/_sql_record_manager.py @@ -14,12 +14,12 @@ allow it to work with a variety of SQL as a backend. * Keys can be deleted. """ import contextlib +import decimal import uuid from typing import Any, Dict, Generator, List, Optional, Sequence, Union -import decimal -from sqlalchemy import URL from sqlalchemy import ( + URL, Column, Engine, Float, @@ -203,11 +203,11 @@ class SQLRecordManager(RecordManager): with self._make_session() as session: if self.dialect == "sqlite": - from sqlalchemy.dialects.sqlite import insert + from sqlalchemy.dialects.sqlite import insert as sqlite_insert # Note: uses SQLite insert to make on_conflict_do_update work. # This code needs to be generalized a bit to work with more dialects. 
- insert_stmt = insert(UpsertionRecord).values(records_to_upsert) + insert_stmt = sqlite_insert(UpsertionRecord).values(records_to_upsert) stmt = insert_stmt.on_conflict_do_update( # type: ignore[attr-defined] [UpsertionRecord.key, UpsertionRecord.namespace], set_=dict( @@ -217,11 +217,11 @@ class SQLRecordManager(RecordManager): ), ) elif self.dialect == "postgresql": - from sqlalchemy.dialects.postgresql import insert + from sqlalchemy.dialects.postgresql import insert as pg_insert # Note: uses SQLite insert to make on_conflict_do_update work. # This code needs to be generalized a bit to work with more dialects. - insert_stmt = insert(UpsertionRecord).values(records_to_upsert) + insert_stmt = pg_insert(UpsertionRecord).values(records_to_upsert) stmt = insert_stmt.on_conflict_do_update( # type: ignore[attr-defined] "uix_key_namespace", # Name of constraint set_=dict( From 5b913003e0aeceaecd101322178afff78cb8f464 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 31 Aug 2023 07:27:56 -0700 Subject: [PATCH 07/18] bump --- libs/langchain/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 77899ffbb..93b78e2d3 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain" -version = "0.0.277" +version = "0.0.278" description = "Building applications with LLMs through composability" authors = [] license = "MIT" From 8d66b00c730ad62d366580dbe00a1839652803ea Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 31 Aug 2023 10:58:13 -0700 Subject: [PATCH 08/18] Data anonymizer notebook nit (#10062) --- .../privacy/presidio_data_anonymization.ipynb | 216 ++++++++---------- 1 file changed, 91 insertions(+), 125 deletions(-) diff --git a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb index 
7bb0b1593..faa992925 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb @@ -28,12 +28,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install necessary packages\n", - "# ! pip install langchain langchain-experimental openai\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", "# ! python -m spacy download en_core_web_lg" ] }, @@ -47,16 +47,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Marie Santos, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + "'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'" ] }, - "execution_count": 2, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -64,6 +64,92 @@ "source": [ "from langchain_experimental.data_anonymizer import PresidioAnonymizer\n", "\n", + "anonymizer = PresidioAnonymizer()\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using with LangChain Expression Language\n", + "\n", + "With LCEL we can easily chain together anonymization with the rest of our application." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set env var OPENAI_API_KEY or load from a .env file:\n", + "# import dotenv\n", + "\n", + "# dotenv.load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.schema.runnable import RunnablePassthrough\n", + "\n", + "template = \"\"\"According to this text, where can you find our super secret data?\n", + "\n", + "{anonymized_text}\n", + "\n", + "Answer:\"\"\"\n", + "prompt = PromptTemplate.from_template(template)\n", + "llm = ChatOpenAI()\n", + "\n", + "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", + "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Customization\n", + "We can specify ``analyzed_fields`` to only anonymize particular types of data." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "anonymizer = PresidioAnonymizer(analyzed_fields=[\"PERSON\"])\n", "\n", "anonymizer.anonymize(\n", @@ -75,7 +161,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\\\n", "As can be observed, the name was correctly identified and replaced with another. 
The `analyzed_fields` attribute is responsible for what values are to be detected and substituted. We can add *PHONE_NUMBER* to the list:" ] }, @@ -331,125 +416,6 @@ "anonymizer.anonymize(\"My polish phone number is 666555444\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\\\n", - "Finally, it is worth showing how to implement anonymizer as a chain. Since anonymization is based on string operations, we can use `TransformChain` for this:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'text': 'You can find our super secret data at https://supersecretdata.com',\n", - " 'anonymized_text': 'You can find our super secret data at https://www.fox.org/'}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain.chains.transform import TransformChain\n", - "\n", - "anonymizer = PresidioAnonymizer()\n", - "\n", - "\n", - "def anonymize_func(inputs: dict) -> dict:\n", - " text = inputs[\"text\"]\n", - " return {\"anonymized_text\": anonymizer.anonymize(text)}\n", - "\n", - "\n", - "anonymize_chain = TransformChain(\n", - " input_variables=[\"text\"],\n", - " output_variables=[\"anonymized_text\"],\n", - " transform=anonymize_func,\n", - ")\n", - "\n", - "anonymize_chain(\"You can find our super secret data at https://supersecretdata.com\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\\\n", - "Later, you can, for example, use such anonymization as part of chain sequence. 
We will use `LangChain Expression Language` ([learn more here](https://python.langchain.com/docs/guides/expression_language/)) for composing these chains together, as shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# ! pip install openai\n", - "\n", - "# Set env var OPENAI_API_KEY or load from a .env file:\n", - "import dotenv\n", - "\n", - "dotenv.load_dotenv()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'anonymized_text': StringPromptValue(text='According to this text, where can you find our super secret data?\\n\\nYou can find our super secret data at https://evans-summers.info/\\n\\nAnswer:'),\n", - " 'text': ' https://evans-summers.info/'}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from operator import itemgetter\n", - "from langchain.prompts.prompt import PromptTemplate\n", - "from langchain.chains.llm import LLMChain\n", - "from langchain.llms.openai import OpenAI\n", - "\n", - "template = \"\"\"According to this text, where can you find our super secret data?\n", - "\n", - "{anonymized_text}\n", - "\n", - "Answer:\"\"\"\n", - "prompt = PromptTemplate(input_variables=[\"anonymized_text\"], template=template)\n", - "llm_chain = LLMChain(llm=OpenAI(), prompt=prompt)\n", - "\n", - "\n", - "chain = (\n", - " anonymize_chain\n", - " | {\"anonymized_text\": itemgetter(\"anonymized_text\")}\n", - " | prompt\n", - " | llm_chain\n", - ")\n", - "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")" - ] - }, { "cell_type": "markdown", "metadata": {}, From 641b71e2cd6706e90c727dcef1edff88b5992ced Mon Sep 17 00:00:00 2001 From: Zizhong Zhang Date: Thu, 31 Aug 2023 12:21:24 
-0700 Subject: [PATCH 09/18] refactor: rename to OpaquePrompts (#10013) Renamed to OpaquePrompts cc @baskaryan Thanks in advance! --- ...{promptguard.ipynb => opaqueprompts.ipynb} | 36 +++++++-------- libs/langchain/langchain/llms/__init__.py | 6 +-- .../llms/{promptguard.py => opaqueprompts.py} | 46 +++++++++---------- .../{promptguard.py => opaqueprompts.py} | 18 ++++---- ...t_promptguard.py => test_opaqueprompts.py} | 14 +++--- 5 files changed, 60 insertions(+), 60 deletions(-) rename docs/extras/integrations/llms/{promptguard.ipynb => opaqueprompts.ipynb} (82%) rename libs/langchain/langchain/llms/{promptguard.py => opaqueprompts.py} (65%) rename libs/langchain/langchain/utilities/{promptguard.py => opaqueprompts.py} (83%) rename libs/langchain/tests/integration_tests/llms/{test_promptguard.py => test_opaqueprompts.py} (91%) diff --git a/docs/extras/integrations/llms/promptguard.ipynb b/docs/extras/integrations/llms/opaqueprompts.ipynb similarity index 82% rename from docs/extras/integrations/llms/promptguard.ipynb rename to docs/extras/integrations/llms/opaqueprompts.ipynb index f93244eca..132b37e8c 100644 --- a/docs/extras/integrations/llms/promptguard.ipynb +++ b/docs/extras/integrations/llms/opaqueprompts.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# PromptGuard\n", + "# OpaquePrompts\n", "\n", - "[PromptGuard](https://promptguard.readthedocs.io/en/latest/) is a service that enables applications to leverage the power of language models without compromising user privacy. Designed for composability and ease of integration into existing applications and services, PromptGuard is consumable via a simple Python library as well as through LangChain. 
Perhaps more importantly, PromptGuard leverages the power of [confidential computing](https://en.wikipedia.org/wiki/Confidential_computing) to ensure that even the PromptGuard service itself cannot access the data it is protecting.\n", + "[OpaquePrompts](https://opaqueprompts.readthedocs.io/en/latest/) is a service that enables applications to leverage the power of language models without compromising user privacy. Designed for composability and ease of integration into existing applications and services, OpaquePrompts is consumable via a simple Python library as well as through LangChain. Perhaps more importantly, OpaquePrompts leverages the power of [confidential computing](https://en.wikipedia.org/wiki/Confidential_computing) to ensure that even the OpaquePrompts service itself cannot access the data it is protecting.\n", " \n", "\n", - "This notebook goes over how to use LangChain to interact with `PromptGuard`." + "This notebook goes over how to use LangChain to interact with `OpaquePrompts`." ] }, { @@ -18,15 +18,15 @@ "metadata": {}, "outputs": [], "source": [ - "# install the promptguard and langchain packages\n", - "! pip install promptguard langchain" + "# install the opaqueprompts and langchain packages\n", + "! pip install opaqueprompts langchain" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Accessing the PromptGuard API requires an API key, which you can get by creating an account on [the PromptGuard website](https://promptguard.opaque.co/). Once you have an account, you can find your API key on [the API Keys page](https://promptguard.opaque.co/api-keys)." + "Accessing the OpaquePrompts API requires an API key, which you can get by creating an account on [the OpaquePrompts website](https://opaqueprompts.opaque.co/). Once you have an account, you can find your API key on [the API Keys page](https://opaqueprompts.opaque.co/api-keys)." 
] }, { @@ -39,7 +39,7 @@ "\n", "# Set API keys\n", "\n", - "os.environ['PROMPTGUARD_API_KEY'] = \"\"\n", + "os.environ['OPAQUEPROMPTS_API_KEY'] = \"\"\n", "os.environ['OPENAI_API_KEY'] = \"\"" ] }, @@ -47,9 +47,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Use PromptGuard LLM Wrapper\n", + "# Use OpaquePrompts LLM Wrapper\n", "\n", - "Applying promptguard to your application could be as simple as wrapping your LLM using the PromptGuard class by replace `llm=OpenAI()` with `llm=PromptGuard(base_llm=OpenAI())`." + "Applying OpaquePrompts to your application could be as simple as wrapping your LLM using the OpaquePrompts class by replacing `llm=OpenAI()` with `llm=OpaquePrompts(base_llm=OpenAI())`." ] }, { @@ -64,7 +64,7 @@ "from langchain.llms import OpenAI\n", "from langchain.memory import ConversationBufferWindowMemory\n", "\n", - "from langchain.llms import PromptGuard\n", + "from langchain.llms import OpaquePrompts\n", "\n", "langchain.verbose = True\n", "langchain.debug = True\n", @@ -106,7 +106,7 @@ "\n", "chain = LLMChain(\n", " prompt=PromptTemplate.from_template(prompt_template),\n", - " llm=PromptGuard(base_llm=OpenAI()),\n", + " llm=OpaquePrompts(base_llm=OpenAI()),\n", " memory=ConversationBufferWindowMemory(k=2),\n", " verbose=True,\n", ")\n", @@ -132,10 +132,10 @@ "During our recent meeting on February 23, 2023, at 10:30 AM, John Doe provided me with his personal details. His email is johndoe@example.com and his contact number is 650-456-7890. He lives in New York City, USA, and belongs to the American nationality with Christian beliefs and a leaning towards the Democratic party. He mentioned that he recently made a transaction using his credit card 4111 1111 1111 1111 and transferred bitcoins to the wallet address 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa. While discussing his European travels, he noted down his IBAN as GB29 NWBK 6016 1331 9268 19. Additionally, he provided his website as https://johndoeportfolio.com. 
John also discussed some of his US-specific details. He said his bank account number is 1234567890123456 and his drivers license is Y12345678. His ITIN is 987-65-4321, and he recently renewed his passport, the number for which is 123456789. He emphasized not to share his SSN, which is 669-45-6789. Furthermore, he mentioned that he accesses his work files remotely through the IP 192.168.1.1 and has a medical license number MED-123456.\n", "```\n", "\n", - "PromptGuard will automatically detect the sensitive data and replace it with a placeholder. \n", + "OpaquePrompts will automatically detect the sensitive data and replace it with a placeholder. \n", "\n", "```\n", - "# Context after PromptGuard\n", + "# Context after OpaquePrompts\n", "\n", "During our recent meeting on DATE_TIME_3, at DATE_TIME_2, PERSON_3 provided me with his personal details. His email is EMAIL_ADDRESS_1 and his contact number is PHONE_NUMBER_1. He lives in LOCATION_3, LOCATION_2, and belongs to the NRP_3 nationality with NRP_2 beliefs and a leaning towards the Democratic party. He mentioned that he recently made a transaction using his credit card CREDIT_CARD_1 and transferred bitcoins to the wallet address CRYPTO_1. While discussing his NRP_1 travels, he noted down his IBAN as IBAN_CODE_1. Additionally, he provided his website as URL_1. PERSON_2 also discussed some of his LOCATION_1-specific details. He said his bank account number is US_BANK_NUMBER_1 and his drivers license is US_DRIVER_LICENSE_2. His ITIN is US_ITIN_1, and he recently renewed his passport, the number for which is DATE_TIME_1. He emphasized not to share his SSN, which is US_SSN_1. 
Furthermore, he mentioned that he accesses his work files remotely through the IP IP_ADDRESS_1 and has a medical license number MED-US_DRIVER_LICENSE_1.\n", "```\n", @@ -151,7 +151,7 @@ "Response is desanitized by replacing the placeholder with the original sensitive data.\n", "\n", "```\n", - "# desanitized LLM response from PromptGuard\n", + "# desanitized LLM response from OpaquePrompts\n", "\n", "Hey John, just wanted to remind you to do a password reset for your website https://johndoeportfolio.com through your email johndoe@example.com. It's important to stay secure online, so don't forget to do it!\n", "```" @@ -161,7 +161,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Use PromptGuard in LangChain expression\n", + "# Use OpaquePrompts in LangChain expression\n", "\n", "There are functions that can be used with LangChain expression as well if a drop-in replacement doesn't offer the flexibility you need. " ] @@ -172,7 +172,7 @@ "metadata": {}, "outputs": [], "source": [ - "import langchain.utilities.promptguard as pgf\n", + "import langchain.utilities.opaqueprompts as op\n", "from langchain.schema.runnable import RunnableMap\n", "from langchain.schema.output_parser import StrOutputParser\n", "\n", @@ -180,7 +180,7 @@ "prompt=PromptTemplate.from_template(prompt_template), \n", "llm = OpenAI()\n", "pg_chain = (\n", - " pgf.sanitize\n", + " op.sanitize\n", " | RunnableMap(\n", " {\n", " \"response\": (lambda x: x[\"sanitized_input\"])\n", @@ -190,7 +190,7 @@ " \"secure_context\": lambda x: x[\"secure_context\"],\n", " }\n", " )\n", - " | (lambda x: pgf.desanitize(x[\"response\"], x[\"secure_context\"]))\n", + " | (lambda x: op.desanitize(x[\"response\"], x[\"secure_context\"]))\n", ")\n", "\n", "pg_chain.invoke({\"question\": \"Write a text message to remind John to do password reset for his website through his email to stay secure.\", \"history\": \"\"})" diff --git a/libs/langchain/langchain/llms/__init__.py 
b/libs/langchain/langchain/llms/__init__.py index d46ce1ab0..a454e10af 100644 --- a/libs/langchain/langchain/llms/__init__.py +++ b/libs/langchain/langchain/llms/__init__.py @@ -62,6 +62,7 @@ from langchain.llms.mosaicml import MosaicML from langchain.llms.nlpcloud import NLPCloud from langchain.llms.octoai_endpoint import OctoAIEndpoint from langchain.llms.ollama import Ollama +from langchain.llms.opaqueprompts import OpaquePrompts from langchain.llms.openai import AzureOpenAI, OpenAI, OpenAIChat from langchain.llms.openllm import OpenLLM from langchain.llms.openlm import OpenLM @@ -69,7 +70,6 @@ from langchain.llms.petals import Petals from langchain.llms.pipelineai import PipelineAI from langchain.llms.predibase import Predibase from langchain.llms.predictionguard import PredictionGuard -from langchain.llms.promptguard import PromptGuard from langchain.llms.promptlayer_openai import PromptLayerOpenAI, PromptLayerOpenAIChat from langchain.llms.replicate import Replicate from langchain.llms.rwkv import RWKV @@ -142,7 +142,7 @@ __all__ = [ "PredictionGuard", "PromptLayerOpenAI", "PromptLayerOpenAIChat", - "PromptGuard", + "OpaquePrompts", "RWKV", "Replicate", "SagemakerEndpoint", @@ -207,7 +207,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = { "petals": Petals, "pipelineai": PipelineAI, "predibase": Predibase, - "promptguard": PromptGuard, + "opaqueprompts": OpaquePrompts, "replicate": Replicate, "rwkv": RWKV, "sagemaker_endpoint": SagemakerEndpoint, diff --git a/libs/langchain/langchain/llms/promptguard.py b/libs/langchain/langchain/llms/opaqueprompts.py similarity index 65% rename from libs/langchain/langchain/llms/promptguard.py rename to libs/langchain/langchain/llms/opaqueprompts.py index 9dcdfcb6a..af3ccc967 100644 --- a/libs/langchain/langchain/llms/promptguard.py +++ b/libs/langchain/langchain/llms/opaqueprompts.py @@ -10,23 +10,23 @@ from langchain.utils import get_from_dict_or_env logger = logging.getLogger(__name__) -class PromptGuard(LLM): - """An LLM 
wrapper that uses PromptGuard to sanitize prompts. +class OpaquePrompts(LLM): + """An LLM wrapper that uses OpaquePrompts to sanitize prompts. Wraps another LLM and sanitizes prompts before passing it to the LLM, then de-sanitizes the response. - To use, you should have the ``promptguard`` python package installed, - and the environment variable ``PROMPTGUARD_API_KEY`` set with + To use, you should have the ``opaqueprompts`` python package installed, + and the environment variable ``OPAQUEPROMPTS_API_KEY`` set with your API key, or pass it as a named parameter to the constructor. Example: .. code-block:: python - from langchain.llms import PromptGuard + from langchain.llms import OpaquePrompts from langchain.chat_models import ChatOpenAI - prompt_guard_llm = PromptGuard(base_llm=ChatOpenAI()) + op_llm = OpaquePrompts(base_llm=ChatOpenAI()) """ base_llm: BaseLanguageModel @@ -39,29 +39,29 @@ class PromptGuard(LLM): @root_validator() def validate_environment(cls, values: Dict) -> Dict: - """Validates that the PromptGuard API key and the Python package exist.""" + """Validates that the OpaquePrompts API key and the Python package exist.""" try: - import promptguard as pg + import opaqueprompts as op except ImportError: raise ImportError( - "Could not import the `promptguard` Python package, " - "please install it with `pip install promptguard`." + "Could not import the `opaqueprompts` Python package, " + "please install it with `pip install opaqueprompts`." ) - if pg.__package__ is None: + if op.__package__ is None: raise ValueError( - "Could not properly import `promptguard`, " - "promptguard.__package__ is None." + "Could not properly import `opaqueprompts`, " + "opaqueprompts.__package__ is None." ) api_key = get_from_dict_or_env( - values, "promptguard_api_key", "PROMPTGUARD_API_KEY", default="" + values, "opaqueprompts_api_key", "OPAQUEPROMPTS_API_KEY", default="" ) if not api_key: raise ValueError( - "Could not find PROMPTGUARD_API_KEY in the environment. 
" - "Please set it to your PromptGuard API key." - "You can get it by creating an account on the PromptGuard website: " - "https://promptguard.opaque.co/ ." + "Could not find OPAQUEPROMPTS_API_KEY in the environment. " + "Please set it to your OpaquePrompts API key." + "You can get it by creating an account on the OpaquePrompts website: " + "https://opaqueprompts.opaque.co/ ." ) return values @@ -83,14 +83,14 @@ class PromptGuard(LLM): Example: .. code-block:: python - response = prompt_guard_llm("Tell me a joke.") + response = op_llm("Tell me a joke.") """ - import promptguard as pg + import opaqueprompts as op _run_manager = run_manager or CallbackManagerForLLMRun.get_noop_manager() # sanitize the prompt by replacing the sensitive information with a placeholder - sanitize_response: pg.SanitizeResponse = pg.sanitize([prompt]) + sanitize_response: op.SanitizeResponse = op.sanitize([prompt]) sanitized_prompt_value_str = sanitize_response.sanitized_texts[0] # TODO: Add in callbacks once child runs for LLMs are supported by LangSmith. @@ -101,7 +101,7 @@ class PromptGuard(LLM): ) # desanitize the response by restoring the original sensitive information - desanitize_response: pg.DesanitizeResponse = pg.desanitize( + desanitize_response: op.DesanitizeResponse = op.desanitize( llm_response, secure_context=sanitize_response.secure_context, ) @@ -113,4 +113,4 @@ class PromptGuard(LLM): This is an override of the base class method. 
""" - return "promptguard" + return "opaqueprompts" diff --git a/libs/langchain/langchain/utilities/promptguard.py b/libs/langchain/langchain/utilities/opaqueprompts.py similarity index 83% rename from libs/langchain/langchain/utilities/promptguard.py rename to libs/langchain/langchain/utilities/opaqueprompts.py index df29cafa4..23b02fdf2 100644 --- a/libs/langchain/langchain/utilities/promptguard.py +++ b/libs/langchain/langchain/utilities/opaqueprompts.py @@ -31,16 +31,16 @@ def sanitize( The `secure_context` needs to be passed to the `desanitize` function. """ try: - import promptguard as pg + import opaqueprompts as op except ImportError: raise ImportError( - "Could not import the `promptguard` Python package, " - "please install it with `pip install promptguard`." + "Could not import the `opaqueprompts` Python package, " + "please install it with `pip install opaqueprompts`." ) if isinstance(input, str): # the input could be a string, so we sanitize the string - sanitize_response: pg.SanitizeResponse = pg.sanitize([input]) + sanitize_response: op.SanitizeResponse = op.sanitize([input]) return { "sanitized_input": sanitize_response.sanitized_texts[0], "secure_context": sanitize_response.secure_context, @@ -55,7 +55,7 @@ def sanitize( values.append(input[key]) # sanitize the values - sanitize_values_response: pg.SanitizeResponse = pg.sanitize(values) + sanitize_values_response: op.SanitizeResponse = op.sanitize(values) # reconstruct the dict with the sanitized values sanitized_input_values = sanitize_values_response.sanitized_texts @@ -85,13 +85,13 @@ def desanitize(sanitized_text: str, secure_context: bytes) -> str: De-sanitized text. """ try: - import promptguard as pg + import opaqueprompts as op except ImportError: raise ImportError( - "Could not import the `promptguard` Python package, " - "please install it with `pip install promptguard`." + "Could not import the `opaqueprompts` Python package, " + "please install it with `pip install opaqueprompts`." 
) - desanitize_response: pg.DesanitizeResponse = pg.desanitize( + desanitize_response: op.DesanitizeResponse = op.desanitize( sanitized_text, secure_context ) return desanitize_response.desanitized_text diff --git a/libs/langchain/tests/integration_tests/llms/test_promptguard.py b/libs/langchain/tests/integration_tests/llms/test_opaqueprompts.py similarity index 91% rename from libs/langchain/tests/integration_tests/llms/test_promptguard.py rename to libs/langchain/tests/integration_tests/llms/test_opaqueprompts.py index 599df595a..1a2fb604b 100644 --- a/libs/langchain/tests/integration_tests/llms/test_promptguard.py +++ b/libs/langchain/tests/integration_tests/llms/test_opaqueprompts.py @@ -1,7 +1,7 @@ -import langchain.utilities.promptguard as pgf +import langchain.utilities.opaqueprompts as op from langchain import LLMChain, PromptTemplate from langchain.llms import OpenAI -from langchain.llms.promptguard import PromptGuard +from langchain.llms.opaqueprompts import OpaquePrompts from langchain.memory import ConversationBufferWindowMemory from langchain.schema.output_parser import StrOutputParser from langchain.schema.runnable import RunnableMap @@ -42,10 +42,10 @@ Question: ```{question}``` """ -def test_promptguard() -> None: +def test_opaqueprompts() -> None: chain = LLMChain( prompt=PromptTemplate.from_template(prompt_template), - llm=PromptGuard(llm=OpenAI()), + llm=OpaquePrompts(llm=OpenAI()), memory=ConversationBufferWindowMemory(k=2), ) @@ -58,11 +58,11 @@ def test_promptguard() -> None: assert isinstance(output, str) -def test_promptguard_functions() -> None: +def test_opaqueprompts_functions() -> None: prompt = (PromptTemplate.from_template(prompt_template),) llm = OpenAI() pg_chain = ( - pgf.sanitize + op.sanitize | RunnableMap( { "response": (lambda x: x["sanitized_input"]) # type: ignore @@ -72,7 +72,7 @@ def test_promptguard_functions() -> None: "secure_context": lambda x: x["secure_context"], } ) - | (lambda x: pgf.desanitize(x["response"], 
x["secure_context"])) + | (lambda x: op.desanitize(x["response"], x["secure_context"])) ) pg_chain.invoke( From 74fcfed4e2bdd186c2869a07008175a9b66b1ed4 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 31 Aug 2023 15:55:29 -0400 Subject: [PATCH 10/18] lint for pydantic imports (#9937) Catch pydantic imports --- libs/langchain/Makefile | 1 + libs/langchain/scripts/check_pydantic.sh | 27 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100755 libs/langchain/scripts/check_pydantic.sh diff --git a/libs/langchain/Makefile b/libs/langchain/Makefile index c4cd64cd8..14b7ec338 100644 --- a/libs/langchain/Makefile +++ b/libs/langchain/Makefile @@ -76,6 +76,7 @@ lint format: PYTHON_FILES=. lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/langchain --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$') lint lint_diff: + ./scripts/check_pydantic.sh . poetry run ruff . poetry run black $(PYTHON_FILES) --check poetry run mypy $(PYTHON_FILES) diff --git a/libs/langchain/scripts/check_pydantic.sh b/libs/langchain/scripts/check_pydantic.sh new file mode 100755 index 000000000..7c2d9c5c0 --- /dev/null +++ b/libs/langchain/scripts/check_pydantic.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# +# This script searches for lines starting with "import pydantic" or "from pydantic" +# in tracked files within a Git repository. +# +# Usage: ./scripts/check_pydantic.sh /path/to/repository + +# Check if a path argument is provided +if [ $# -ne 1 ]; then + echo "Usage: $0 /path/to/repository" + exit 1 +fi + +repository_path="$1" + +# Search for lines matching the pattern within the specified repository +result=$(git -C "$repository_path" grep -E '^import pydantic|^from pydantic') + +# Check if any matching lines were found +if [ -n "$result" ]; then + echo "ERROR: The following lines need to be updated:" + echo "$result" + echo "Please replace the code with an import from langchain.pydantic_v1." 
+ echo "For example, replace 'from pydantic import BaseModel'" + echo "with 'from langchain.pydantic_v1 import BaseModel'" + exit 1 +fi From 02e51f4217207eed4fc9ac89735cf1f660be3f10 Mon Sep 17 00:00:00 2001 From: Robert Perrotta <104582251+robert-perrotta@users.noreply.github.com> Date: Thu, 31 Aug 2023 18:25:59 -0400 Subject: [PATCH 11/18] update_forward_refs for Run (#9969) Adds a call to Pydantic's `update_forward_refs` for the `Run` class (in addition to the `ChainRun` and `ToolRun` classes, for which that method is already called). Without it, the self-reference of child classes (type `List[Run]`) is problematic. For example: ```python from langchain.callbacks import StdOutCallbackHandler from langchain.chains import LLMChain from langchain.llms import OpenAI from langchain.prompts import PromptTemplate from wandb.integration.langchain import WandbTracer llm = OpenAI() prompt = PromptTemplate.from_template("1 + {number} = ") chain = LLMChain(llm=llm, prompt=prompt, callbacks=[StdOutCallbackHandler(), WandbTracer()]) print(chain.run(number=2)) ``` results in the following output before the change ``` WARNING:root:Error in on_chain_start callback: field "child_runs" not yet prepared so type is still a ForwardRef, you might need to call Run.update_forward_refs(). > Entering new LLMChain chain... Prompt after formatting: 1 + 2 = WARNING:root:Error in on_chain_end callback: No chain Run found to be traced > Finished chain. 3 ``` but afterwards the callback error messages are gone. 
--- libs/langchain/langchain/callbacks/tracers/schemas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/langchain/langchain/callbacks/tracers/schemas.py b/libs/langchain/langchain/callbacks/tracers/schemas.py index 37f33763f..b08b454a9 100644 --- a/libs/langchain/langchain/callbacks/tracers/schemas.py +++ b/libs/langchain/langchain/callbacks/tracers/schemas.py @@ -120,6 +120,7 @@ class Run(BaseRunV2): ChainRun.update_forward_refs() ToolRun.update_forward_refs() +Run.update_forward_refs() __all__ = [ "BaseRun", From 86646ec555970e01130994dc75f3a0c5d4e52de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 1 Sep 2023 00:47:44 +0200 Subject: [PATCH 12/18] feat: Add `ApifyWrapper` class (#10067) If you look at documentation https://python.langchain.com/docs/integrations/tools/apify (or the actual file https://github.com/langchain-ai/langchain/blob/master/docs/extras/integrations/tools/apify.ipynb ), there's a class `ApifyWrapper` mentioned. It seems it got lost in some refactoring, i.e. it does not exist in the codebase ATM. I just propose to add it back. It would fix issues e.g. https://github.com/langchain-ai/langchain/issues/8307 or https://github.com/langchain-ai/langchain/issues/8201 To add, Apify is a wanted integration, e.g. see https://twitter.com/hwchase17/status/1695490295914545626 or https://twitter.com/hwchase17/status/1695470765343461756 Lastly, I offer taking ownership of the Apify-related parts of the codebase, so you can tag me if anything is needed. 
--- .../langchain/langchain/utilities/__init__.py | 2 + libs/langchain/langchain/utilities/apify.py | 194 ++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 libs/langchain/langchain/utilities/apify.py diff --git a/libs/langchain/langchain/utilities/__init__.py b/libs/langchain/langchain/utilities/__init__.py index 3365c6044..9f7ebc7dc 100644 --- a/libs/langchain/langchain/utilities/__init__.py +++ b/libs/langchain/langchain/utilities/__init__.py @@ -4,6 +4,7 @@ Other LangChain classes use **Utilities** to interact with third-part systems and packages. """ from langchain.utilities.alpha_vantage import AlphaVantageAPIWrapper +from langchain.utilities.apify import ApifyWrapper from langchain.utilities.arxiv import ArxivAPIWrapper from langchain.utilities.awslambda import LambdaWrapper from langchain.utilities.bash import BashProcess @@ -38,6 +39,7 @@ from langchain.utilities.zapier import ZapierNLAWrapper __all__ = [ "AlphaVantageAPIWrapper", + "ApifyWrapper", "ArxivAPIWrapper", "BashProcess", "BibtexparserWrapper", diff --git a/libs/langchain/langchain/utilities/apify.py b/libs/langchain/langchain/utilities/apify.py new file mode 100644 index 000000000..dd7ddcd01 --- /dev/null +++ b/libs/langchain/langchain/utilities/apify.py @@ -0,0 +1,194 @@ +from typing import Any, Callable, Dict, Optional + +from langchain.document_loaders import ApifyDatasetLoader +from langchain.document_loaders.base import Document +from langchain.pydantic_v1 import BaseModel, root_validator +from langchain.utils import get_from_dict_or_env + + +class ApifyWrapper(BaseModel): + """Wrapper around Apify. + To use, you should have the ``apify-client`` python package installed, + and the environment variable ``APIFY_API_TOKEN`` set with your API key, or pass + `apify_api_token` as a named parameter to the constructor. + """ + + apify_client: Any + apify_client_async: Any + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + """Validate environment. 
+ Validate that an Apify API token is set and the apify-client + Python package exists in the current environment. + """ + apify_api_token = get_from_dict_or_env( + values, "apify_api_token", "APIFY_API_TOKEN" + ) + + try: + from apify_client import ApifyClient, ApifyClientAsync + + values["apify_client"] = ApifyClient(apify_api_token) + values["apify_client_async"] = ApifyClientAsync(apify_api_token) + except ImportError: + raise ValueError( + "Could not import apify-client Python package. " + "Please install it with `pip install apify-client`." + ) + + return values + + def call_actor( + self, + actor_id: str, + run_input: Dict, + dataset_mapping_function: Callable[[Dict], Document], + *, + build: Optional[str] = None, + memory_mbytes: Optional[int] = None, + timeout_secs: Optional[int] = None, + ) -> ApifyDatasetLoader: + """Run an Actor on the Apify platform and wait for results to be ready. + Args: + actor_id (str): The ID or name of the Actor on the Apify platform. + run_input (Dict): The input object of the Actor that you're trying to run. + dataset_mapping_function (Callable): A function that takes a single + dictionary (an Apify dataset item) and converts it to an + instance of the Document class. + build (str, optional): Optionally specifies the actor build to run. + It can be either a build tag or build number. + memory_mbytes (int, optional): Optional memory limit for the run, + in megabytes. + timeout_secs (int, optional): Optional timeout for the run, in seconds. + Returns: + ApifyDatasetLoader: A loader that will fetch the records from the + Actor run's default dataset. 
+ """ + actor_call = self.apify_client.actor(actor_id).call( + run_input=run_input, + build=build, + memory_mbytes=memory_mbytes, + timeout_secs=timeout_secs, + ) + + return ApifyDatasetLoader( + dataset_id=actor_call["defaultDatasetId"], + dataset_mapping_function=dataset_mapping_function, + ) + + async def acall_actor( + self, + actor_id: str, + run_input: Dict, + dataset_mapping_function: Callable[[Dict], Document], + *, + build: Optional[str] = None, + memory_mbytes: Optional[int] = None, + timeout_secs: Optional[int] = None, + ) -> ApifyDatasetLoader: + """Run an Actor on the Apify platform and wait for results to be ready. + Args: + actor_id (str): The ID or name of the Actor on the Apify platform. + run_input (Dict): The input object of the Actor that you're trying to run. + dataset_mapping_function (Callable): A function that takes a single + dictionary (an Apify dataset item) and converts it to + an instance of the Document class. + build (str, optional): Optionally specifies the actor build to run. + It can be either a build tag or build number. + memory_mbytes (int, optional): Optional memory limit for the run, + in megabytes. + timeout_secs (int, optional): Optional timeout for the run, in seconds. + Returns: + ApifyDatasetLoader: A loader that will fetch the records from the + Actor run's default dataset. + """ + actor_call = await self.apify_client_async.actor(actor_id).call( + run_input=run_input, + build=build, + memory_mbytes=memory_mbytes, + timeout_secs=timeout_secs, + ) + + return ApifyDatasetLoader( + dataset_id=actor_call["defaultDatasetId"], + dataset_mapping_function=dataset_mapping_function, + ) + + def call_actor_task( + self, + task_id: str, + task_input: Dict, + dataset_mapping_function: Callable[[Dict], Document], + *, + build: Optional[str] = None, + memory_mbytes: Optional[int] = None, + timeout_secs: Optional[int] = None, + ) -> ApifyDatasetLoader: + """Run a saved Actor task on Apify and wait for results to be ready. 
+ Args: + task_id (str): The ID or name of the task on the Apify platform. + task_input (Dict): The input object of the task that you're trying to run. + Overrides the task's saved input. + dataset_mapping_function (Callable): A function that takes a single + dictionary (an Apify dataset item) and converts it to an + instance of the Document class. + build (str, optional): Optionally specifies the actor build to run. + It can be either a build tag or build number. + memory_mbytes (int, optional): Optional memory limit for the run, + in megabytes. + timeout_secs (int, optional): Optional timeout for the run, in seconds. + Returns: + ApifyDatasetLoader: A loader that will fetch the records from the + task run's default dataset. + """ + task_call = self.apify_client.task(task_id).call( + task_input=task_input, + build=build, + memory_mbytes=memory_mbytes, + timeout_secs=timeout_secs, + ) + + return ApifyDatasetLoader( + dataset_id=task_call["defaultDatasetId"], + dataset_mapping_function=dataset_mapping_function, + ) + + async def acall_actor_task( + self, + task_id: str, + task_input: Dict, + dataset_mapping_function: Callable[[Dict], Document], + *, + build: Optional[str] = None, + memory_mbytes: Optional[int] = None, + timeout_secs: Optional[int] = None, + ) -> ApifyDatasetLoader: + """Run a saved Actor task on Apify and wait for results to be ready. + Args: + task_id (str): The ID or name of the task on the Apify platform. + task_input (Dict): The input object of the task that you're trying to run. + Overrides the task's saved input. + dataset_mapping_function (Callable): A function that takes a single + dictionary (an Apify dataset item) and converts it to an + instance of the Document class. + build (str, optional): Optionally specifies the actor build to run. + It can be either a build tag or build number. + memory_mbytes (int, optional): Optional memory limit for the run, + in megabytes. + timeout_secs (int, optional): Optional timeout for the run, in seconds. 
+ Returns: + ApifyDatasetLoader: A loader that will fetch the records from the + task run's default dataset. + """ + task_call = await self.apify_client_async.task(task_id).call( + task_input=task_input, + build=build, + memory_mbytes=memory_mbytes, + timeout_secs=timeout_secs, + ) + + return ApifyDatasetLoader( + dataset_id=task_call["defaultDatasetId"], + dataset_mapping_function=dataset_mapping_function, + ) From cc6a20d3e6fd27953f8d9bde3d928ae709662b54 Mon Sep 17 00:00:00 2001 From: Jon Bennion <120141355+j-space-b@users.noreply.github.com> Date: Thu, 31 Aug 2023 16:05:18 -0700 Subject: [PATCH 13/18] updated prompt name in documentation for sequential chain (#10048) Description: updated the prompt name in a sequential chain example so that it is not overwritten by the same prompt name in the next chain (this is a sequential chain example) Issue: n/a Dependencies: none Tag maintainer: not known Twitter handle: not on twitter, feel free to use my git username for anything --- .../chains/foundational/sequential_chains.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/snippets/modules/chains/foundational/sequential_chains.mdx b/docs/snippets/modules/chains/foundational/sequential_chains.mdx index 977470681..d8a22bebf 100644 --- a/docs/snippets/modules/chains/foundational/sequential_chains.mdx +++ b/docs/snippets/modules/chains/foundational/sequential_chains.mdx @@ -8,12 +8,12 @@ from langchain.prompts import PromptTemplate ```python # This is an LLMChain to write a synopsis given a title of a play. llm = OpenAI(temperature=.7) -template = """You are a playwright. Given the title of play, it is your job to write a synopsis for that title. +synopsis_template = """You are a playwright. Given the title of play, it is your job to write a synopsis for that title. 
Title: {title} Playwright: This is a synopsis for the above play:""" -prompt_template = PromptTemplate(input_variables=["title"], template=template) -synopsis_chain = LLMChain(llm=llm, prompt=prompt_template) +synopsis_prompt_template = PromptTemplate(input_variables=["title"], template=synopsis_template) +synopsis_chain = LLMChain(llm=llm, prompt=synopsis_prompt_template) ``` @@ -95,13 +95,13 @@ Of particular importance is how we name the input/output variable names. In the ```python # This is an LLMChain to write a synopsis given a title of a play and the era it is set in. llm = OpenAI(temperature=.7) -template = """You are a playwright. Given the title of play and the era it is set in, it is your job to write a synopsis for that title. +synopsis_template = """You are a playwright. Given the title of play and the era it is set in, it is your job to write a synopsis for that title. Title: {title} Era: {era} Playwright: This is a synopsis for the above play:""" -prompt_template = PromptTemplate(input_variables=["title", "era"], template=template) -synopsis_chain = LLMChain(llm=llm, prompt=prompt_template, output_key="synopsis") +synopsis_prompt_template = PromptTemplate(input_variables=["title", "era"], template=synopsis_template) +synopsis_chain = LLMChain(llm=llm, prompt=synopsis_prompt_template, output_key="synopsis") ``` From c710c7303fc221b836c7495e3b8b1c2bacb260fc Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Fri, 1 Sep 2023 01:05:46 +0200 Subject: [PATCH 14/18] fix wrong import line in cassandra doc page for vector store (#10041) This fixes the example import line in the general "cassandra" doc page mdx file. (it was erroneously a copy of the chat message history import statement found below).
--- docs/extras/integrations/providers/cassandra.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/integrations/providers/cassandra.mdx b/docs/extras/integrations/providers/cassandra.mdx index 3ab57a83d..430df6108 100644 --- a/docs/extras/integrations/providers/cassandra.mdx +++ b/docs/extras/integrations/providers/cassandra.mdx @@ -21,7 +21,7 @@ pip install cassio See a [usage example](/docs/integrations/vectorstores/cassandra). ```python -from langchain.memory import CassandraChatMessageHistory +from langchain.vectorstores import Cassandra ``` From 566ce06f4a7b3d7c02696b4ad1e5c4032e501856 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Thu, 31 Aug 2023 16:52:05 -0700 Subject: [PATCH 15/18] add async support for tools (#10058) --- libs/langchain/langchain/tools/base.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/tools/base.py b/libs/langchain/langchain/tools/base.py index 5d59f7959..9ad81033d 100644 --- a/libs/langchain/langchain/tools/base.py +++ b/libs/langchain/langchain/tools/base.py @@ -273,7 +273,11 @@ class ChildTool(BaseTool): Add run_manager: Optional[AsyncCallbackManagerForToolRun] = None to child implementations to enable tracing, """ - raise NotImplementedError() + return await asyncio.get_running_loop().run_in_executor( + None, + partial(self._run, **kwargs), + *args, + ) def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]: # For backwards compatibility, if run_input is a string, @@ -522,7 +526,10 @@ class Tool(BaseTool): if new_argument_supported else await self.coroutine(*args, **kwargs) ) - raise NotImplementedError("Tool does not support async") + else: + return await asyncio.get_running_loop().run_in_executor( + None, partial(self._run, run_manager=run_manager, **kwargs), *args + ) # TODO: this is for backwards compatibility, remove in future def __init__( @@ -634,7 +641,12 @@ class StructuredTool(BaseTool): if 
new_argument_supported else await self.coroutine(*args, **kwargs) ) - raise NotImplementedError("Tool does not support async") + return await asyncio.get_running_loop().run_in_executor( + None, + self._run, + partial(self._run, run_manager=run_manager, **kwargs), + *args, + ) @classmethod def from_function( From ad9e242a7a2284d6435ec3ae28b44942deee7e9d Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Thu, 31 Aug 2023 16:52:28 -0700 Subject: [PATCH 16/18] add snippet for max concurrency (#9892) --- .../expression_language/interface.ipynb | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/docs/extras/guides/expression_language/interface.ipynb b/docs/extras/guides/expression_language/interface.ipynb index 98a1a3860..cf19bfe4d 100644 --- a/docs/extras/guides/expression_language/interface.ipynb +++ b/docs/extras/guides/expression_language/interface.ipynb @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "d1850a1f", "metadata": {}, "outputs": [], @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "56d0669f", "metadata": {}, "outputs": [], @@ -170,6 +170,36 @@ "chain.batch([{\"topic\": \"bears\"}, {\"topic\": \"cats\"}])" ] }, + { + "cell_type": "markdown", + "id": "2434ab15", + "metadata": {}, + "source": [ + "You can set the number of concurrent requests by using the `max_concurrency` parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a08522f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False),\n", + " AIMessage(content=\"Why don't cats play poker in the wild?\\n\\nToo many cheetahs!\", additional_kwargs={}, example=False)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.batch([{\"topic\": 
\"bears\"}, {\"topic\": \"cats\"}], config={\"max_concurrency\": 5})" + ] + }, { "cell_type": "markdown", "id": "b960cbfe", From 3f8f3de28e9e33cbf0889111a0362116de7a8928 Mon Sep 17 00:00:00 2001 From: Davide Menini <48685774+dmenini@users.noreply.github.com> Date: Fri, 1 Sep 2023 02:11:52 +0200 Subject: [PATCH 17/18] fix (parsers/json): do not escape double quotes if already escaped (#9916) This PR fixes an issues I found when upgrading to a more recent version of Langchain. I was using 0.0.142 before, and this issue popped up already when the `_custom_parser` was added to `output_parsers/json`. Anyway, the issue is that the parser tries to escape quotes when they are double-escaped (e.g. `\\"`), leading to OutputParserException. This is particularly undesired in my app, because I have an Agent that uses a single input Tool, which expects as input a JSON string with the structure: ```python { "foo": string, "bar": string } ``` The LLM (GPT3.5) response is (almost) always something like `"action_input": "{\\"foo\\": \\"bar\\", \\"bar\\": \\"foo\\"}"` and since the upgrade this is not correctly parsed. --------- Co-authored-by: taamedag --- .../langchain/output_parsers/json.py | 2 +- .../unit_tests/output_parsers/test_json.py | 52 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/output_parsers/json.py b/libs/langchain/langchain/output_parsers/json.py index f0f653eca..7465aba2f 100644 --- a/libs/langchain/langchain/output_parsers/json.py +++ b/libs/langchain/langchain/output_parsers/json.py @@ -13,7 +13,7 @@ def _replace_new_line(match: re.Match[str]) -> str: value = re.sub(r"\n", r"\\n", value) value = re.sub(r"\r", r"\\r", value) value = re.sub(r"\t", r"\\t", value) - value = re.sub('"', r"\"", value) + value = re.sub(r'(? None: "action": "Final Answer", "action_input": '```bar\n
\n\ttext\n
```', } + + +TEST_CASES_ESCAPED_QUOTES = [ + JSON_WITH_UNESCAPED_QUOTES_IN_NESTED_JSON, + JSON_WITH_ESCAPED_QUOTES_IN_NESTED_JSON, + JSON_WITH_ESCAPED_DOUBLE_QUOTES_IN_NESTED_JSON, +] + + +@pytest.mark.parametrize("json_string", TEST_CASES_ESCAPED_QUOTES) +def test_parse_nested_json_with_escaped_quotes(json_string: str) -> None: + parsed = parse_json_markdown(json_string) + assert parsed == { + "action": "Final Answer", + "action_input": '{"foo": "bar", "bar": "foo"}', + } + + +def test_parse_json_with_python_dict() -> None: + parsed = parse_json_markdown(JSON_WITH_PYTHON_DICT) + assert parsed == { + "action": "Final Answer", + "action_input": {"foo": "bar", "bar": "foo"}, + } From 324c86acd5be9bc9d5b6dd248d686bdbb2c11cdc Mon Sep 17 00:00:00 2001 From: jmhayes3 <22490346+jmhayes3@users.noreply.github.com> Date: Fri, 1 Sep 2023 00:19:03 -0500 Subject: [PATCH 18/18] fix typo in web_research.py (#10076) fix spelling --- libs/langchain/langchain/retrievers/web_research.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/retrievers/web_research.py b/libs/langchain/langchain/retrievers/web_research.py index 30d9c04cb..e8e96a2ab 100644 --- a/libs/langchain/langchain/retrievers/web_research.py +++ b/libs/langchain/langchain/retrievers/web_research.py @@ -150,7 +150,7 @@ class WebResearchRetriever(BaseRetriever): return query.strip() def search_tool(self, query: str, num_search_results: int = 1) -> List[dict]: - """Returns num_serch_results pages per Google search.""" + """Returns num_search_results pages per Google search.""" query_clean = self.clean_search_query(query) result = self.search.results(query_clean, num_search_results) return result