{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ec1d7a9a",
   "metadata": {},
   "source": [
    "# Document Comparison\n",
    "\n",
    "This notebook shows how to use an agent to compare two documents.\n",
    "\n",
    "The high-level idea is that we create a question-answering chain for each document, wrap each chain in a tool, and then let an agent call those tools to compare the documents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8632a37c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "from pydantic import BaseModel, Field\n",
    "\n",
    "from langchain.agents import AgentType, Tool, initialize_agent\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores import FAISS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "64f19917",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that holds the downloaded PDFs; point this at wherever you saved them.\n",
    "DATA_DIR = Path.home() / \"Downloads\"\n",
    "\n",
    "\n",
    "class DocumentInput(BaseModel):\n",
    "    # The single argument each document tool accepts: the question to ask.\n",
    "    question: str = Field()\n",
    "\n",
    "\n",
    "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
    "\n",
    "files = [\n",
    "    # https://abc.xyz/investor/static/pdf/2023Q1_alphabet_earnings_release.pdf\n",
    "    {\n",
    "        \"name\": \"alphabet-earnings\",\n",
    "        \"path\": DATA_DIR / \"2023Q1_alphabet_earnings_release.pdf\",\n",
    "    },\n",
    "    # https://digitalassets.tesla.com/tesla-contents/image/upload/IR/TSLA-Q1-2023-Update\n",
    "    {\n",
    "        \"name\": \"tesla-earnings\",\n",
    "        \"path\": DATA_DIR / \"TSLA-Q1-2023-Update.pdf\",\n",
    "    },\n",
    "]\n",
    "\n",
    "# Neither the splitter nor the embedding client depends on the file,\n",
    "# so build them once instead of once per document.\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
    "embeddings = OpenAIEmbeddings()\n",
    "\n",
    "tools = []\n",
    "for file in files:\n",
    "    loader = PyPDFLoader(str(file[\"path\"]))\n",
    "    pages = loader.load_and_split()\n",
    "    docs = text_splitter.split_documents(pages)\n",
    "    retriever = FAISS.from_documents(docs, embeddings).as_retriever()\n",
    "\n",
    "    # Wrap each document's retrieval QA chain in a Tool the agent can call by name.\n",
    "    tools.append(\n",
    "        Tool(\n",
    "            args_schema=DocumentInput,\n",
    "            name=file[\"name\"],\n",
    "            description=f\"useful when you want to answer questions about {file['name']}\",\n",
    "            func=RetrievalQA.from_chain_type(llm=llm, retriever=retriever),\n",
    "        )\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c4d56c25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new chain...\u001b[0m\n",
      "\u001b[32;1m\u001b[1;3m\n",
      "Invoking: `alphabet-earnings` with `{'question': 'revenue'}`\n",
      "\n",
      "\n",
      "\u001b[0m\u001b[36;1m\u001b[1;3m{'query': 'revenue', 'result': 'The revenue for Alphabet Inc. in the first quarter of 2023 was $69,787 million.'}\u001b[0m\u001b[32;1m\u001b[1;3m\n",
      "Invoking: `tesla-earnings` with `{'question': 'revenue'}`\n",
      "\n",
      "\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m{'query': 'revenue', 'result': 'Total revenue for Q1-2023 was $23.3 billion.'}\u001b[0m\u001b[32;1m\u001b[1;3mAlphabet Inc. had more revenue than Tesla. In the first quarter of 2023, Alphabet Inc. had a revenue of $69,787 million, while Tesla had a revenue of $23.3 billion.\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished chain.\u001b[0m\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'input': 'did alphabet or tesla have more revenue?',\n",
       " 'output': 'Alphabet Inc. had more revenue than Tesla. In the first quarter of 2023, Alphabet Inc. had a revenue of $69,787 million, while Tesla had a revenue of $23.3 billion.'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reuse the `llm` defined above (same model and temperature) rather than\n",
    "# constructing a second, identical client.\n",
    "agent = initialize_agent(\n",
    "    agent=AgentType.OPENAI_FUNCTIONS,\n",
    "    tools=tools,\n",
    "    llm=llm,\n",
    "    verbose=True,\n",
    ")\n",
    "\n",
    "agent({\"input\": \"did alphabet or tesla have more revenue?\"})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}