{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ec1d7a9a",
   "metadata": {},
   "source": [
    "# Document Comparison\n",
    "\n",
    "This notebook shows how to use an agent to compare two documents.\n",
    "\n",
    "The high level idea is we will create a question-answering chain for each document, and then use that "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8632a37c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/harrisonchase/.pyenv/versions/3.9.1/envs/langchain/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.4) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from pydantic import BaseModel, Field\n",
    "\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.agents import Tool\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores import FAISS\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "from langchain.chains import RetrievalQA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "64f19917",
   "metadata": {},
   "outputs": [],
   "source": [
    "class DocumentInput(BaseModel):\n",
    "    question: str = Field()\n",
    "\n",
    "\n",
    "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
    "\n",
    "tools = []\n",
    "files = [\n",
    "    # https://abc.xyz/investor/static/pdf/2023Q1_alphabet_earnings_release.pdf\n",
    "    {\n",
    "        \"name\": \"alphabet-earnings\", \n",
    "        \"path\": \"/Users/harrisonchase/Downloads/2023Q1_alphabet_earnings_release.pdf\",\n",
    "    }, \n",
    "    # https://digitalassets.tesla.com/tesla-contents/image/upload/IR/TSLA-Q1-2023-Update\n",
    "    {\n",
    "        \"name\": \"tesla-earnings\", \n",
    "        \"path\": \"/Users/harrisonchase/Downloads/TSLA-Q1-2023-Update.pdf\"\n",
    "    }\n",
    "]\n",
    "\n",
    "for file in files:\n",
    "    loader = PyPDFLoader(file[\"path\"])\n",
    "    pages = loader.load_and_split()\n",
    "    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
    "    docs = text_splitter.split_documents(pages)\n",
    "    embeddings = OpenAIEmbeddings()\n",
    "    retriever = FAISS.from_documents(docs, embeddings).as_retriever()\n",
    "    \n",
    "    # Wrap retrievers in a Tool\n",
    "    tools.append(\n",
    "        Tool(\n",
    "            args_schema=DocumentInput,\n",
    "            name=file[\"name\"], \n",
    "            description=f\"useful when you want to answer questions about {file['name']}\",\n",
    "            func=RetrievalQA.from_chain_type(llm=llm, retriever=retriever)\n",
    "        )\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "eca02549",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.agents import initialize_agent\n",
    "from langchain.agents import AgentType"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c4d56c25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new  chain...\u001b[0m\n",
      "\u001b[32;1m\u001b[1;3m\n",
      "Invoking: `alphabet-earnings` with `{'question': 'revenue'}`\n",
      "\n",
      "\n",
      "\u001b[0m\u001b[36;1m\u001b[1;3m{'query': 'revenue', 'result': 'The revenue for Alphabet Inc. in the first quarter of 2023 was $69,787 million.'}\u001b[0m\u001b[32;1m\u001b[1;3m\n",
      "Invoking: `tesla-earnings` with `{'question': 'revenue'}`\n",
      "\n",
      "\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m{'query': 'revenue', 'result': 'Total revenue for Q1-2023 was $23.3 billion.'}\u001b[0m\u001b[32;1m\u001b[1;3mAlphabet Inc. had more revenue than Tesla. In the first quarter of 2023, Alphabet Inc. had a revenue of $69,787 million, while Tesla had a revenue of $23.3 billion.\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished chain.\u001b[0m\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'input': 'did alphabet or tesla have more revenue?',\n",
       " 'output': 'Alphabet Inc. had more revenue than Tesla. In the first quarter of 2023, Alphabet Inc. had a revenue of $69,787 million, while Tesla had a revenue of $23.3 billion.'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "llm = ChatOpenAI(\n",
    "    temperature=0,\n",
    "    model=\"gpt-3.5-turbo-0613\", \n",
    ")\n",
    "\n",
    "agent = initialize_agent(\n",
    "    agent=AgentType.OPENAI_FUNCTIONS,\n",
    "    tools=tools,\n",
    "    llm=llm,\n",
    "    verbose=True,\n",
    ")\n",
    "\n",
    "agent({\"input\": \"did alphabet or tesla have more revenue?\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6db4c853",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}