From 87fad8fc0005b90563f31976d04cf71ac7de716e Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Mon, 6 Feb 2023 20:02:19 -0800
Subject: [PATCH] analyze document (#731)

add analyze document chain, which does text splitting and then analysis
---
 .../analyze_document.ipynb                    | 178 ++++++++++++++++++
 langchain/chains/__init__.py                  |   2 +
 langchain/chains/combine_documents/base.py    |  36 +++-
 3 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100644 docs/modules/chains/combine_docs_examples/analyze_document.ipynb

diff --git a/docs/modules/chains/combine_docs_examples/analyze_document.ipynb b/docs/modules/chains/combine_docs_examples/analyze_document.ipynb
new file mode 100644
index 000000000..f451b3dab
--- /dev/null
+++ b/docs/modules/chains/combine_docs_examples/analyze_document.ipynb
@@ -0,0 +1,178 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ad719b65",
+   "metadata": {},
+   "source": [
+    "# Analyze Document\n",
+    "\n",
+    "The AnalyzeDocumentChain is more of an end-to-end chain. It takes in a single document, splits it up, and then runs the pieces through a CombineDocumentsChain, so you can go from raw text to a result in one call."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "15e1a8a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('../../state_of_the_union.txt') as f:\n",
+    "    state_of_the_union = f.read()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14da4012",
+   "metadata": {},
+   "source": [
+    "## Summarize\n",
+    "Let's take a look at it in action below, using it to summarize a long document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "765d6326",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain import OpenAI\n",
+    "from langchain.chains.summarize import load_summarize_chain\n",
+    "\n",
+    "llm = OpenAI(temperature=0)\n",
+    "summary_chain = load_summarize_chain(llm, chain_type=\"map_reduce\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3a3d3ebc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains import AnalyzeDocumentChain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "97178aad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summarize_document_chain = AnalyzeDocumentChain(combine_docs_chain=summary_chain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2e5a7bf7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\" In this speech, President Biden addresses the American people and the world, discussing the recent aggression of Russia's Vladimir Putin in Ukraine and the US response. He outlines economic sanctions and other measures taken to hold Putin accountable, and announces the US Department of Justice's task force to go after the crimes of Russian oligarchs. He also announces plans to fight inflation and lower costs for families, invest in American manufacturing, and provide military, economic, and humanitarian assistance to Ukraine. He calls for immigration reform, protecting the rights of women, and advancing the rights of LGBTQ+ Americans, and pays tribute to military families. He concludes with optimism for the future of America.\""
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "summarize_document_chain.run(state_of_the_union)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35739404",
+   "metadata": {},
+   "source": [
+    "## Question Answering\n",
+    "Let's take a look at this using a question answering chain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8b9b7705",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains.question_answering import load_qa_chain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "60c309a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qa_chain = load_qa_chain(llm, chain_type=\"map_reduce\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ba1fc940",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qa_document_chain = AnalyzeDocumentChain(combine_docs_chain=qa_chain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9aa1fbde",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "' The president thanked Justice Breyer for his service.'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "qa_document_chain.run(input_document=state_of_the_union, question=\"what did the president say about justice breyer?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7eb02f1e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/chains/__init__.py b/langchain/chains/__init__.py
index f63b32d8f..f71659ccb 100644
--- a/langchain/chains/__init__.py
+++ b/langchain/chains/__init__.py
@@ -1,6 +1,7 @@
 """Chains are easily reusable components which can be linked together."""
 from langchain.chains.api.base import APIChain
 from langchain.chains.chat_vector_db.base import ChatVectorDBChain
+from langchain.chains.combine_documents.base import AnalyzeDocumentChain
 from langchain.chains.conversation.base import ConversationChain
 from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
 from langchain.chains.llm import LLMChain
@@ -42,6 +43,7 @@ __all__ = [
     "OpenAIModerationChain",
     "SQLDatabaseSequentialChain",
     "load_chain",
+    "AnalyzeDocumentChain",
     "HypotheticalDocumentEmbedder",
     "ChatVectorDBChain",
 ]
diff --git a/langchain/chains/combine_documents/base.py b/langchain/chains/combine_documents/base.py
index 944440e94..40684e5bc 100644
--- a/langchain/chains/combine_documents/base.py
+++ b/langchain/chains/combine_documents/base.py
@@ -3,10 +3,11 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional, Tuple
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from langchain.chains.base import Chain
 from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
 
 
 class BaseCombineDocumentsChain(Chain, BaseModel, ABC):
@@ -49,3 +50,36 @@ class BaseCombineDocumentsChain(Chain, BaseModel, ABC):
         output, extra_return_dict = self.combine_docs(docs, **other_keys)
         extra_return_dict[self.output_key] = output
         return extra_return_dict
+
+
+class AnalyzeDocumentChain(Chain, BaseModel):
+    """Chain that splits a document, then analyzes it in pieces."""
+
+    input_key: str = "input_document"  #: :meta private:
+    output_key: str = "output_text"  #: :meta private:
+    text_splitter: TextSplitter = Field(default_factory=RecursiveCharacterTextSplitter)
+    combine_docs_chain: BaseCombineDocumentsChain
+
+    @property
+    def input_keys(self) -> List[str]:
+        """Expect input key.
+
+        :meta private:
+        """
+        return [self.input_key]
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Return output key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
+    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+        document = inputs[self.input_key]
+        docs = self.text_splitter.create_documents([document])
+        # Other keys are assumed to be needed for LLM prediction
+        other_keys = {k: v for k, v in inputs.items() if k != self.input_key}
+        other_keys[self.combine_docs_chain.input_key] = docs
+        return self.combine_docs_chain(other_keys, return_only_outputs=True)