From 96843f3bd4e70a953066f19fc9d49c7915e19457 Mon Sep 17 00:00:00 2001
From: Taqi Jaffri
Date: Tue, 1 Aug 2023 12:54:26 -0700
Subject: [PATCH 001/143] Fixed source key name for docugami loader
---
.../document_loaders/docugami.ipynb | 159 +++++++++---------
.../langchain/document_loaders/docugami.py | 6 +-
2 files changed, 86 insertions(+), 79 deletions(-)
diff --git a/docs/extras/integrations/document_loaders/docugami.ipynb b/docs/extras/integrations/document_loaders/docugami.ipynb
index b1386f115..fb12a02b3 100644
--- a/docs/extras/integrations/document_loaders/docugami.ipynb
+++ b/docs/extras/integrations/document_loaders/docugami.ipynb
@@ -15,14 +15,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"tags": []
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: lxml in /root/Source/github/docugami.langchain/libs/langchain/.venv/lib/python3.9/site-packages (4.9.3)\n"
+ ]
+ }
+ ],
"source": [
"# You need the lxml package to use the DocugamiLoader\n",
- "!pip install lxml"
+ "!poetry run pip install lxml"
]
},
{
@@ -50,7 +58,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -69,34 +77,34 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[Document(page_content='MUTUAL NON-DISCLOSURE AGREEMENT This Mutual Non-Disclosure Agreement (this “ Agreement ”) is entered into and made effective as of April 4 , 2018 between Docugami Inc. , a Delaware corporation , whose address is 150 Lake Street South , Suite 221 , Kirkland , Washington 98033 , and Caleb Divine , an individual, whose address is 1201 Rt 300 , Newburgh NY 12550 .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'ThisMutualNon-disclosureAgreement'}),\n",
- " Document(page_content='The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “Purpose”). In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Discussions', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Discussions'}),\n",
- " Document(page_content='In consideration of the foregoing, the parties agree as follows:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Consideration', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Consideration'}),\n",
- " Document(page_content='1. Confidential Information . For purposes of this Agreement , “ Confidential Information ” means any information or materials disclosed by one party to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within thirty ( 30 ) days after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Purposes/docset:ConfidentialInformation-section/docset:ConfidentialInformation[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ConfidentialInformation'}),\n",
- " Document(page_content=\"2. Obligations and Restrictions . Each party agrees: (i) to maintain the other party's Confidential Information in strict confidence; (ii) not to disclose such Confidential Information to any third party; and (iii) not to use such Confidential Information for any purpose except for the Purpose. Each party may disclose the other party’s Confidential Information to its employees and consultants who have a bona fide need to know such Confidential Information for the Purpose, but solely to the extent necessary to pursue the Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the other party’s Confidential Information as those set forth in this Agreement .\", metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Obligations/docset:ObligationsAndRestrictions-section/docset:ObligationsAndRestrictions', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ObligationsAndRestrictions'}),\n",
- " Document(page_content='3. Exceptions. The obligations and restrictions in Section 2 will not apply to any information or materials that:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Exceptions/docset:Exceptions-section/docset:Exceptions[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Exceptions'}),\n",
- " Document(page_content='(i) were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheDate/docset:TheDate', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheDate'}),\n",
- " Document(page_content='(ii) were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:SuchInformation/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n",
- " Document(page_content='(iii) are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheReceivingParty/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n",
- " Document(page_content='4. Compelled Disclosure . Nothing in this Agreement will be deemed to restrict a party from disclosing the other party’s Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Disclosure/docset:CompelledDisclosure-section/docset:CompelledDisclosure', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'CompelledDisclosure'}),\n",
- " Document(page_content='5. Return of Confidential Information . Upon the completion or abandonment of the Purpose, and in any event upon the disclosing party’s request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing party’s Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing party’s Confidential Information .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/docset:ReturnofConfidentialInformation', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ReturnofConfidentialInformation'}),\n",
- " Document(page_content='6. No Obligations . Each party retains the right to determine whether to disclose any Confidential Information to the other party.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoObligations'}),\n",
- " Document(page_content='7. No Warranty. ALL CONFIDENTIAL INFORMATION IS PROVIDED BY THE DISCLOSING PARTY “AS IS ”.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoWarranty'}),\n",
- " Document(page_content='8. Term. This Agreement will remain in effect for a period of seven ( 7 ) years from the date of last disclosure of Confidential Information by either party, at which time it will terminate.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:ThisAgreement/docset:Term-section/docset:Term', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Term'}),\n",
- " Document(page_content='9. Equitable Relief . Each party acknowledges that the unauthorized use or disclosure of the disclosing party’s Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of its Confidential Information , in addition to any other rights and remedies that it may have at law or otherwise.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'EquitableRelief'}),\n",
- " Document(page_content='10. Non-compete. To the maximum extent permitted by applicable law, during the Term of this Agreement and for a period of one ( 1 ) year thereafter, Caleb Divine may not market software products or do business that directly or indirectly competes with Docugami software products .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheMaximumExtent/docset:Non-compete-section/docset:Non-compete', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Non-compete'}),\n",
- " Document(page_content='11. Miscellaneous. This Agreement will be governed and construed in accordance with the laws of the State of Washington , excluding its body of law controlling conflict of laws. This Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this Agreement . If any provision of this Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this Agreement will be enforced to the maximum extent permissible and the other provisions of this Agreement will remain in full force and effect. Neither party may assign this Agreement , in whole or in part, by operation of law or otherwise, without the other party’s prior written consent, and any attempted assignment without such consent will be void. This Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Accordance/docset:Miscellaneous-section/docset:Miscellaneous', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Miscellaneous'}),\n",
- " Document(page_content='[SIGNATURE PAGE FOLLOWS] IN WITNESS WHEREOF, the parties hereto have executed this Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:TheParties', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheParties'}),\n",
- " Document(page_content='DOCUGAMI INC . : \\n\\n Caleb Divine : \\n\\n Signature: Signature: Name: \\n\\n Jean Paoli Name: Title: \\n\\n CEO Title:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:DocugamiInc/docset:DocugamiInc/xhtml:table', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': '', 'tag': 'table'})]"
+ "[Document(page_content='MUTUAL NON-DISCLOSURE AGREEMENT This Mutual Non-Disclosure Agreement (this “ Agreement ”) is entered into and made effective as of April 4 , 2018 between Docugami Inc. , a Delaware corporation , whose address is 150 Lake Street South , Suite 221 , Kirkland , Washington 98033 , and Caleb Divine , an individual, whose address is 1201 Rt 300 , Newburgh NY 12550 .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'ThisMutualNon-disclosureAgreement'}),\n",
+ " Document(page_content='The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “Purpose”). In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Discussions', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Discussions'}),\n",
+ " Document(page_content='In consideration of the foregoing, the parties agree as follows:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Consideration', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Consideration'}),\n",
+ " Document(page_content='1. Confidential Information . For purposes of this Agreement , “ Confidential Information ” means any information or materials disclosed by one party to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within thirty ( 30 ) days after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Purposes/docset:ConfidentialInformation-section/docset:ConfidentialInformation[2]', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ConfidentialInformation'}),\n",
+ " Document(page_content=\"2. Obligations and Restrictions . Each party agrees: (i) to maintain the other party's Confidential Information in strict confidence; (ii) not to disclose such Confidential Information to any third party; and (iii) not to use such Confidential Information for any purpose except for the Purpose. Each party may disclose the other party’s Confidential Information to its employees and consultants who have a bona fide need to know such Confidential Information for the Purpose, but solely to the extent necessary to pursue the Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the other party’s Confidential Information as those set forth in this Agreement .\", metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Obligations/docset:ObligationsAndRestrictions-section/docset:ObligationsAndRestrictions', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ObligationsAndRestrictions'}),\n",
+ " Document(page_content='3. Exceptions. The obligations and restrictions in Section 2 will not apply to any information or materials that:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Exceptions/docset:Exceptions-section/docset:Exceptions[2]', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Exceptions'}),\n",
+ " Document(page_content='(i) were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheDate/docset:TheDate', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheDate'}),\n",
+ " Document(page_content='(ii) were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:SuchInformation/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n",
+ " Document(page_content='(iii) are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheReceivingParty/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n",
+ " Document(page_content='4. Compelled Disclosure . Nothing in this Agreement will be deemed to restrict a party from disclosing the other party’s Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Disclosure/docset:CompelledDisclosure-section/docset:CompelledDisclosure', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'CompelledDisclosure'}),\n",
+ " Document(page_content='5. Return of Confidential Information . Upon the completion or abandonment of the Purpose, and in any event upon the disclosing party’s request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing party’s Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing party’s Confidential Information .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/docset:ReturnofConfidentialInformation', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ReturnofConfidentialInformation'}),\n",
+ " Document(page_content='6. No Obligations . Each party retains the right to determine whether to disclose any Confidential Information to the other party.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[2]', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoObligations'}),\n",
+ " Document(page_content='7. No Warranty. ALL CONFIDENTIAL INFORMATION IS PROVIDED BY THE DISCLOSING PARTY “AS IS ”.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[2]', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoWarranty'}),\n",
+ " Document(page_content='8. Term. This Agreement will remain in effect for a period of seven ( 7 ) years from the date of last disclosure of Confidential Information by either party, at which time it will terminate.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:ThisAgreement/docset:Term-section/docset:Term', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Term'}),\n",
+ " Document(page_content='9. Equitable Relief . Each party acknowledges that the unauthorized use or disclosure of the disclosing party’s Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of its Confidential Information , in addition to any other rights and remedies that it may have at law or otherwise.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[2]', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'EquitableRelief'}),\n",
+ " Document(page_content='10. Non-compete. To the maximum extent permitted by applicable law, during the Term of this Agreement and for a period of one ( 1 ) year thereafter, Caleb Divine may not market software products or do business that directly or indirectly competes with Docugami software products .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheMaximumExtent/docset:Non-compete-section/docset:Non-compete', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Non-compete'}),\n",
+ " Document(page_content='11. Miscellaneous. This Agreement will be governed and construed in accordance with the laws of the State of Washington , excluding its body of law controlling conflict of laws. This Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this Agreement . If any provision of this Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this Agreement will be enforced to the maximum extent permissible and the other provisions of this Agreement will remain in full force and effect. Neither party may assign this Agreement , in whole or in part, by operation of law or otherwise, without the other party’s prior written consent, and any attempted assignment without such consent will be void. This Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Accordance/docset:Miscellaneous-section/docset:Miscellaneous', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Miscellaneous'}),\n",
+ " Document(page_content='[SIGNATURE PAGE FOLLOWS] IN WITNESS WHEREOF, the parties hereto have executed this Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:TheParties', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheParties'}),\n",
+ " Document(page_content='DOCUGAMI INC . : \\n\\n Caleb Divine : \\n\\n Signature: Signature: Name: \\n\\n Jean Paoli Name: Title: \\n\\n CEO Title:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:DocugamiInc/docset:DocugamiInc/xhtml:table', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': '', 'tag': 'table'})]"
]
},
- "execution_count": 3,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -116,7 +124,7 @@
"source": [
"The `metadata` for each `Document` (really, a chunk of an actual PDF, DOC or DOCX) contains some useful additional information:\n",
"\n",
- "1. **id and name:** ID and Name of the file (PDF, DOC or DOCX) the chunk is sourced from within Docugami.\n",
+ "1. **id and source:** ID and name of the file (PDF, DOC or DOCX) the chunk is sourced from within Docugami; the file name is stored under the `source` key.\n",
"2. **xpath:** XPath inside the XML representation of the document, for the chunk. Useful for source citations directly to the actual chunk inside the document XML.\n",
"3. **structure:** Structural attributes of the chunk, e.g. h1, h2, div, table, td, etc. Useful to filter out certain kinds of chunks if needed by the caller.\n",
"4. **tag:** Semantic tag for the chunk, using various generative and extractive techniques. More details here: https://github.com/docugami/DFM-benchmarks"
@@ -133,7 +141,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -142,7 +150,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -168,17 +176,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 12,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Using embedded DuckDB without persistence: data will be transient\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"embedding = OpenAIEmbeddings()\n",
"vectordb = Chroma.from_documents(documents=documents, embedding=embedding)\n",
@@ -190,21 +190,21 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'query': 'What can tenants do with signage on their properties?',\n",
- " 'result': ' Tenants may place signs (digital or otherwise) or other form of identification on the premises after receiving written permission from the landlord which shall not be unreasonably withheld. The tenant is responsible for any damage caused to the premises and must conform to any applicable laws, ordinances, etc. governing the same. The tenant must also remove and clean any window or glass identification promptly upon vacating the premises.',\n",
- " 'source_documents': [Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage', 'id': 'v1bvgaozfkak', 'name': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC'}),\n",
- " Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk', 'id': 'g2fvhekmltza', 'name': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC'}),\n",
- " Document(page_content='Landlord , its agents, servants, employees, licensees, invitees, and contractors during the last year of the term of this Lease at any and all times during regular business hours, after 24 hour notice to tenant, to pass and repass on and through the Premises, or such portion thereof as may be necessary, in order that they or any of them may gain access to the Premises for the purpose of showing the Premises to potential new tenants or real estate brokers. In addition, Landlord shall be entitled to place a \"FOR RENT \" or \"FOR LEASE\" sign (not exceeding 8.5 ” x 11 ”) in the front window of the Premises during the last six months of the term of this Lease .', metadata={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42FLandlordSAccess-section/docset:_42FLandlordSAccess/docset:LandlordsRights/docset:Landlord', 'id': 'omvs4mysdk6b', 'name': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'Landlord', 'Landlord': 'BIRCH STREET , LLC', 'Tenant': 'Trutone Lane LLC'}),\n",
- " Document(page_content=\"24. SIGNS . No signage shall be placed by Tenant on any portion of the Project . However, Tenant shall be permitted to place a sign bearing its name in a location approved by Landlord near the entrance to the Premises (at Tenant's cost ) and will be furnished a single listing of its name in the Building's directory (at Landlord 's cost ), all in accordance with the criteria adopted from time to time by Landlord for the Project . Any changes or additional listings in the directory shall be furnished (subject to availability of space) for the then Building Standard charge .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:TheTerms/docset:Indemnification/docset:INDEMNIFICATION-section/docset:INDEMNIFICATION/docset:Waiver/docset:Waiver/docset:Signs/docset:SIGNS-section/docset:SIGNS', 'id': 'qkn9cyqsiuch', 'name': 'Shorebucks LLC_AZ.pdf', 'structure': 'div', 'tag': 'SIGNS', 'Landlord': 'Menlo Group', 'Tenant': 'Shorebucks LLC'})]}"
+ " 'result': ' Tenants can place or attach signs (digital or otherwise) or other forms of identification to their premises, as long as they receive written permission from the landlord and the signs or other forms of identification conform to all applicable laws and ordinances.',\n",
+ " 'source_documents': [Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'}),\n",
+ " Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'}),\n",
+ " Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk'}),\n",
+ " Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk'})]}"
]
},
- "execution_count": 7,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -227,23 +227,30 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 19,
"metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "query='rentable area' filter=Comparison(comparator=, attribute='Landlord', value='DHA Group') limit=None\n"
+ ]
+ },
{
"data": {
"text/plain": [
- "' 9,753 square feet'"
+ "' 13,500 square feet.'"
]
},
- "execution_count": 8,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain_response = qa_chain(\"What is rentable area for the property owned by DHA Group?\")\n",
- "chain_response[\"result\"] # the correct answer should be 13,500"
+ "chain_response[\"result\"]"
]
},
{
@@ -255,19 +262,19 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n",
- " Document(page_content='WITNESSES: LANDLORD: DHA Group , a Delaware limited liability company', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Guaranty-section/docset:Guaranty[2]/docset:SIGNATURESONNEXTPAGE-section/docset:INWITNESSWHEREOF-section/docset:INWITNESSWHEREOF/docset:Behalf/docset:Witnesses/xhtml:table/xhtml:tbody/xhtml:tr[3]/xhtml:td[2]/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n",
- " Document(page_content=\"1.16 Landlord 's Notice Address . DHA Group , Suite 1010 , 111 Bauer Dr , Oakland , New Jersey , 07436 , with a copy to the Building Management Office at the Project , Attention: On - Site Property Manager .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:NoticeAddress[2]/docset:LandlordsNoticeAddress-section/docset:LandlordsNoticeAddress[2]', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'LandlordsNoticeAddress', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'id': 'dsyfhh4vpeyf', 'name': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC'})]"
+ "[Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC', 'id': 'dsyfhh4vpeyf', 'source': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC', 'id': 'dsyfhh4vpeyf', 'source': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'})]"
]
},
- "execution_count": 9,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -287,7 +294,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -295,14 +302,14 @@
"text/plain": [
"{'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOfficeLeaseAgreement',\n",
" 'id': 'v1bvgaozfkak',\n",
- " 'name': 'TruTone Lane 2.docx',\n",
+ " 'source': 'TruTone Lane 2.docx',\n",
" 'structure': 'p',\n",
" 'tag': 'ThisOfficeLeaseAgreement',\n",
" 'Landlord': 'BUBBA CENTER PARTNERSHIP',\n",
" 'Tenant': 'Truetone Lane LLC'}"
]
},
- "execution_count": 10,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -322,17 +329,9 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 17,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Using embedded DuckDB without persistence: data will be transient\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"from langchain.chains.query_constructor.schema import AttributeInfo\n",
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
@@ -369,14 +368,22 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 18,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/root/Source/github/docugami.langchain/libs/langchain/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
+ " warnings.warn(\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "query='rentable area' filter=Comparison(comparator=, attribute='Landlord', value='DHA Group')\n"
+ "query='rentable area' filter=Comparison(comparator=, attribute='Landlord', value='DHA Group') limit=None\n"
]
},
{
@@ -384,13 +391,13 @@
"text/plain": [
"{'query': 'What is rentable area for the property owned by DHA Group?',\n",
" 'result': ' 13,500 square feet.',\n",
- " 'source_documents': [Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n",
- " Document(page_content='WITNESSES: LANDLORD: DHA Group , a Delaware limited liability company', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Guaranty-section/docset:Guaranty[2]/docset:SIGNATURESONNEXTPAGE-section/docset:INWITNESSWHEREOF-section/docset:INWITNESSWHEREOF/docset:Behalf/docset:Witnesses/xhtml:table/xhtml:tbody/xhtml:tr[3]/xhtml:td[2]/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n",
- " Document(page_content=\"1.16 Landlord 's Notice Address . DHA Group , Suite 1010 , 111 Bauer Dr , Oakland , New Jersey , 07436 , with a copy to the Building Management Office at the Project , Attention: On - Site Property Manager .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:NoticeAddress[2]/docset:LandlordsNoticeAddress-section/docset:LandlordsNoticeAddress[2]', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'LandlordsNoticeAddress', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'})]}"
+ " 'source_documents': [Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.11 Percentage Rent . (a) 55 % of Gross Revenue to Landlord until Landlord receives Percentage Rent in an amount equal to the Annual Market Rent Hurdle (as escalated); and', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'GrossRevenue', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent-section/docset:PercentageRent[2]/docset:PercentageRent/docset:GrossRevenue[1]/docset:GrossRevenue'})]}"
]
},
- "execution_count": 12,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -423,7 +430,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.3"
+ "version": "3.9.16"
}
},
"nbformat": 4,
diff --git a/libs/langchain/langchain/document_loaders/docugami.py b/libs/langchain/langchain/document_loaders/docugami.py
index b60326f24..ede8908d6 100644
--- a/libs/langchain/langchain/document_loaders/docugami.py
+++ b/libs/langchain/langchain/document_loaders/docugami.py
@@ -18,7 +18,7 @@ TABLE_NAME = "{http://www.w3.org/1999/xhtml}table"
XPATH_KEY = "xpath"
DOCUMENT_ID_KEY = "id"
-DOCUMENT_NAME_KEY = "name"
+DOCUMENT_SOURCE_KEY = "source"
STRUCTURE_KEY = "structure"
TAG_KEY = "tag"
PROJECTS_KEY = "projects"
@@ -146,7 +146,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
DOCUMENT_ID_KEY: document["id"],
- DOCUMENT_NAME_KEY: document["name"],
+ DOCUMENT_SOURCE_KEY: document["name"],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
}
@@ -349,7 +349,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
chunks += self._parse_dgml(
{
DOCUMENT_ID_KEY: path.name,
- DOCUMENT_NAME_KEY: path.name,
+ DOCUMENT_SOURCE_KEY: path.name,
},
file.read(),
)
From 4806504ebc73c6f311606cdca353708b02e64aea Mon Sep 17 00:00:00 2001
From: Taqi Jaffri
Date: Tue, 1 Aug 2023 15:43:26 -0700
Subject: [PATCH 002/143] Fixed one last key name
---
.../document_loaders/docugami.ipynb | 73 ++++++++-----------
.../langchain/document_loaders/docugami.py | 4 +-
2 files changed, 31 insertions(+), 46 deletions(-)
diff --git a/docs/extras/integrations/document_loaders/docugami.ipynb b/docs/extras/integrations/document_loaders/docugami.ipynb
index fb12a02b3..346df6096 100644
--- a/docs/extras/integrations/document_loaders/docugami.ipynb
+++ b/docs/extras/integrations/document_loaders/docugami.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 20,
"metadata": {
"tags": []
},
@@ -58,7 +58,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -77,7 +77,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -104,7 +104,7 @@
" Document(page_content='DOCUGAMI INC . : \\n\\n Caleb Divine : \\n\\n Signature: Signature: Name: \\n\\n Jean Paoli Name: Title: \\n\\n CEO Title:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:DocugamiInc/docset:DocugamiInc/xhtml:table', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': '', 'tag': 'table'})]"
]
},
- "execution_count": 9,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -141,7 +141,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -150,7 +150,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -176,7 +176,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -190,21 +190,21 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'query': 'What can tenants do with signage on their properties?',\n",
- " 'result': ' Tenants can place or attach signs (digital or otherwise) or other forms of identification to their premises, as long as they receive written permission from the landlord and the signs or other forms of identification conform to all applicable laws and ordinances.',\n",
- " 'source_documents': [Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'}),\n",
- " Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'}),\n",
+ " 'result': ' Tenants can place or attach signs (digital or otherwise) to their properties, after receiving written permission from their landlord. Any signs must conform to all applicable laws, ordinances, etc. governing the same.',\n",
+ " 'source_documents': [Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk'}),\n",
" Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk'}),\n",
- " Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk'})]}"
+ " Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'}),\n",
+ " Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'})]}"
]
},
- "execution_count": 13,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -227,23 +227,16 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 27,
"metadata": {},
"outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "query='rentable area' filter=Comparison(comparator=, attribute='Landlord', value='DHA Group') limit=None\n"
- ]
- },
{
"data": {
"text/plain": [
- "' 13,500 square feet.'"
+ "\" I don't know.\""
]
},
- "execution_count": 19,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@@ -262,19 +255,19 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC', 'id': 'dsyfhh4vpeyf', 'source': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC', 'id': 'dsyfhh4vpeyf', 'source': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'})]"
+ "[Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'}),\n",
+ " Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'}),\n",
+ " Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'}),\n",
+ " Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'})]"
]
},
- "execution_count": 15,
+ "execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@@ -294,7 +287,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -309,7 +302,7 @@
" 'Tenant': 'Truetone Lane LLC'}"
]
},
- "execution_count": 16,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@@ -329,7 +322,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -368,17 +361,9 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 31,
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/root/Source/github/docugami.langchain/libs/langchain/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
- " warnings.warn(\n"
- ]
- },
{
"name": "stdout",
"output_type": "stream",
@@ -390,14 +375,14 @@
"data": {
"text/plain": [
"{'query': 'What is rentable area for the property owned by DHA Group?',\n",
- " 'result': ' 13,500 square feet.',\n",
+ " 'result': ' The rentable area for the property owned by DHA Group is 13,500 square feet.',\n",
" 'source_documents': [Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
" Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
" Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
- " Document(page_content='1.11 Percentage Rent . (a) 55 % of Gross Revenue to Landlord until Landlord receives Percentage Rent in an amount equal to the Annual Market Rent Hurdle (as escalated); and', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'GrossRevenue', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent-section/docset:PercentageRent[2]/docset:PercentageRent/docset:GrossRevenue[1]/docset:GrossRevenue'})]}"
+ " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'})]}"
]
},
- "execution_count": 18,
+ "execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/libs/langchain/langchain/document_loaders/docugami.py b/libs/langchain/langchain/document_loaders/docugami.py
index ede8908d6..dabe51038 100644
--- a/libs/langchain/langchain/document_loaders/docugami.py
+++ b/libs/langchain/langchain/document_loaders/docugami.py
@@ -145,8 +145,8 @@ class DocugamiLoader(BaseLoader, BaseModel):
"""Create a Document from a node and text."""
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
- DOCUMENT_ID_KEY: document["id"],
- DOCUMENT_SOURCE_KEY: document["name"],
+ DOCUMENT_ID_KEY: document[DOCUMENT_ID_KEY],
+ DOCUMENT_SOURCE_KEY: document[DOCUMENT_SOURCE_KEY],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
}
From 5919c0f4a22f37d1d08429c8c2367dcc8cd95bb0 Mon Sep 17 00:00:00 2001
From: Taqi Jaffri
Date: Tue, 8 Aug 2023 21:38:55 -0700
Subject: [PATCH 003/143] notebook cleanup
---
.../document_loaders/docugami.ipynb | 88 +++++++++++--------
.../langchain/document_loaders/docugami.py | 5 +-
2 files changed, 53 insertions(+), 40 deletions(-)
diff --git a/docs/extras/integrations/document_loaders/docugami.ipynb b/docs/extras/integrations/document_loaders/docugami.ipynb
index 346df6096..d3f94a8d1 100644
--- a/docs/extras/integrations/document_loaders/docugami.ipynb
+++ b/docs/extras/integrations/document_loaders/docugami.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 1,
"metadata": {
"tags": []
},
@@ -58,7 +58,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -77,7 +77,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -104,7 +104,7 @@
" Document(page_content='DOCUGAMI INC . : \\n\\n Caleb Divine : \\n\\n Signature: Signature: Name: \\n\\n Jean Paoli Name: Title: \\n\\n CEO Title:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:DocugamiInc/docset:DocugamiInc/xhtml:table', 'id': '43rj0ds7s0ur', 'source': 'NDA simple layout.docx', 'structure': '', 'tag': 'table'})]"
]
},
- "execution_count": 22,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -141,7 +141,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -150,7 +150,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -176,7 +176,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -190,21 +190,21 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'query': 'What can tenants do with signage on their properties?',\n",
- " 'result': ' Tenants can place or attach signs (digital or otherwise) to their properties, after receiving written permission from their landlord. Any signs must conform to all applicable laws, ordinances, etc. governing the same.',\n",
- " 'source_documents': [Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk'}),\n",
- " Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk'}),\n",
- " Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'}),\n",
- " Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'})]}"
+ " 'result': \" Tenants can place or attach signs (digital or otherwise) to their premises with written permission from the landlord. The signs must conform to all applicable laws, ordinances, etc. governing the same. Tenants can also have their name listed in the building's directory at the landlord's cost.\",\n",
+ " 'source_documents': [Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Lease Date': 'April 24 \\n\\n ,', 'Lease Parties': 'This OFFICE LEASE AGREEMENT (this \"Lease\") is made and entered into by and between BUBBA CENTER PARTNERSHIP (\" Landlord \"), and Truetone Lane LLC , a Delaware limited liability company (\" Tenant \").', 'Tenant': 'Truetone Lane LLC', 'id': 'v1bvgaozfkak', 'source': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage'}),\n",
+ " Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'Landlord': 'GLORY ROAD LLC', 'Lease Date': 'April 30 , 2020', 'Lease Parties': 'This OFFICE LEASE AGREEMENT (this \"Lease\") is made and entered into by and between GLORY ROAD LLC (\" Landlord \"), and Truetone Lane LLC , a Delaware limited liability company (\" Tenant \").', 'Tenant': 'Truetone Lane LLC', 'id': 'g2fvhekmltza', 'source': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ArticleIiiUse/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:AnyTime/docset:Addition/dg:chunk'}),\n",
+ " Document(page_content='Landlord , its agents, servants, employees, licensees, invitees, and contractors during the last year of the term of this Lease at any and all times during regular business hours, after 24 hour notice to tenant, to pass and repass on and through the Premises, or such portion thereof as may be necessary, in order that they or any of them may gain access to the Premises for the purpose of showing the Premises to potential new tenants or real estate brokers. In addition, Landlord shall be entitled to place a \"FOR RENT \" or \"FOR LEASE\" sign (not exceeding 8.5 ” x 11 ”) in the front window of the Premises during the last six months of the term of this Lease .', metadata={'Landlord': 'BIRCH STREET , LLC', 'Lease Date': 'October 15 , 2021', 'Lease Parties': 'The provisions of this rider are hereby incorporated into and made a part of the Lease dated as of October 15 , 2021 between BIRCH STREET , LLC , having an address at c/o Birch Palace , 6 Grace Avenue Suite 200 , Great Neck , New York 11021 (\" Landlord \"), and Trutone Lane LLC , having an address at 4 Pearl Street , New York , New York 10012 (\" Tenant \") of Premises known as the ground floor space and lower level space, as per floor plan annexed hereto and made a part hereof as Exhibit A (“Premises”) at 4 Pearl Street , New York , New York 10012 in the City of New York , Borough of Manhattan , to which this rider is annexed. If there is any conflict between the provisions of this rider and the remainder of this Lease , the provisions of this rider shall govern.', 'Tenant': 'Trutone Lane LLC', 'id': 'omvs4mysdk6b', 'source': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'Landlord', 'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42FLandlordSAccess-section/docset:_42FLandlordSAccess/docset:LandlordsRights/docset:Landlord'}),\n",
+ " Document(page_content=\"24. SIGNS . No signage shall be placed by Tenant on any portion of the Project . However, Tenant shall be permitted to place a sign bearing its name in a location approved by Landlord near the entrance to the Premises (at Tenant's cost ) and will be furnished a single listing of its name in the Building's directory (at Landlord 's cost ), all in accordance with the criteria adopted from time to time by Landlord for the Project . Any changes or additional listings in the directory shall be furnished (subject to availability of space) for the then Building Standard charge .\", metadata={'Landlord': 'Perry & Blair LLC', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'dsyfhh4vpeyf', 'source': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'SIGNS', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:ThisLease-section/docset:ThisLease/docset:Guaranty-section/docset:Guaranty[2]/docset:TheTransfer/docset:TheTerms/docset:Indemnification/docset:INDEMNIFICATION-section/docset:INDEMNIFICATION/docset:Waiver/docset:Waiver/docset:Signs/docset:SIGNS-section/docset:SIGNS'})]}"
]
},
- "execution_count": 26,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -227,47 +227,47 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "\" I don't know.\""
+ "' 9,753 square feet.'"
]
},
- "execution_count": 27,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain_response = qa_chain(\"What is rentable area for the property owned by DHA Group?\")\n",
- "chain_response[\"result\"]"
+ "chain_response[\"result\"] # correct answer should be 13,500 sq ft"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "At first glance the answer may seem reasonable, but if you review the source chunks carefully for this answer, you will see that the chunking of the document did not end up putting the Landlord name and the rentable area in the same context, since they are far apart in the document. The retriever therefore ends up finding unrelated chunks from other documents not even related to the **Menlo Group** landlord. That landlord happens to be mentioned on the first page of the file **Shorebucks LLC_NJ.pdf** file, and while one of the source chunks used by the chain is indeed from that doc that contains the correct answer (**13,500**), other source chunks from different docs are included, and the answer is therefore incorrect."
+ "At first glance the answer may seem reasonable, but if you review the source chunks for this answer carefully, you will see that the chunking of the document did not put the Landlord name and the rentable area in the same context, since they are far apart in the document. The retriever therefore ends up finding unrelated chunks from other documents that are not even related to the **DHA Group** landlord. That landlord is mentioned on the first page of the **Shorebucks LLC_NJ.pdf** file, and while one of the source chunks used by the chain is indeed from that document, which contains the correct answer (**13,500**), other source chunks from different documents are included, and the answer is therefore incorrect."
]
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'}),\n",
- " Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'}),\n",
- " Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'}),\n",
- " Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'})]"
+ "[Document(page_content='1.1 Landlord . DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'Landlord': 'DHA Group', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup'}),\n",
+ " Document(page_content='WITNESSES: LANDLORD: DHA Group , a Delaware limited liability company', metadata={'Landlord': 'DHA Group', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'DhaGroup', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Guaranty-section/docset:Guaranty[2]/docset:SIGNATURESONNEXTPAGE-section/docset:INWITNESSWHEREOF-section/docset:INWITNESSWHEREOF/docset:Behalf/docset:Witnesses/xhtml:table/xhtml:tbody/xhtml:tr[3]/xhtml:td[2]/docset:DhaGroup'}),\n",
+ " Document(page_content=\"1.16 Landlord 's Notice Address . DHA Group , Suite 1010 , 111 Bauer Dr , Oakland , New Jersey , 07436 , with a copy to the Building Management Office at the Project , Attention: On - Site Property Manager .\", metadata={'Landlord': 'DHA Group', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'LandlordsNoticeAddress', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:NoticeAddress[2]/docset:LandlordsNoticeAddress-section/docset:LandlordsNoticeAddress[2]'}),\n",
+ " Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'Perry & Blair LLC', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'dsyfhh4vpeyf', 'source': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'})]"
]
},
- "execution_count": 28,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -287,22 +287,24 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOfficeLeaseAgreement',\n",
+ "{'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:LeaseParties',\n",
" 'id': 'v1bvgaozfkak',\n",
" 'source': 'TruTone Lane 2.docx',\n",
" 'structure': 'p',\n",
- " 'tag': 'ThisOfficeLeaseAgreement',\n",
+ " 'tag': 'LeaseParties',\n",
+ " 'Lease Date': 'April 24 \\n\\n ,',\n",
" 'Landlord': 'BUBBA CENTER PARTNERSHIP',\n",
- " 'Tenant': 'Truetone Lane LLC'}"
+ " 'Tenant': 'Truetone Lane LLC',\n",
+ " 'Lease Parties': 'This OFFICE LEASE AGREEMENT (this \"Lease\") is made and entered into by and between BUBBA CENTER PARTNERSHIP (\" Landlord \"), and Truetone Lane LLC , a Delaware limited liability company (\" Tenant \").'}"
]
},
- "execution_count": 29,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -322,7 +324,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -361,9 +363,17 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 12,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/root/Source/github/docugami.langchain/libs/langchain/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
+ " warnings.warn(\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
@@ -376,19 +386,21 @@
"text/plain": [
"{'query': 'What is rentable area for the property owned by DHA Group?',\n",
" 'result': ' The rentable area for the property owned by DHA Group is 13,500 square feet.',\n",
- " 'source_documents': [Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
- " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'})]}"
+ " 'source_documents': [Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'Landlord': 'DHA Group', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises'}),\n",
+ " Document(page_content='1.11 Percentage Rent . (a) 55 % of Gross Revenue to Landlord until Landlord receives Percentage Rent in an amount equal to the Annual Market Rent Hurdle (as escalated); and', metadata={'Landlord': 'DHA Group', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'GrossRevenue', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent-section/docset:PercentageRent[2]/docset:PercentageRent/docset:GrossRevenue[1]/docset:GrossRevenue'}),\n",
+ " Document(page_content='1.11 Percentage Rent . (a) 55 % of Gross Revenue to Landlord until Landlord receives Percentage Rent in an amount equal to the Annual Market Rent Hurdle (as escalated); and', metadata={'Landlord': 'DHA Group', 'Lease Date': 'March 29th , 2019', 'Lease Parties': 'THIS OFFICE LEASE (the \"Lease\") is made and entered into as of March 29th , 2019 , by and between Landlord and Tenant . \"Date of this Lease\" shall mean the date on which the last one of the Landlord and Tenant has signed this Lease .', 'Tenant': 'Shorebucks LLC', 'id': 'md8rieecquyv', 'source': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'GrossRevenue', 'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent/docset:PercentageRent-section/docset:PercentageRent[2]/docset:PercentageRent/docset:GrossRevenue[1]/docset:GrossRevenue'})]}"
]
},
- "execution_count": 31,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "qa_chain(\"What is rentable area for the property owned by DHA Group?\")"
+ "qa_chain(\n",
+ " \"What is rentable area for the property owned by DHA Group?\"\n",
+ ") # correct answer should be 13,500 sq ft"
]
},
{
diff --git a/libs/langchain/langchain/document_loaders/docugami.py b/libs/langchain/langchain/document_loaders/docugami.py
index dabe51038..af2c95f57 100644
--- a/libs/langchain/langchain/document_loaders/docugami.py
+++ b/libs/langchain/langchain/document_loaders/docugami.py
@@ -19,6 +19,7 @@ TABLE_NAME = "{http://www.w3.org/1999/xhtml}table"
XPATH_KEY = "xpath"
DOCUMENT_ID_KEY = "id"
DOCUMENT_SOURCE_KEY = "source"
+DOCUMENT_NAME_KEY = "name"
STRUCTURE_KEY = "structure"
TAG_KEY = "tag"
PROJECTS_KEY = "projects"
@@ -146,7 +147,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
DOCUMENT_ID_KEY: document[DOCUMENT_ID_KEY],
- DOCUMENT_SOURCE_KEY: document[DOCUMENT_SOURCE_KEY],
+ DOCUMENT_SOURCE_KEY: document[DOCUMENT_NAME_KEY],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
}
@@ -349,7 +350,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
chunks += self._parse_dgml(
{
DOCUMENT_ID_KEY: path.name,
- DOCUMENT_SOURCE_KEY: path.name,
+ DOCUMENT_NAME_KEY: path.name,
},
file.read(),
)
From 50b13ab9384932b9dd4aef64dd08150dbb6a5655 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Wed, 9 Aug 2023 13:26:09 -0700
Subject: [PATCH 004/143] wip
---
libs/langchain/langchain/chains/base.py | 8 +-
libs/langchain/langchain/schema/runnable.py | 158 ++++++------------
.../smith/evaluation/runner_utils.py | 8 +-
3 files changed, 60 insertions(+), 114 deletions(-)
diff --git a/libs/langchain/langchain/chains/base.py b/libs/langchain/langchain/chains/base.py
index 301b0143e..a490c5831 100644
--- a/libs/langchain/langchain/chains/base.py
+++ b/libs/langchain/langchain/chains/base.py
@@ -62,7 +62,9 @@ class Chain(Serializable, Runnable[Dict[str, Any], Dict[str, Any]], ABC):
config: Optional[RunnableConfig] = None,
**kwargs: Any,
) -> Dict[str, Any]:
- return self(input, **(config or {}), **kwargs)
+ _config: Dict[str, Any] = dict(config) if config else {}
+ _config.pop("_locals", None)
+ return self(input, **_config, **kwargs)
async def ainvoke(
self,
@@ -76,7 +78,9 @@ class Chain(Serializable, Runnable[Dict[str, Any], Dict[str, Any]], ABC):
None, partial(self.invoke, input, config, **kwargs)
)
- return await self.acall(input, **(config or {}), **kwargs)
+ _config: Dict[str, Any] = dict(config) if config else {}
+ _config.pop("_locals", None)
+ return await self.acall(input, **_config, **kwargs)
memory: Optional[BaseMemory] = None
"""Optional memory object. Defaults to None.
diff --git a/libs/langchain/langchain/schema/runnable.py b/libs/langchain/langchain/schema/runnable.py
index 8edafe459..eebd5a96a 100644
--- a/libs/langchain/langchain/schema/runnable.py
+++ b/libs/langchain/langchain/schema/runnable.py
@@ -3,6 +3,7 @@ from __future__ import annotations
import asyncio
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
from itertools import tee
from typing import (
Any,
@@ -66,6 +67,35 @@ class RunnableConfig(TypedDict, total=False):
Tags are passed to all callbacks, metadata is passed to handle*Start callbacks.
"""
+ _locals: Dict[str, Any]
+ """
+ Internal local variables; Chain.invoke/ainvoke drop this key before expanding the config into keyword arguments.
+ """
+
+
+def _empty_config() -> RunnableConfig:
+ return RunnableConfig(tags=[], metadata={}, callbacks=None, _locals={})
+
+
+def _get_callback_manager(config: Mapping) -> Any:
+ from langchain.callbacks.manager import CallbackManager
+
+ return CallbackManager.configure(
+ inheritable_callbacks=config.get("callbacks"),
+ inheritable_tags=config.get("tags"),
+ inheritable_metadata=config.get("metadata"),
+ )
+
+
+def _get_async_callback_manager(config: Mapping) -> Any:
+ from langchain.callbacks.manager import AsyncCallbackManager
+
+ return AsyncCallbackManager.configure(
+ inheritable_callbacks=config.get("callbacks"),
+ inheritable_tags=config.get("tags"),
+ inheritable_metadata=config.get("metadata"),
+ )
+
Input = TypeVar("Input")
# Output type should implement __concat__, as eg str, list, dict do
@@ -243,7 +273,7 @@ class Runnable(Generic[Input, Output], ABC):
return (
config
if isinstance(config, list)
- else [config.copy() if config is not None else {} for _ in range(length)]
+ else [deepcopy(config) if config is not None else {} for _ in range(length)]
)
def _call_with_config(
@@ -255,14 +285,8 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement invoke() in subclasses."""
- from langchain.callbacks.manager import CallbackManager
-
config = config or {}
- callback_manager = CallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- inheritable_tags=config.get("tags"),
- inheritable_metadata=config.get("metadata"),
- )
+ callback_manager = _get_callback_manager(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
input if isinstance(input, dict) else {"input": input},
@@ -288,14 +312,8 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement ainvoke() in subclasses."""
- from langchain.callbacks.manager import AsyncCallbackManager
-
config = config or {}
- callback_manager = AsyncCallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- inheritable_tags=config.get("tags"),
- inheritable_metadata=config.get("metadata"),
- )
+ callback_manager = _get_async_callback_manager(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
input if isinstance(input, dict) else {"input": input},
@@ -322,8 +340,6 @@ class Runnable(Generic[Input, Output], ABC):
"""Helper method to transform an Iterator of Input values into an Iterator of
Output values, with callbacks.
Use this to implement `stream()` or `transform()` in Runnable subclasses."""
- from langchain.callbacks.manager import CallbackManager
-
# tee the input so we can iterate over it twice
input_for_tracing, input_for_transform = tee(input, 2)
# Start the input iterator to ensure the input runnable starts before this one
@@ -333,11 +349,7 @@ class Runnable(Generic[Input, Output], ABC):
final_output_supported = True
config = config or {}
- callback_manager = CallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- inheritable_tags=config.get("tags"),
- inheritable_metadata=config.get("metadata"),
- )
+ callback_manager = _get_callback_manager(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
{"input": ""},
@@ -393,8 +405,6 @@ class Runnable(Generic[Input, Output], ABC):
"""Helper method to transform an Async Iterator of Input values into an Async
Iterator of Output values, with callbacks.
Use this to implement `astream()` or `atransform()` in Runnable subclasses."""
- from langchain.callbacks.manager import AsyncCallbackManager
-
# tee the input so we can iterate over it twice
input_for_tracing, input_for_transform = atee(input, 2)
# Start the input iterator to ensure the input runnable starts before this one
@@ -404,11 +414,7 @@ class Runnable(Generic[Input, Output], ABC):
final_output_supported = True
config = config or {}
- callback_manager = AsyncCallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- inheritable_tags=config.get("tags"),
- inheritable_metadata=config.get("metadata"),
- )
+ callback_manager = _get_async_callback_manager(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
{"input": ""},
@@ -473,19 +479,9 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
yield from self.fallbacks
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Output:
- from langchain.callbacks.manager import CallbackManager
-
# setup callbacks
config = config or {}
- callback_manager = CallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- local_callbacks=None,
- verbose=False,
- inheritable_tags=config.get("tags"),
- local_tags=None,
- inheritable_metadata=config.get("metadata"),
- local_metadata=None,
- )
+ callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -516,19 +512,9 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
async def ainvoke(
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
- from langchain.callbacks.manager import AsyncCallbackManager
-
# setup callbacks
config = config or {}
- callback_manager = AsyncCallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- local_callbacks=None,
- verbose=False,
- inheritable_tags=config.get("tags"),
- local_tags=None,
- inheritable_metadata=config.get("metadata"),
- local_metadata=None,
- )
+ callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -751,19 +737,9 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
)
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Output:
- from langchain.callbacks.manager import CallbackManager
-
# setup callbacks
- config = config or {}
- callback_manager = CallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- local_callbacks=None,
- verbose=False,
- inheritable_tags=config.get("tags"),
- local_tags=None,
- inheritable_metadata=config.get("metadata"),
- local_metadata=None,
- )
+ config = config or _empty_config()
+ callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -771,11 +747,12 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
# invoke all steps in sequence
try:
+ callbacks = run_manager.get_child()
for step in self.steps:
input = step.invoke(
input,
# mark each step as a child run
- _patch_config(config, run_manager.get_child()),
+ _patch_config(config, callbacks),
)
# finish the root run
except (KeyboardInterrupt, Exception) as e:
@@ -790,19 +767,9 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
async def ainvoke(
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
- from langchain.callbacks.manager import AsyncCallbackManager
-
# setup callbacks
config = config or {}
- callback_manager = AsyncCallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- local_callbacks=None,
- verbose=False,
- inheritable_tags=config.get("tags"),
- local_tags=None,
- inheritable_metadata=config.get("metadata"),
- local_metadata=None,
- )
+ callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -946,19 +913,9 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
def stream(
self, input: Input, config: Optional[RunnableConfig] = None
) -> Iterator[Output]:
- from langchain.callbacks.manager import CallbackManager
-
# setup callbacks
config = config or {}
- callback_manager = CallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- local_callbacks=None,
- verbose=False,
- inheritable_tags=config.get("tags"),
- local_tags=None,
- inheritable_metadata=config.get("metadata"),
- local_metadata=None,
- )
+ callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -1023,19 +980,9 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
async def astream(
self, input: Input, config: Optional[RunnableConfig] = None
) -> AsyncIterator[Output]:
- from langchain.callbacks.manager import AsyncCallbackManager
-
# setup callbacks
config = config or {}
- callback_manager = AsyncCallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- local_callbacks=None,
- verbose=False,
- inheritable_tags=config.get("tags"),
- local_tags=None,
- inheritable_metadata=config.get("metadata"),
- local_metadata=None,
- )
+ callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -1173,19 +1120,9 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
async def ainvoke(
self, input: Input, config: Optional[RunnableConfig] = None
) -> Dict[str, Any]:
- from langchain.callbacks.manager import AsyncCallbackManager
-
# setup callbacks
config = config or {}
- callback_manager = AsyncCallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- local_callbacks=None,
- verbose=False,
- inheritable_tags=config.get("tags"),
- local_tags=None,
- inheritable_metadata=config.get("metadata"),
- local_metadata=None,
- )
+ callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), {"input": input}
@@ -1464,10 +1401,11 @@ class RouterRunnable(
def _patch_config(
- config: RunnableConfig, callback_manager: BaseCallbackManager
+ config: RunnableConfig, callback_manager: BaseCallbackManager, _locals: Optional[Dict[str, Any]] = None
) -> RunnableConfig:
- config = config.copy()
+ config = deepcopy(config)
config["callbacks"] = callback_manager
+ config["_locals"] = _locals or {}
return config
diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index 5b3d5775c..be55f6f99 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -636,7 +636,9 @@ async def _arun_chain(
else:
output = await chain.acall(inputs_, callbacks=callbacks, tags=tags)
else:
- runnable_config = RunnableConfig(tags=tags or [], callbacks=callbacks)
+ runnable_config = RunnableConfig(
+ tags=tags or [], callbacks=callbacks, _locals={}
+ )
output = await chain.ainvoke(inputs_, config=runnable_config)
return output
@@ -957,7 +959,9 @@ def _run_chain(
else:
output = chain(inputs_, callbacks=callbacks, tags=tags)
else:
- runnable_config = RunnableConfig(tags=tags or [], callbacks=callbacks)
+ runnable_config = RunnableConfig(
+ tags=tags or [], callbacks=callbacks, _locals={}
+ )
output = chain.invoke(inputs_, config=runnable_config)
return output
From eb0134fbb3c728fb5c9180384276315f6318497b Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Wed, 9 Aug 2023 14:13:06 -0700
Subject: [PATCH 005/143] rfc
---
libs/langchain/langchain/chat_models/base.py | 8 ++-
libs/langchain/langchain/llms/base.py | 15 +++---
libs/langchain/langchain/schema/retriever.py | 8 ++-
libs/langchain/langchain/schema/runnable.py | 57 +++++++++++++++++---
4 files changed, 69 insertions(+), 19 deletions(-)
diff --git a/libs/langchain/langchain/chat_models/base.py b/libs/langchain/langchain/chat_models/base.py
index b06b99f99..3d343274f 100644
--- a/libs/langchain/langchain/chat_models/base.py
+++ b/libs/langchain/langchain/chat_models/base.py
@@ -103,12 +103,14 @@ class BaseChatModel(BaseLanguageModel[BaseMessageChunk], ABC):
stop: Optional[List[str]] = None,
**kwargs: Any,
) -> BaseMessageChunk:
+ _config: Dict[str, Any] = dict(config or {})
+ _config.pop("_locals", None)
return cast(
BaseMessageChunk,
cast(
ChatGeneration,
self.generate_prompt(
- [self._convert_input(input)], stop=stop, **(config or {}), **kwargs
+ [self._convert_input(input)], stop=stop, **_config, **kwargs
).generations[0][0],
).message,
)
@@ -127,8 +129,10 @@ class BaseChatModel(BaseLanguageModel[BaseMessageChunk], ABC):
None, partial(self.invoke, input, config, stop=stop, **kwargs)
)
+ _config: Dict[str, Any] = dict(config or {})
+ _config.pop("_locals", None)
llm_result = await self.agenerate_prompt(
- [self._convert_input(input)], stop=stop, **(config or {}), **kwargs
+ [self._convert_input(input)], stop=stop, **_config, **kwargs
)
return cast(
BaseMessageChunk, cast(ChatGeneration, llm_result.generations[0][0]).message
diff --git a/libs/langchain/langchain/llms/base.py b/libs/langchain/langchain/llms/base.py
index 7da494de7..044221247 100644
--- a/libs/langchain/langchain/llms/base.py
+++ b/libs/langchain/langchain/llms/base.py
@@ -219,13 +219,12 @@ class BaseLLM(BaseLanguageModel[str], ABC):
stop: Optional[List[str]] = None,
**kwargs: Any,
) -> str:
- return (
- self.generate_prompt(
- [self._convert_input(input)], stop=stop, **(config or {}), **kwargs
- )
- .generations[0][0]
- .text
+ _config: Dict[str, Any] = dict(config or {})
+ _config.pop("_locals", None)
+ result = self.generate_prompt(
+ [self._convert_input(input)], stop=stop, **_config, **kwargs
)
+ return result.generations[0][0].text
async def ainvoke(
self,
@@ -241,8 +240,10 @@ class BaseLLM(BaseLanguageModel[str], ABC):
None, partial(self.invoke, input, config, stop=stop, **kwargs)
)
+ _config: Dict[str, Any] = dict(config or {})
+ _config.pop("_locals", None)
llm_result = await self.agenerate_prompt(
- [self._convert_input(input)], stop=stop, **(config or {}), **kwargs
+ [self._convert_input(input)], stop=stop, **_config, **kwargs
)
return llm_result.generations[0][0].text
diff --git a/libs/langchain/langchain/schema/retriever.py b/libs/langchain/langchain/schema/retriever.py
index 9df3e7a13..538ae1ed1 100644
--- a/libs/langchain/langchain/schema/retriever.py
+++ b/libs/langchain/langchain/schema/retriever.py
@@ -107,7 +107,9 @@ class BaseRetriever(Serializable, Runnable[str, List[Document]], ABC):
def invoke(
self, input: str, config: Optional[RunnableConfig] = None
) -> List[Document]:
- return self.get_relevant_documents(input, **(config or {}))
+ _config: Dict[str, Any] = dict(config or {})
+ _config.pop("_locals", None)
+ return self.get_relevant_documents(input, **_config)
async def ainvoke(
self, input: str, config: Optional[RunnableConfig] = None
@@ -116,7 +118,9 @@ class BaseRetriever(Serializable, Runnable[str, List[Document]], ABC):
# If the retriever doesn't implement async, use default implementation
return await super().ainvoke(input, config)
- return await self.aget_relevant_documents(input, **(config or {}))
+ _config: Dict[str, Any] = dict(config or {})
+ _config.pop("_locals", None)
+ return await self.aget_relevant_documents(input, **_config)
@abstractmethod
def _get_relevant_documents(
diff --git a/libs/langchain/langchain/schema/runnable.py b/libs/langchain/langchain/schema/runnable.py
index eebd5a96a..47679c888 100644
--- a/libs/langchain/langchain/schema/runnable.py
+++ b/libs/langchain/langchain/schema/runnable.py
@@ -674,6 +674,46 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
raise first_error
+class PutLocalVar(Serializable, Runnable[Input, Input]):
+ key: Union[str, Dict[str, str]]
+
+ def __init__(self, key: str, **kwargs: Any) -> None:
+ super().__init__(key=key, **kwargs)
+
+ def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Input:
+ if config is None:
+ raise ValueError(
+ "PutLocalVar should only be used in a RunnableSequence, and should "
+ "therefore always receive a non-null config."
+ )
+ if isinstance(self.key, str):
+ config["_locals"][self.key] = input
+ else:
+ if not isinstance(input, Mapping):
+ raise ValueError
+ for get_key, put_key in self.key.items():
+ config["_locals"][put_key] = input[get_key]
+ return self._call_with_config(lambda x: x, input, config)
+
+
+class GetLocalVar(Serializable, Runnable[str, Any]):
+ key: str
+ passthrough_key: Optional[str] = None
+
+ def __init__(self, key: str, **kwargs: Any) -> None:
+ super().__init__(key=key, **kwargs)
+
+ def invoke(self, input: str, config: Optional[RunnableConfig] = None) -> Any:
+ if config is None:
+ raise ValueError(
+ "PutLocalVar should only be used in a RunnableSequence, and should "
+ "therefore always receive a non-null config."
+ )
+ if self.passthrough_key is not None:
+ return {self.key: config["_locals"][self.key], self.passthrough_key: input}
+ return config["_locals"][self.key]
+
+
class RunnableSequence(Serializable, Runnable[Input, Output]):
"""
A sequence of runnables, where the output of each is the input of the next.
@@ -749,11 +789,9 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
try:
callbacks = run_manager.get_child()
for step in self.steps:
- input = step.invoke(
- input,
- # mark each step as a child run
- _patch_config(config, callbacks),
- )
+ # mark each step as child run
+ step_config = _patch_config(config, callbacks)
+ input = step.invoke(input, step_config)
# finish the root run
except (KeyboardInterrupt, Exception) as e:
run_manager.on_chain_error(e)
@@ -1401,11 +1439,14 @@ class RouterRunnable(
def _patch_config(
- config: RunnableConfig, callback_manager: BaseCallbackManager, _locals: Optional[Dict[str, Any]] = None
+ config: RunnableConfig,
+ callback_manager: BaseCallbackManager,
+ _locals: Optional[Dict[str, Any]] = None,
) -> RunnableConfig:
- config = deepcopy(config)
+ config = config.copy()
config["callbacks"] = callback_manager
- config["_locals"] = _locals or {}
+ if _locals is not None:
+ config["_locals"] = _locals
return config
From 8c1a528c7150a4cc833cedb567b668c3ab17a745 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Thu, 17 Aug 2023 13:52:09 -0700
Subject: [PATCH 006/143] cr
---
.../langchain/schema/runnable/base.py | 40 -------------------
.../langchain/schema/runnable/passthrough.py | 5 ++-
2 files changed, 3 insertions(+), 42 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index 9354355fa..ee3f7c114 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -653,46 +653,6 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
raise first_error
-class PutLocalVar(Serializable, Runnable[Input, Input]):
- key: Union[str, Dict[str, str]]
-
- def __init__(self, key: str, **kwargs: Any) -> None:
- super().__init__(key=key, **kwargs)
-
- def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Input:
- if config is None:
- raise ValueError(
- "PutLocalVar should only be used in a RunnableSequence, and should "
- "therefore always receive a non-null config."
- )
- if isinstance(self.key, str):
- config["_locals"][self.key] = input
- else:
- if not isinstance(input, Mapping):
- raise ValueError
- for get_key, put_key in self.key.items():
- config["_locals"][put_key] = input[get_key]
- return self._call_with_config(lambda x: x, input, config)
-
-
-class GetLocalVar(Serializable, Runnable[str, Any]):
- key: str
- passthrough_key: Optional[str] = None
-
- def __init__(self, key: str, **kwargs: Any) -> None:
- super().__init__(key=key, **kwargs)
-
- def invoke(self, input: str, config: Optional[RunnableConfig] = None) -> Any:
- if config is None:
- raise ValueError(
- "PutLocalVar should only be used in a RunnableSequence, and should "
- "therefore always receive a non-null config."
- )
- if self.passthrough_key is not None:
- return {self.key: config["_locals"][self.key], self.passthrough_key: input}
- return config["_locals"][self.key]
-
-
class RunnableSequence(Serializable, Runnable[Input, Output]):
"""
A sequence of runnables, where the output of each is the input of the next.
diff --git a/libs/langchain/langchain/schema/runnable/passthrough.py b/libs/langchain/langchain/schema/runnable/passthrough.py
index a97e708b6..41a130aa7 100644
--- a/libs/langchain/langchain/schema/runnable/passthrough.py
+++ b/libs/langchain/langchain/schema/runnable/passthrough.py
@@ -41,7 +41,8 @@ class RunnablePassthrough(Serializable, Runnable[Input, Input]):
) -> Iterator[Input]:
return self._transform_stream_with_config(input, identity, config)
- def atransform(
+ async def atransform(
self, input: AsyncIterator[Input], config: RunnableConfig | None = None
) -> AsyncIterator[Input]:
- return self._atransform_stream_with_config(input, identity, config)
+ async for chunk in self._atransform_stream_with_config(input, identity, config):
+ yield chunk
From bd80cad6dbd045e36afe4be4071d1ef612ff9ea9 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Thu, 17 Aug 2023 13:52:19 -0700
Subject: [PATCH 007/143] add
---
.../langchain/schema/runnable/locals.py | 115 ++++++++++++++++++
1 file changed, 115 insertions(+)
create mode 100644 libs/langchain/langchain/schema/runnable/locals.py
diff --git a/libs/langchain/langchain/schema/runnable/locals.py b/libs/langchain/langchain/schema/runnable/locals.py
new file mode 100644
index 000000000..53d8f5a2c
--- /dev/null
+++ b/libs/langchain/langchain/schema/runnable/locals.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from typing import Any, AsyncIterator, Dict, Iterator, Mapping, Optional, Union
+
+from langchain.load.serializable import Serializable
+from langchain.schema.runnable import Runnable, RunnableConfig, RunnablePassthrough
+from langchain.schema.runnable.base import Input, Output
+
+
+class PutLocalVar(RunnablePassthrough):
+ key: Union[str, Mapping[str, str]]
+ """The key(s) to use for storing the input variable(s) in local state.
+
+ If a string is provided then the entire input is stored under that key. If a
+ Mapping is provided, then the map values are gotten from the input and
+ stored in local state under the map keys.
+ """
+
+ def __init__(self, key: str, **kwargs: Any) -> None:
+ super().__init__(key=key, **kwargs)
+
+ def _put(self, input: Input, *, config: Optional[RunnableConfig] = None) -> None:
+ if config is None:
+ raise ValueError(
+ "PutLocalVar should only be used in a RunnableSequence, and should "
+ "therefore always receive a non-null config."
+ )
+ if isinstance(self.key, str):
+ config["_locals"][self.key] = input
+ elif isinstance(input, Mapping):
+ for input_key, put_key in self.key.items():
+ config["_locals"][put_key] = input[input_key]
+ else:
+ raise TypeError(
+ f"`key` should be a string or Mapping[str, str], received type "
+ f"{(type(self.key))}."
+ )
+
+ def _concat_put(
+ self, input: Input, *, config: Optional[RunnableConfig] = None
+ ) -> None:
+ if config is None:
+ raise ValueError(
+ "PutLocalVar should only be used in a RunnableSequence, and should "
+ "therefore always receive a non-null config."
+ )
+ if isinstance(self.key, str):
+ if self.key not in config["_locals"]:
+ config["_locals"][self.key] = input
+ else:
+ config["_locals"][self.key] += input
+ elif isinstance(input, Mapping):
+ for input_key, put_key in self.key.items():
+ if put_key not in config["_locals"]:
+ config["_locals"][put_key] = input
+ else:
+ config["_locals"][put_key] += input
+ else:
+ raise TypeError(
+ f"`key` should be a string or Mapping[str, str], received type "
+ f"{(type(self.key))}."
+ )
+
+ def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Input:
+ self._put(input, config=config)
+ return super().invoke(input, config)
+
+ async def ainvoke(
+ self, input: Input, config: RunnableConfig | None = None
+ ) -> Input:
+ self._put(input, config=config)
+ return await super().ainvoke(input, config)
+
+ def transform(
+ self, input: Iterator[Input], config: RunnableConfig | None = None
+ ) -> Iterator[Input]:
+ for chunk in super().transform(input, config=config):
+ self._concat_put(input, config=config)
+ yield chunk
+
+ async def atransform(
+ self, input: AsyncIterator[Input], config: RunnableConfig | None = None
+ ) -> AsyncIterator[Input]:
+ async for chunk in super().atransform(input, config=config):
+ self._concat_put(input, config=config)
+ yield chunk
+
+
+class GetLocalVar(
+ Serializable, Runnable[Input, Union[Output, Dict[str, Union[Input, Output]]]]
+):
+ key: str
+ """The key to extract from the local state."""
+ passthrough_key: Optional[str] = None
+ """The key to use for passing through the invocation input.
+
+ If None, then only the value retrieved from local state is returned. Otherwise a
+ dictionary ``{self.key: <>, self.passthrough_key: <>}``
+ is returned.
+ """
+
+ def __init__(self, key: str, **kwargs: Any) -> None:
+ super().__init__(key=key, **kwargs)
+
+ def invoke(
+ self, input: Input, config: Optional[RunnableConfig] = None
+ ) -> Union[Output, Dict[str, Union[Input, Output]]]:
+ if config is None:
+ raise ValueError(
+ "PutLocalVar should only be used in a RunnableSequence, and should "
+ "therefore always receive a non-null config."
+ )
+ if self.passthrough_key is not None:
+ return {self.key: config["_locals"][self.key], self.passthrough_key: input}
+ return config["_locals"][self.key]
From c447e9a854deef90de62bad39991f1ea55a8f29b Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Thu, 17 Aug 2023 15:29:00 -0700
Subject: [PATCH 008/143] cr
---
.../langchain/schema/runnable/base.py | 8 ++--
.../langchain/schema/runnable/locals.py | 40 +++++++++++++++----
2 files changed, 37 insertions(+), 11 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index ee3f7c114..3f3e90ba2 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -243,8 +243,8 @@ class Runnable(Generic[Input, Output], ABC):
def _call_with_config(
self,
- func: Callable[[Input], Output],
- input: Input,
+ func: Callable[[Any], Output],
+ input: Any,
config: Optional[RunnableConfig],
run_type: Optional[str] = None,
) -> Output:
@@ -273,8 +273,8 @@ class Runnable(Generic[Input, Output], ABC):
async def _acall_with_config(
self,
- func: Callable[[Input], Awaitable[Output]],
- input: Input,
+ func: Callable[[Any], Awaitable[Output]],
+ input: Any,
config: Optional[RunnableConfig],
run_type: Optional[str] = None,
) -> Output:
diff --git a/libs/langchain/langchain/schema/runnable/locals.py b/libs/langchain/langchain/schema/runnable/locals.py
index 53d8f5a2c..cf51336dc 100644
--- a/libs/langchain/langchain/schema/runnable/locals.py
+++ b/libs/langchain/langchain/schema/runnable/locals.py
@@ -16,7 +16,7 @@ class PutLocalVar(RunnablePassthrough):
stored in local state under the map keys.
"""
- def __init__(self, key: str, **kwargs: Any) -> None:
+ def __init__(self, key: Union[str, Mapping[str, str]], **kwargs: Any) -> None:
super().__init__(key=key, **kwargs)
def _put(self, input: Input, *, config: Optional[RunnableConfig] = None) -> None:
@@ -63,13 +63,13 @@ class PutLocalVar(RunnablePassthrough):
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Input:
self._put(input, config=config)
- return super().invoke(input, config)
+ return super().invoke(input, config=config)
async def ainvoke(
self, input: Input, config: RunnableConfig | None = None
) -> Input:
self._put(input, config=config)
- return await super().ainvoke(input, config)
+ return await super().ainvoke(input, config=config)
def transform(
self, input: Iterator[Input], config: RunnableConfig | None = None
@@ -102,14 +102,40 @@ class GetLocalVar(
def __init__(self, key: str, **kwargs: Any) -> None:
super().__init__(key=key, **kwargs)
+ def _get(self, full_input: Dict) -> Union[Output, Dict[str, Union[Input, Output]]]:
+ if self.passthrough_key:
+ return {
+ self.key: full_input["locals"][self.key],
+ self.passthrough_key: full_input["input"],
+ }
+ else:
+ return full_input["locals"][self.key]
+
+ async def _aget(
+ self, full_input: Dict
+ ) -> Union[Output, Dict[str, Union[Input, Output]]]:
+ return self._get(full_input)
+
def invoke(
self, input: Input, config: Optional[RunnableConfig] = None
) -> Union[Output, Dict[str, Union[Input, Output]]]:
if config is None:
raise ValueError(
- "PutLocalVar should only be used in a RunnableSequence, and should "
+ "GetLocalVar should only be used in a RunnableSequence, and should "
"therefore always receive a non-null config."
)
- if self.passthrough_key is not None:
- return {self.key: config["_locals"][self.key], self.passthrough_key: input}
- return config["_locals"][self.key]
+
+ log_input = {"input": input, "locals": config["_locals"]}
+ return self._call_with_config(self._get, log_input, config)
+
+ async def ainvoke(
+ self, input: Input, config: Optional[RunnableConfig] = None
+ ) -> Union[Output, Dict[str, Union[Input, Output]]]:
+ if config is None:
+ raise ValueError(
+ "GetLocalVar should only be used in a RunnableSequence, and should "
+ "therefore always receive a non-null config."
+ )
+
+ log_input = {"input": input, "locals": config["_locals"]}
+ return await self._acall_with_config(self._aget, log_input, config)
From 6b0a849f5953b05eab530cbceffe5ab6b44c3a72 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Thu, 17 Aug 2023 16:22:12 -0700
Subject: [PATCH 009/143] fix
---
.../langchain/schema/runnable/__init__.py | 3 ++
.../langchain/schema/runnable/base.py | 11 +++++---
.../langchain/schema/runnable/locals.py | 28 +++++++++++++------
3 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/__init__.py b/libs/langchain/langchain/schema/runnable/__init__.py
index 0dbabd157..bae6aebb0 100644
--- a/libs/langchain/langchain/schema/runnable/__init__.py
+++ b/libs/langchain/langchain/schema/runnable/__init__.py
@@ -7,10 +7,13 @@ from langchain.schema.runnable.base import (
RunnableWithFallbacks,
)
from langchain.schema.runnable.config import RunnableConfig
+from langchain.schema.runnable.locals import GetLocalVar, PutLocalVar
from langchain.schema.runnable.passthrough import RunnablePassthrough
from langchain.schema.runnable.router import RouterInput, RouterRunnable
__all__ = [
+ "GetLocalVar",
+ "PutLocalVar",
"RouterInput",
"RouterRunnable",
"Runnable",
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index 3f3e90ba2..704a518cd 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -238,7 +238,10 @@ class Runnable(Generic[Input, Output], ABC):
return (
config
if isinstance(config, list)
- else [deepcopy(config) if config is not None else {} for _ in range(length)]
+ else [
+ deepcopy(config) if config is not None else _empty_config()
+ for _ in range(length)
+ ]
)
def _call_with_config(
@@ -750,7 +753,7 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
# setup callbacks
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
@@ -896,7 +899,7 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Iterator[Output]:
# setup callbacks
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
@@ -963,7 +966,7 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> AsyncIterator[Output]:
# setup callbacks
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
diff --git a/libs/langchain/langchain/schema/runnable/locals.py b/libs/langchain/langchain/schema/runnable/locals.py
index cf51336dc..65e63507b 100644
--- a/libs/langchain/langchain/schema/runnable/locals.py
+++ b/libs/langchain/langchain/schema/runnable/locals.py
@@ -3,8 +3,9 @@ from __future__ import annotations
from typing import Any, AsyncIterator, Dict, Iterator, Mapping, Optional, Union
from langchain.load.serializable import Serializable
-from langchain.schema.runnable import Runnable, RunnableConfig, RunnablePassthrough
-from langchain.schema.runnable.base import Input, Output
+from langchain.schema.runnable.base import Input, Output, Runnable
+from langchain.schema.runnable.config import RunnableConfig
+from langchain.schema.runnable.passthrough import RunnablePassthrough
class PutLocalVar(RunnablePassthrough):
@@ -27,7 +28,12 @@ class PutLocalVar(RunnablePassthrough):
)
if isinstance(self.key, str):
config["_locals"][self.key] = input
- elif isinstance(input, Mapping):
+ elif isinstance(self.key, Mapping):
+ if not isinstance(input, Mapping):
+ raise TypeError(
+ f"Received key of type Mapping but input of type {type(input)}. "
+ f"input is expected to be of type Mapping when key is Mapping."
+ )
for input_key, put_key in self.key.items():
config["_locals"][put_key] = input[input_key]
else:
@@ -44,17 +50,23 @@ class PutLocalVar(RunnablePassthrough):
"PutLocalVar should only be used in a RunnableSequence, and should "
"therefore always receive a non-null config."
)
+ print(config)
if isinstance(self.key, str):
if self.key not in config["_locals"]:
config["_locals"][self.key] = input
else:
config["_locals"][self.key] += input
- elif isinstance(input, Mapping):
+ elif isinstance(self.key, Mapping):
+ if not isinstance(input, Mapping):
+ raise TypeError(
+ f"Received key of type Mapping but input of type {type(input)}. "
+ f"input is expected to be of type Mapping when key is Mapping."
+ )
for input_key, put_key in self.key.items():
if put_key not in config["_locals"]:
- config["_locals"][put_key] = input
+ config["_locals"][put_key] = input[input_key]
else:
- config["_locals"][put_key] += input
+ config["_locals"][put_key] += input[input_key]
else:
raise TypeError(
f"`key` should be a string or Mapping[str, str], received type "
@@ -75,14 +87,14 @@ class PutLocalVar(RunnablePassthrough):
self, input: Iterator[Input], config: RunnableConfig | None = None
) -> Iterator[Input]:
for chunk in super().transform(input, config=config):
- self._concat_put(input, config=config)
+ self._concat_put(chunk, config=config)
yield chunk
async def atransform(
self, input: AsyncIterator[Input], config: RunnableConfig | None = None
) -> AsyncIterator[Input]:
async for chunk in super().atransform(input, config=config):
- self._concat_put(input, config=config)
+ self._concat_put(chunk, config=config)
yield chunk
From 9e906c39ba974ae33d596174873a173b505648e9 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Thu, 17 Aug 2023 16:22:22 -0700
Subject: [PATCH 010/143] nit
---
libs/langchain/langchain/schema/runnable/locals.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/libs/langchain/langchain/schema/runnable/locals.py b/libs/langchain/langchain/schema/runnable/locals.py
index 65e63507b..5061dbf38 100644
--- a/libs/langchain/langchain/schema/runnable/locals.py
+++ b/libs/langchain/langchain/schema/runnable/locals.py
@@ -50,7 +50,6 @@ class PutLocalVar(RunnablePassthrough):
"PutLocalVar should only be used in a RunnableSequence, and should "
"therefore always receive a non-null config."
)
- print(config)
if isinstance(self.key, str):
if self.key not in config["_locals"]:
config["_locals"][self.key] = input
From 6f69b19ff583a37387b8403f15fae7bfbcede4ba Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Thu, 17 Aug 2023 16:45:52 -0700
Subject: [PATCH 011/143] wip tests
---
.../unit_tests/schema/runnable/__init__.py | 0
.../unit_tests/schema/runnable/test_locals.py | 31 +++++++++++++++++++
.../schema/{ => runnable}/test_runnable.py | 0
3 files changed, 31 insertions(+)
create mode 100644 libs/langchain/tests/unit_tests/schema/runnable/__init__.py
create mode 100644 libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
rename libs/langchain/tests/unit_tests/schema/{ => runnable}/test_runnable.py (100%)
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/__init__.py b/libs/langchain/tests/unit_tests/schema/runnable/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
new file mode 100644
index 000000000..d0a3fb38d
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
@@ -0,0 +1,31 @@
+import pytest
+
+from langchain.schema.runnable import GetLocalVar, PutLocalVar
+
+
+@pytest.mark.asyncio
+async def test_put_get() -> None:
+ runnable = PutLocalVar("input") | GetLocalVar("input")
+ assert runnable.invoke("foo") == "foo"
+ assert runnable.batch(["foo", "bar"]) == ["foo", "bar"]
+ assert list(runnable.stream("foo"))[0] == "foo"
+
+ assert await runnable.ainvoke("foo") == "foo"
+ assert await runnable.abatch(["foo", "bar"]) == ["foo", "bar"]
+ async for x in runnable.astream("foo"):
+ assert x == "foo"
+
+
+def test_missing_config() -> None:
+ with pytest.raises(ValueError):
+ PutLocalVar("input").invoke("foo")
+
+ with pytest.raises(ValueError):
+ GetLocalVar("input").invoke("foo")
+
+
+def test_get_missing_var_invoke() -> None:
+ runnable = PutLocalVar("input") | GetLocalVar("missing")
+ with pytest.raises(KeyError):
+ runnable.invoke("foo")
+
diff --git a/libs/langchain/tests/unit_tests/schema/test_runnable.py b/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
similarity index 100%
rename from libs/langchain/tests/unit_tests/schema/test_runnable.py
rename to libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
From ab21af71be3c5a2fbe548061228df525c635ba86 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Thu, 17 Aug 2023 17:28:02 -0700
Subject: [PATCH 012/143] wip
---
.../langchain/schema/runnable/base.py | 18 +++++-----
.../unit_tests/schema/runnable/test_locals.py | 36 ++++++++++++++++++-
2 files changed, 44 insertions(+), 10 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index 704a518cd..c91456394 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -253,7 +253,7 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement invoke() in subclasses."""
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_callback_manager(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
@@ -283,7 +283,7 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement ainvoke() in subclasses."""
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_async_callback_manager(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
@@ -322,7 +322,7 @@ class Runnable(Generic[Input, Output], ABC):
final_output: Optional[Output] = None
final_output_supported = True
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_callback_manager(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
@@ -387,7 +387,7 @@ class Runnable(Generic[Input, Output], ABC):
final_output: Optional[Output] = None
final_output_supported = True
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_async_callback_manager(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
@@ -462,7 +462,7 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Output:
# setup callbacks
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
@@ -495,7 +495,7 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
# setup callbacks
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
@@ -1068,7 +1068,7 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
from langchain.callbacks.manager import CallbackManager
# setup callbacks
- config = config or {}
+ config = config or _empty_config()
callback_manager = CallbackManager.configure(
inheritable_callbacks=config.get("callbacks"),
local_callbacks=None,
@@ -1091,7 +1091,7 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
step.invoke,
input,
# mark each step as a child run
- patch_config(config, run_manager.get_child()),
+ patch_config(deepcopy(config), run_manager.get_child()),
)
for step in steps.values()
]
@@ -1108,7 +1108,7 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Dict[str, Any]:
# setup callbacks
- config = config or {}
+ config = config or _empty_config()
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
index d0a3fb38d..dce548fc6 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
+++ b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
@@ -1,6 +1,13 @@
import pytest
-from langchain.schema.runnable import GetLocalVar, PutLocalVar
+from langchain import PromptTemplate
+from langchain.llms import FakeListLLM
+from langchain.schema.runnable import (
+ GetLocalVar,
+ PutLocalVar,
+ RunnablePassthrough,
+ RunnableSequence,
+)
@pytest.mark.asyncio
@@ -29,3 +36,30 @@ def test_get_missing_var_invoke() -> None:
with pytest.raises(KeyError):
runnable.invoke("foo")
+
+def test_get_in_map() -> None:
+ runnable: RunnableSequence = PutLocalVar("input") | {"bar": GetLocalVar("input")}
+ assert runnable.invoke("foo") == {"bar": "foo"}
+
+
+def test_cant_put_in_map() -> None:
+ runnable: RunnableSequence = {"bar": PutLocalVar("input")} | GetLocalVar("input")
+ with pytest.raises(KeyError):
+ runnable.invoke("foo")
+
+
+def test_get_passthrough_key() -> None:
+ runnable = PutLocalVar("input") | GetLocalVar("input", passthrough_key="output")
+ assert runnable.invoke("foo") == {"input": "foo", "output": "foo"}
+
+
+def test_multi_step_sequence() -> None:
+ prompt = PromptTemplate.from_template("say {foo}")
+ runnable = (
+ PutLocalVar("foo")
+ | {"foo": RunnablePassthrough()}
+ | prompt
+ | FakeListLLM(responses=["bar"])
+ | GetLocalVar("foo", passthrough_key="output")
+ )
+ assert runnable.invoke("hello") == {"foo": "hello", "output": "bar"}
From 7fe474d19820e5ffd65d30f03446301a644d8a7c Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:02:11 +0100
Subject: [PATCH 013/143] Update snapshots
---
.../schema/{ => runnable}/__snapshots__/test_runnable.ambr | 1 +
1 file changed, 1 insertion(+)
rename libs/langchain/tests/unit_tests/schema/{ => runnable}/__snapshots__/test_runnable.ambr (99%)
diff --git a/libs/langchain/tests/unit_tests/schema/__snapshots__/test_runnable.ambr b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
similarity index 99%
rename from libs/langchain/tests/unit_tests/schema/__snapshots__/test_runnable.ambr
rename to libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
index 2d2872c14..4a59ae630 100644
--- a/libs/langchain/tests/unit_tests/schema/__snapshots__/test_runnable.ambr
+++ b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
@@ -1331,6 +1331,7 @@
"lc": 1,
"type": "not_implemented",
"id": [
+ "runnable",
"test_runnable",
"FakeRetriever"
]
From c1b1666ec850e465bf93bd01d34e09fc457076cc Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:02:29 +0100
Subject: [PATCH 014/143] Ensure config defaults apply even when a config is
passed in
---
.../langchain/schema/runnable/base.py | 36 +++++++++----------
.../schema/runnable/test_runnable.py | 18 +++++++---
2 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index c91456394..0d9df2bae 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -35,8 +35,11 @@ from langchain.schema.runnable.utils import (
from langchain.utils.aiter import atee, py_anext
-def _empty_config() -> RunnableConfig:
- return RunnableConfig(tags=[], metadata={}, callbacks=None, _locals={})
+def _ensure_config(config: Optional[RunnableConfig]) -> RunnableConfig:
+ empty = RunnableConfig(tags=[], metadata={}, callbacks=None, _locals={})
+ if config is not None:
+ empty.update(config)
+ return empty
def _get_callback_manager(config: Mapping) -> Any:
@@ -238,10 +241,7 @@ class Runnable(Generic[Input, Output], ABC):
return (
config
if isinstance(config, list)
- else [
- deepcopy(config) if config is not None else _empty_config()
- for _ in range(length)
- ]
+ else [deepcopy(_ensure_config(config)) for _ in range(length)]
)
def _call_with_config(
@@ -253,7 +253,7 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement invoke() in subclasses."""
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_callback_manager(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
@@ -283,7 +283,7 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement ainvoke() in subclasses."""
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_async_callback_manager(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
@@ -322,7 +322,7 @@ class Runnable(Generic[Input, Output], ABC):
final_output: Optional[Output] = None
final_output_supported = True
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_callback_manager(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
@@ -387,7 +387,7 @@ class Runnable(Generic[Input, Output], ABC):
final_output: Optional[Output] = None
final_output_supported = True
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_async_callback_manager(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
@@ -462,7 +462,7 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Output:
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
@@ -495,7 +495,7 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
@@ -724,7 +724,7 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Output:
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
@@ -753,7 +753,7 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
@@ -899,7 +899,7 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Iterator[Output]:
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_callback_manager(config)
# start the root run
run_manager = callback_manager.on_chain_start(
@@ -966,7 +966,7 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> AsyncIterator[Output]:
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
@@ -1068,7 +1068,7 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
from langchain.callbacks.manager import CallbackManager
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = CallbackManager.configure(
inheritable_callbacks=config.get("callbacks"),
local_callbacks=None,
@@ -1108,7 +1108,7 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Dict[str, Any]:
# setup callbacks
- config = config or _empty_config()
+ config = _ensure_config(config)
callback_manager = _get_async_callback_manager(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py b/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
index c0cae4d9b..8bfecb182 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
+++ b/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
@@ -134,8 +134,12 @@ async def test_default_method_implementations(mocker: MockerFixture) -> None:
assert fake.batch(["hello", "wooorld"], dict(tags=["a-tag"])) == [5, 7]
assert spy.call_args_list == [
- mocker.call("hello", dict(tags=["a-tag"])),
- mocker.call("wooorld", dict(tags=["a-tag"])),
+ mocker.call(
+ "hello", dict(tags=["a-tag"], metadata={}, callbacks=None, _locals={})
+ ),
+ mocker.call(
+ "wooorld", dict(tags=["a-tag"], metadata={}, callbacks=None, _locals={})
+ ),
]
spy.reset_mock()
@@ -156,8 +160,14 @@ async def test_default_method_implementations(mocker: MockerFixture) -> None:
7,
]
assert spy.call_args_list == [
- mocker.call("hello", dict(metadata={"key": "value"})),
- mocker.call("wooorld", dict(metadata={"key": "value"})),
+ mocker.call(
+ "hello",
+ dict(metadata={"key": "value"}, tags=[], callbacks=None, _locals={}),
+ ),
+ mocker.call(
+ "wooorld",
+ dict(metadata={"key": "value"}, tags=[], callbacks=None, _locals={}),
+ ),
]
From a5e7dcec61cdcaf2c075b5e83117ee3ab14e92c3 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:03:28 +0100
Subject: [PATCH 015/143] Lint
---
libs/langchain/tests/unit_tests/schema/runnable/test_locals.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
index dce548fc6..8f8755a96 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
+++ b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
@@ -28,7 +28,7 @@ def test_missing_config() -> None:
PutLocalVar("input").invoke("foo")
with pytest.raises(ValueError):
- GetLocalVar("input").invoke("foo")
+ GetLocalVar[str, str]("input").invoke("foo")
def test_get_missing_var_invoke() -> None:
From 8ddaaf3d4100ddbc6fc8e7fa9df39d6fc6a67c9a Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:10:35 +0100
Subject: [PATCH 016/143] Move config helpers
---
.../langchain/schema/runnable/base.py | 82 +++++++------------
.../langchain/schema/runnable/config.py | 28 ++++++-
2 files changed, 57 insertions(+), 53 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index 0d9df2bae..5a1d5b29e 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -28,40 +28,18 @@ from langchain.callbacks.base import BaseCallbackManager
from langchain.load.dump import dumpd
from langchain.load.serializable import Serializable
from langchain.pydantic_v1 import Field
-from langchain.schema.runnable.config import RunnableConfig
+from langchain.schema.runnable.config import (
+ RunnableConfig,
+ ensure_config,
+ get_async_callback_manager_for_config,
+ get_callback_manager_for_config,
+)
from langchain.schema.runnable.utils import (
gather_with_concurrency,
)
from langchain.utils.aiter import atee, py_anext
-def _ensure_config(config: Optional[RunnableConfig]) -> RunnableConfig:
- empty = RunnableConfig(tags=[], metadata={}, callbacks=None, _locals={})
- if config is not None:
- empty.update(config)
- return empty
-
-
-def _get_callback_manager(config: Mapping) -> Any:
- from langchain.callbacks.manager import CallbackManager
-
- return CallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- inheritable_tags=config.get("tags"),
- inheritable_metadata=config.get("metadata"),
- )
-
-
-def _get_async_callback_manager(config: Mapping) -> Any:
- from langchain.callbacks.manager import AsyncCallbackManager
-
- return AsyncCallbackManager.configure(
- inheritable_callbacks=config.get("callbacks"),
- inheritable_tags=config.get("tags"),
- inheritable_metadata=config.get("metadata"),
- )
-
-
Input = TypeVar("Input")
# Output type should implement __concat__, as eg str, list, dict do
Output = TypeVar("Output")
@@ -241,7 +219,7 @@ class Runnable(Generic[Input, Output], ABC):
return (
config
if isinstance(config, list)
- else [deepcopy(_ensure_config(config)) for _ in range(length)]
+ else [deepcopy(ensure_config(config)) for _ in range(length)]
)
def _call_with_config(
@@ -253,8 +231,8 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement invoke() in subclasses."""
- config = _ensure_config(config)
- callback_manager = _get_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_callback_manager_for_config(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
input if isinstance(input, dict) else {"input": input},
@@ -283,8 +261,8 @@ class Runnable(Generic[Input, Output], ABC):
) -> Output:
"""Helper method to transform an Input value to an Output value,
with callbacks. Use this method to implement ainvoke() in subclasses."""
- config = _ensure_config(config)
- callback_manager = _get_async_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_async_callback_manager_for_config(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
input if isinstance(input, dict) else {"input": input},
@@ -322,8 +300,8 @@ class Runnable(Generic[Input, Output], ABC):
final_output: Optional[Output] = None
final_output_supported = True
- config = _ensure_config(config)
- callback_manager = _get_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_callback_manager_for_config(config)
run_manager = callback_manager.on_chain_start(
dumpd(self),
{"input": ""},
@@ -387,8 +365,8 @@ class Runnable(Generic[Input, Output], ABC):
final_output: Optional[Output] = None
final_output_supported = True
- config = _ensure_config(config)
- callback_manager = _get_async_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_async_callback_manager_for_config(config)
run_manager = await callback_manager.on_chain_start(
dumpd(self),
{"input": ""},
@@ -462,8 +440,8 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Output:
# setup callbacks
- config = _ensure_config(config)
- callback_manager = _get_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_callback_manager_for_config(config)
# start the root run
run_manager = callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -495,8 +473,8 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
# setup callbacks
- config = _ensure_config(config)
- callback_manager = _get_async_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_async_callback_manager_for_config(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -724,8 +702,8 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Output:
# setup callbacks
- config = _ensure_config(config)
- callback_manager = _get_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_callback_manager_for_config(config)
# start the root run
run_manager = callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -753,8 +731,8 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Output:
# setup callbacks
- config = _ensure_config(config)
- callback_manager = _get_async_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_async_callback_manager_for_config(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -899,8 +877,8 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Iterator[Output]:
# setup callbacks
- config = _ensure_config(config)
- callback_manager = _get_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_callback_manager_for_config(config)
# start the root run
run_manager = callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -966,8 +944,8 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> AsyncIterator[Output]:
# setup callbacks
- config = _ensure_config(config)
- callback_manager = _get_async_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_async_callback_manager_for_config(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), input if isinstance(input, dict) else {"input": input}
@@ -1068,7 +1046,7 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
from langchain.callbacks.manager import CallbackManager
# setup callbacks
- config = _ensure_config(config)
+ config = ensure_config(config)
callback_manager = CallbackManager.configure(
inheritable_callbacks=config.get("callbacks"),
local_callbacks=None,
@@ -1108,8 +1086,8 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
self, input: Input, config: Optional[RunnableConfig] = None
) -> Dict[str, Any]:
# setup callbacks
- config = _ensure_config(config)
- callback_manager = _get_async_callback_manager(config)
+ config = ensure_config(config)
+ callback_manager = get_async_callback_manager_for_config(config)
# start the root run
run_manager = await callback_manager.on_chain_start(
dumpd(self), {"input": input}
diff --git a/libs/langchain/langchain/schema/runnable/config.py b/libs/langchain/langchain/schema/runnable/config.py
index f2bf28fcb..cd620077e 100644
--- a/libs/langchain/langchain/schema/runnable/config.py
+++ b/libs/langchain/langchain/schema/runnable/config.py
@@ -1,8 +1,9 @@
from __future__ import annotations
-from typing import Any, Dict, List, TypedDict
+from typing import Any, Dict, List, Optional, TypedDict
from langchain.callbacks.base import Callbacks
+from langchain.callbacks.manager import CallbackManager, AsyncCallbackManager
class RunnableConfig(TypedDict, total=False):
@@ -30,3 +31,28 @@ class RunnableConfig(TypedDict, total=False):
"""
Local variables
"""
+
+
+def ensure_config(config: Optional[RunnableConfig]) -> RunnableConfig:
+ empty = RunnableConfig(tags=[], metadata={}, callbacks=None, _locals={})
+ if config is not None:
+ empty.update(config)
+ return empty
+
+
+def get_callback_manager_for_config(config: RunnableConfig) -> CallbackManager:
+ return CallbackManager.configure(
+ inheritable_callbacks=config.get("callbacks"),
+ inheritable_tags=config.get("tags"),
+ inheritable_metadata=config.get("metadata"),
+ )
+
+
+def get_async_callback_manager_for_config(
+ config: RunnableConfig,
+) -> AsyncCallbackManager:
+ return AsyncCallbackManager.configure(
+ inheritable_callbacks=config.get("callbacks"),
+ inheritable_tags=config.get("tags"),
+ inheritable_metadata=config.get("metadata"),
+ )
From 46f3850794f5fc14477d5545c6d1edd6bbfeca1a Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:25:41 +0100
Subject: [PATCH 017/143] Lint
---
.../langchain/schema/runnable/base.py | 1 -
.../langchain/schema/runnable/config.py | 2 +-
.../langchain/schema/runnable/locals.py | 78 +++++++++----------
3 files changed, 39 insertions(+), 42 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index c0caa6d9a..1ca853174 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -52,7 +52,6 @@ from langchain.schema.runnable.utils import (
from langchain.utils.aiter import atee, py_anext
from langchain.utils.iter import safetee
-
Input = TypeVar("Input")
# Output type should implement __concat__, as eg str, list, dict do
Output = TypeVar("Output")
diff --git a/libs/langchain/langchain/schema/runnable/config.py b/libs/langchain/langchain/schema/runnable/config.py
index cd620077e..716fc3611 100644
--- a/libs/langchain/langchain/schema/runnable/config.py
+++ b/libs/langchain/langchain/schema/runnable/config.py
@@ -3,7 +3,7 @@ from __future__ import annotations
from typing import Any, Dict, List, Optional, TypedDict
from langchain.callbacks.base import Callbacks
-from langchain.callbacks.manager import CallbackManager, AsyncCallbackManager
+from langchain.callbacks.manager import AsyncCallbackManager, CallbackManager
class RunnableConfig(TypedDict, total=False):
diff --git a/libs/langchain/langchain/schema/runnable/locals.py b/libs/langchain/langchain/schema/runnable/locals.py
index 5061dbf38..6d668059e 100644
--- a/libs/langchain/langchain/schema/runnable/locals.py
+++ b/libs/langchain/langchain/schema/runnable/locals.py
@@ -2,6 +2,10 @@ from __future__ import annotations
from typing import Any, AsyncIterator, Dict, Iterator, Mapping, Optional, Union
+from langchain.callbacks.manager import (
+ AsyncCallbackManagerForChainRun,
+ CallbackManagerForChainRun,
+)
from langchain.load.serializable import Serializable
from langchain.schema.runnable.base import Input, Output, Runnable
from langchain.schema.runnable.config import RunnableConfig
@@ -20,30 +24,12 @@ class PutLocalVar(RunnablePassthrough):
def __init__(self, key: Union[str, Mapping[str, str]], **kwargs: Any) -> None:
super().__init__(key=key, **kwargs)
- def _put(self, input: Input, *, config: Optional[RunnableConfig] = None) -> None:
- if config is None:
- raise ValueError(
- "PutLocalVar should only be used in a RunnableSequence, and should "
- "therefore always receive a non-null config."
- )
- if isinstance(self.key, str):
- config["_locals"][self.key] = input
- elif isinstance(self.key, Mapping):
- if not isinstance(input, Mapping):
- raise TypeError(
- f"Received key of type Mapping but input of type {type(input)}. "
- f"input is expected to be of type Mapping when key is Mapping."
- )
- for input_key, put_key in self.key.items():
- config["_locals"][put_key] = input[input_key]
- else:
- raise TypeError(
- f"`key` should be a string or Mapping[str, str], received type "
- f"{(type(self.key))}."
- )
-
def _concat_put(
- self, input: Input, *, config: Optional[RunnableConfig] = None
+ self,
+ input: Input,
+ *,
+ config: Optional[RunnableConfig] = None,
+ replace: bool = False,
) -> None:
if config is None:
raise ValueError(
@@ -51,7 +37,7 @@ class PutLocalVar(RunnablePassthrough):
"therefore always receive a non-null config."
)
if isinstance(self.key, str):
- if self.key not in config["_locals"]:
+ if self.key not in config["_locals"] or replace:
config["_locals"][self.key] = input
else:
config["_locals"][self.key] += input
@@ -62,7 +48,7 @@ class PutLocalVar(RunnablePassthrough):
f"input is expected to be of type Mapping when key is Mapping."
)
for input_key, put_key in self.key.items():
- if put_key not in config["_locals"]:
+ if put_key not in config["_locals"] or replace:
config["_locals"][put_key] = input[input_key]
else:
config["_locals"][put_key] += input[input_key]
@@ -73,24 +59,30 @@ class PutLocalVar(RunnablePassthrough):
)
def invoke(self, input: Input, config: Optional[RunnableConfig] = None) -> Input:
- self._put(input, config=config)
+ self._concat_put(input, config=config, replace=True)
return super().invoke(input, config=config)
async def ainvoke(
- self, input: Input, config: RunnableConfig | None = None
+ self, input: Input, config: Optional[RunnableConfig] = None
) -> Input:
- self._put(input, config=config)
+ self._concat_put(input, config=config, replace=True)
return await super().ainvoke(input, config=config)
def transform(
- self, input: Iterator[Input], config: RunnableConfig | None = None
+ self,
+ input: Iterator[Input],
+ config: Optional[RunnableConfig] = None,
+ **kwargs: Optional[Any],
) -> Iterator[Input]:
for chunk in super().transform(input, config=config):
self._concat_put(chunk, config=config)
yield chunk
async def atransform(
- self, input: AsyncIterator[Input], config: RunnableConfig | None = None
+ self,
+ input: AsyncIterator[Input],
+ config: Optional[RunnableConfig] = None,
+ **kwargs: Optional[Any],
) -> AsyncIterator[Input]:
async for chunk in super().atransform(input, config=config):
self._concat_put(chunk, config=config)
@@ -113,19 +105,27 @@ class GetLocalVar(
def __init__(self, key: str, **kwargs: Any) -> None:
super().__init__(key=key, **kwargs)
- def _get(self, full_input: Dict) -> Union[Output, Dict[str, Union[Input, Output]]]:
+ def _get(
+ self,
+ input: Input,
+ run_manager: Union[CallbackManagerForChainRun, Any],
+ config: RunnableConfig,
+ ) -> Union[Output, Dict[str, Union[Input, Output]]]:
if self.passthrough_key:
return {
- self.key: full_input["locals"][self.key],
- self.passthrough_key: full_input["input"],
+ self.key: config["_locals"][self.key],
+ self.passthrough_key: input,
}
else:
- return full_input["locals"][self.key]
+ return config["_locals"][self.key]
async def _aget(
- self, full_input: Dict
+ self,
+ input: Input,
+ run_manager: AsyncCallbackManagerForChainRun,
+ config: RunnableConfig,
) -> Union[Output, Dict[str, Union[Input, Output]]]:
- return self._get(full_input)
+ return self._get(input, run_manager, config)
def invoke(
self, input: Input, config: Optional[RunnableConfig] = None
@@ -136,8 +136,7 @@ class GetLocalVar(
"therefore always receive a non-null config."
)
- log_input = {"input": input, "locals": config["_locals"]}
- return self._call_with_config(self._get, log_input, config)
+ return self._call_with_config(self._get, input, config)
async def ainvoke(
self, input: Input, config: Optional[RunnableConfig] = None
@@ -148,5 +147,4 @@ class GetLocalVar(
"therefore always receive a non-null config."
)
- log_input = {"input": input, "locals": config["_locals"]}
- return await self._acall_with_config(self._aget, log_input, config)
+ return await self._acall_with_config(self._aget, input, config)
From 1baedc4e1802fc13de49116ba6becd20c0860b71 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:28:39 +0100
Subject: [PATCH 018/143] Move patch_config
---
libs/langchain/langchain/schema/runnable/base.py | 14 +-------------
libs/langchain/langchain/schema/runnable/config.py | 11 ++++++++++-
2 files changed, 11 insertions(+), 14 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index 1ca853174..5fec1c86c 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -34,7 +34,6 @@ if TYPE_CHECKING:
)
-from langchain.callbacks.base import BaseCallbackManager
from langchain.load.dump import dumpd
from langchain.load.serializable import Serializable
from langchain.pydantic_v1 import Field
@@ -43,6 +42,7 @@ from langchain.schema.runnable.config import (
ensure_config,
get_async_callback_manager_for_config,
get_callback_manager_for_config,
+ patch_config,
)
from langchain.schema.runnable.utils import (
accepts_run_manager,
@@ -1472,18 +1472,6 @@ class RunnableBinding(Serializable, Runnable[Input, Output]):
yield item
-def patch_config(
- config: RunnableConfig,
- callback_manager: BaseCallbackManager,
- _locals: Optional[Dict[str, Any]] = None,
-) -> RunnableConfig:
- config = config.copy()
- config["callbacks"] = callback_manager
- if _locals is not None:
- config["_locals"] = _locals
- return config
-
-
def coerce_to_runnable(
thing: Union[
Runnable[Input, Output],
diff --git a/libs/langchain/langchain/schema/runnable/config.py b/libs/langchain/langchain/schema/runnable/config.py
index 716fc3611..00408b7ee 100644
--- a/libs/langchain/langchain/schema/runnable/config.py
+++ b/libs/langchain/langchain/schema/runnable/config.py
@@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Any, Dict, List, Optional, TypedDict
-from langchain.callbacks.base import Callbacks
+from langchain.callbacks.base import BaseCallbackManager, Callbacks
from langchain.callbacks.manager import AsyncCallbackManager, CallbackManager
@@ -40,6 +40,15 @@ def ensure_config(config: Optional[RunnableConfig]) -> RunnableConfig:
return empty
+def patch_config(
+ config: RunnableConfig,
+ callbacks: BaseCallbackManager,
+) -> RunnableConfig:
+ config = config.copy()
+ config["callbacks"] = callbacks
+ return config
+
+
def get_callback_manager_for_config(config: RunnableConfig) -> CallbackManager:
return CallbackManager.configure(
inheritable_callbacks=config.get("callbacks"),
From ddcb4ff5fb3f0ccf3871c2c86744fd8daa436435 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:30:42 +0100
Subject: [PATCH 019/143] Lint
---
libs/langchain/langchain/smith/evaluation/runner_utils.py | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index cc3c46dff..64139f95e 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -654,9 +654,7 @@ async def _arun_chain(
else:
output = await chain.acall(inputs_, callbacks=callbacks, tags=tags)
else:
- runnable_config = RunnableConfig(
- tags=tags or [], callbacks=callbacks, _locals={}
- )
+ runnable_config = RunnableConfig(tags=tags or [], callbacks=callbacks)
output = await chain.ainvoke(inputs_, config=runnable_config)
return output
@@ -977,9 +975,7 @@ def _run_chain(
else:
output = chain(inputs_, callbacks=callbacks, tags=tags)
else:
- runnable_config = RunnableConfig(
- tags=tags or [], callbacks=callbacks, _locals={}
- )
+ runnable_config = RunnableConfig(tags=tags or [], callbacks=callbacks)
output = chain.invoke(inputs_, config=runnable_config)
return output
From 6ae58da668f375d4bd5ae162fe21bf1f140ffc36 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 10:53:10 +0100
Subject: [PATCH 020/143] Assign defaults in batch calls
---
libs/langchain/langchain/schema/runnable/base.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index 5fec1c86c..fcba9c4c1 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -235,7 +235,7 @@ class Runnable(Generic[Input, Output], ABC):
)
return (
- config
+ list(map(ensure_config, config))
if isinstance(config, list)
else [deepcopy(ensure_config(config)) for _ in range(length)]
)
From d3f10d2f4f49c88747836f281a1651e696f11e20 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 11:36:16 +0100
Subject: [PATCH 021/143] Update test
---
.../tests/unit_tests/schema/runnable/test_runnable.py | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py b/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
index 4d02a07df..5d140d2ad 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
+++ b/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py
@@ -127,8 +127,13 @@ async def test_default_method_implementations(mocker: MockerFixture) -> None:
["hello", "wooorld"], [dict(tags=["a-tag"]), dict(metadata={"key": "value"})]
) == [5, 7]
assert spy.call_args_list == [
- mocker.call("hello", dict(tags=["a-tag"])),
- mocker.call("wooorld", dict(metadata={"key": "value"})),
+ mocker.call(
+ "hello", dict(tags=["a-tag"], metadata={}, callbacks=None, _locals={})
+ ),
+ mocker.call(
+ "wooorld",
+ dict(metadata={"key": "value"}, tags=[], callbacks=None, _locals={}),
+ ),
]
spy.reset_mock()
From 354c42afd20e9cf93ff1a6cd263b4372c5136b22 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Fri, 18 Aug 2023 15:30:30 +0100
Subject: [PATCH 022/143] Lint
---
libs/langchain/langchain/schema/runnable/locals.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/locals.py b/libs/langchain/langchain/schema/runnable/locals.py
index 6d668059e..755a709fc 100644
--- a/libs/langchain/langchain/schema/runnable/locals.py
+++ b/libs/langchain/langchain/schema/runnable/locals.py
@@ -63,7 +63,10 @@ class PutLocalVar(RunnablePassthrough):
return super().invoke(input, config=config)
async def ainvoke(
- self, input: Input, config: Optional[RunnableConfig] = None
+ self,
+ input: Input,
+ config: Optional[RunnableConfig] = None,
+ **kwargs: Optional[Any],
) -> Input:
self._concat_put(input, config=config, replace=True)
return await super().ainvoke(input, config=config)
@@ -139,7 +142,10 @@ class GetLocalVar(
return self._call_with_config(self._get, input, config)
async def ainvoke(
- self, input: Input, config: Optional[RunnableConfig] = None
+ self,
+ input: Input,
+ config: Optional[RunnableConfig] = None,
+ **kwargs: Optional[Any],
) -> Union[Output, Dict[str, Union[Input, Output]]]:
if config is None:
raise ValueError(
From 5cd244e9b7217824c01e271e30f81ef9b923d79c Mon Sep 17 00:00:00 2001
From: Taqi Jaffri
Date: Sat, 19 Aug 2023 13:48:15 -0700
Subject: [PATCH 023/143] CR feedback
---
.../integrations/document_loaders/docugami.ipynb | 12 ++----------
.../langchain/langchain/document_loaders/docugami.py | 1 +
2 files changed, 3 insertions(+), 10 deletions(-)
diff --git a/docs/extras/integrations/document_loaders/docugami.ipynb b/docs/extras/integrations/document_loaders/docugami.ipynb
index d3f94a8d1..607cf2b14 100644
--- a/docs/extras/integrations/document_loaders/docugami.ipynb
+++ b/docs/extras/integrations/document_loaders/docugami.ipynb
@@ -19,18 +19,10 @@
"metadata": {
"tags": []
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: lxml in /root/Source/github/docugami.langchain/libs/langchain/.venv/lib/python3.9/site-packages (4.9.3)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# You need the lxml package to use the DocugamiLoader\n",
- "!poetry run pip install lxml"
+ "!poetry run pip install lxml --quiet"
]
},
{
diff --git a/libs/langchain/langchain/document_loaders/docugami.py b/libs/langchain/langchain/document_loaders/docugami.py
index af2c95f57..cd05a9612 100644
--- a/libs/langchain/langchain/document_loaders/docugami.py
+++ b/libs/langchain/langchain/document_loaders/docugami.py
@@ -147,6 +147,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
DOCUMENT_ID_KEY: document[DOCUMENT_ID_KEY],
+ DOCUMENT_NAME_KEY: document[DOCUMENT_NAME_KEY],
DOCUMENT_SOURCE_KEY: document[DOCUMENT_NAME_KEY],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
From 069c0a041f95d2672b0eb4a707cc8d6e5a883005 Mon Sep 17 00:00:00 2001
From: Taqi Jaffri
Date: Sat, 19 Aug 2023 13:50:16 -0700
Subject: [PATCH 024/143] comment update for poetry install
---
docs/extras/integrations/document_loaders/docugami.ipynb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/extras/integrations/document_loaders/docugami.ipynb b/docs/extras/integrations/document_loaders/docugami.ipynb
index 607cf2b14..48db3ae81 100644
--- a/docs/extras/integrations/document_loaders/docugami.ipynb
+++ b/docs/extras/integrations/document_loaders/docugami.ipynb
@@ -21,7 +21,7 @@
},
"outputs": [],
"source": [
- "# You need the lxml package to use the DocugamiLoader\n",
+ "# You need the lxml package to use the DocugamiLoader (run pip install directly without \"poetry run\" if you are not using poetry)\n",
"!poetry run pip install lxml --quiet"
]
},
From 182b059bf4d6bfbbd3204a83985dbe90e9613285 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Mon, 21 Aug 2023 17:31:38 -0700
Subject: [PATCH 025/143] param
---
.../unit_tests/schema/runnable/test_locals.py | 84 +++++++++++--------
1 file changed, 49 insertions(+), 35 deletions(-)
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
index 8f8755a96..0430c03c8 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
+++ b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
@@ -1,3 +1,5 @@
+from typing import Any, Callable, Type
+
import pytest
from langchain import PromptTemplate
@@ -10,30 +12,42 @@ from langchain.schema.runnable import (
)
-@pytest.mark.asyncio
-async def test_put_get() -> None:
+@pytest.mark.parametrize(
+ ("method", "input", "output"),
+ [
+ (lambda r, x: r.invoke(x), "foo", "foo"),
+ (lambda r, x: r.batch(x), ["foo", "bar"], ["foo", "bar"]),
+ (lambda r, x: list(r.stream(x))[0], "foo", "foo"),
+ ],
+)
+def test_put_get(method: Callable, input: Any, output: Any) -> None:
runnable = PutLocalVar("input") | GetLocalVar("input")
- assert runnable.invoke("foo") == "foo"
- assert runnable.batch(["foo", "bar"]) == ["foo", "bar"]
- assert list(runnable.stream("foo"))[0] == "foo"
-
- assert await runnable.ainvoke("foo") == "foo"
- assert await runnable.abatch(["foo", "bar"]) == ["foo", "bar"]
- async for x in runnable.astream("foo"):
- assert x == "foo"
+ assert method(runnable, input) == output
-def test_missing_config() -> None:
- with pytest.raises(ValueError):
- PutLocalVar("input").invoke("foo")
-
- with pytest.raises(ValueError):
- GetLocalVar[str, str]("input").invoke("foo")
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+ ("method", "input", "output"),
+ [
+ (lambda r, x: r.ainvoke(x), "foo", "foo"),
+ (lambda r, x: r.abatch(x), ["foo", "bar"], ["foo", "bar"]),
+ ],
+)
+async def test_put_get_async(method: Callable, input: Any, output: Any) -> None:
+ runnable = PutLocalVar("input") | GetLocalVar("input")
+ assert await method(runnable, input) == output
-def test_get_missing_var_invoke() -> None:
- runnable = PutLocalVar("input") | GetLocalVar("missing")
- with pytest.raises(KeyError):
+@pytest.mark.parametrize(
+ ("runnable", "error"),
+ [
+ (PutLocalVar("input"), ValueError),
+ (GetLocalVar("input"), ValueError),
+ (PutLocalVar("input") | GetLocalVar("missing"), KeyError),
+ ],
+)
+def test_incorrect_usage(runnable: RunnableSequence, error: Type[Exception]) -> None:
+ with pytest.raises(error):
runnable.invoke("foo")
@@ -42,24 +56,24 @@ def test_get_in_map() -> None:
assert runnable.invoke("foo") == {"bar": "foo"}
-def test_cant_put_in_map() -> None:
+def test_put_in_map() -> None:
runnable: RunnableSequence = {"bar": PutLocalVar("input")} | GetLocalVar("input")
with pytest.raises(KeyError):
runnable.invoke("foo")
-def test_get_passthrough_key() -> None:
- runnable = PutLocalVar("input") | GetLocalVar("input", passthrough_key="output")
- assert runnable.invoke("foo") == {"input": "foo", "output": "foo"}
-
-
-def test_multi_step_sequence() -> None:
- prompt = PromptTemplate.from_template("say {foo}")
- runnable = (
- PutLocalVar("foo")
- | {"foo": RunnablePassthrough()}
- | prompt
- | FakeListLLM(responses=["bar"])
- | GetLocalVar("foo", passthrough_key="output")
- )
- assert runnable.invoke("hello") == {"foo": "hello", "output": "bar"}
+@pytest.mark.parametrize(
+ "runnable",
+ [
+ PutLocalVar("input") | GetLocalVar("input", passthrough_key="output"),
+ (
+ PutLocalVar("input")
+ | {"input": RunnablePassthrough()}
+ | PromptTemplate.from_template("say {input}")
+ | FakeListLLM(responses=["hello"])
+ | GetLocalVar("input", passthrough_key="output")
+ ),
+ ],
+)
+def test_put_get_sequence(runnable: RunnableSequence) -> None:
+ assert runnable.invoke("hello") == {"input": "hello", "output": "hello"}
From a9bf409a0900730e88d2f1ffd087c818137fe8df Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Mon, 21 Aug 2023 17:37:07 -0700
Subject: [PATCH 026/143] param
---
.../unit_tests/schema/runnable/test_locals.py | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
index 0430c03c8..ee07c0cfc 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
+++ b/libs/langchain/tests/unit_tests/schema/runnable/test_locals.py
@@ -75,5 +75,19 @@ def test_put_in_map() -> None:
),
],
)
-def test_put_get_sequence(runnable: RunnableSequence) -> None:
- assert runnable.invoke("hello") == {"input": "hello", "output": "hello"}
+@pytest.mark.parametrize(
+ ("method", "input", "output"),
+ [
+ (lambda r, x: r.invoke(x), "hello", {"input": "hello", "output": "hello"}),
+ (lambda r, x: r.batch(x), ["hello"], [{"input": "hello", "output": "hello"}]),
+ (
+ lambda r, x: list(r.stream(x))[0],
+ "hello",
+ {"input": "hello", "output": "hello"},
+ ),
+ ],
+)
+def test_put_get_sequence(
+ runnable: RunnableSequence, method: Callable, input: Any, output: Any
+) -> None:
+ assert method(runnable, input) == output
From 4e7e6bfe0a7bd15c4ccd72ed33fe1b35b47be3ef Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Mon, 21 Aug 2023 18:01:49 -0700
Subject: [PATCH 027/143] revert
---
libs/langchain/langchain/chains/base.py | 23 +++++++++++------
libs/langchain/langchain/chat_models/base.py | 20 +++++++++------
libs/langchain/langchain/llms/base.py | 27 ++++++++++++--------
libs/langchain/langchain/schema/retriever.py | 20 +++++++++------
4 files changed, 56 insertions(+), 34 deletions(-)
diff --git a/libs/langchain/langchain/chains/base.py b/libs/langchain/langchain/chains/base.py
index 8a49784f7..5a21dc6a6 100644
--- a/libs/langchain/langchain/chains/base.py
+++ b/libs/langchain/langchain/chains/base.py
@@ -63,10 +63,13 @@ class Chain(Serializable, Runnable[Dict[str, Any], Dict[str, Any]], ABC):
**kwargs: Any,
) -> Dict[str, Any]:
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
- return self(input, **config_kwargs, **kwargs)
+ return self(
+ input,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ **kwargs,
+ )
async def ainvoke(
self,
@@ -79,11 +82,15 @@ class Chain(Serializable, Runnable[Dict[str, Any], Dict[str, Any]], ABC):
return await asyncio.get_running_loop().run_in_executor(
None, partial(self.invoke, input, config, **kwargs)
)
+
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
- return await self.acall(input, **config_kwargs, **kwargs)
+ return await self.acall(
+ input,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ **kwargs,
+ )
memory: Optional[BaseMemory] = None
"""Optional memory object. Defaults to None.
diff --git a/libs/langchain/langchain/chat_models/base.py b/libs/langchain/langchain/chat_models/base.py
index d4c582c19..09199e30d 100644
--- a/libs/langchain/langchain/chat_models/base.py
+++ b/libs/langchain/langchain/chat_models/base.py
@@ -105,15 +105,17 @@ class BaseChatModel(BaseLanguageModel[BaseMessageChunk], ABC):
**kwargs: Any,
) -> BaseMessageChunk:
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
return cast(
BaseMessageChunk,
cast(
ChatGeneration,
self.generate_prompt(
- [self._convert_input(input)], stop=stop, **config_kwargs, **kwargs
+ [self._convert_input(input)],
+ stop=stop,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ **kwargs,
).generations[0][0],
).message,
)
@@ -133,11 +135,13 @@ class BaseChatModel(BaseLanguageModel[BaseMessageChunk], ABC):
)
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
llm_result = await self.agenerate_prompt(
- [self._convert_input(input)], stop=stop, **config_kwargs, **kwargs
+ [self._convert_input(input)],
+ stop=stop,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ **kwargs,
)
return cast(
BaseMessageChunk, cast(ChatGeneration, llm_result.generations[0][0]).message
diff --git a/libs/langchain/langchain/llms/base.py b/libs/langchain/langchain/llms/base.py
index 401fe61d0..a833487ff 100644
--- a/libs/langchain/langchain/llms/base.py
+++ b/libs/langchain/langchain/llms/base.py
@@ -220,13 +220,18 @@ class BaseLLM(BaseLanguageModel[str], ABC):
**kwargs: Any,
) -> str:
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
- result = self.generate_prompt(
- [self._convert_input(input)], stop=stop, **config_kwargs, **kwargs
+ return (
+ self.generate_prompt(
+ [self._convert_input(input)],
+ stop=stop,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ **kwargs,
+ )
+ .generations[0][0]
+ .text
)
- return result.generations[0][0].text
async def ainvoke(
self,
@@ -243,11 +248,13 @@ class BaseLLM(BaseLanguageModel[str], ABC):
)
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
llm_result = await self.agenerate_prompt(
- [self._convert_input(input)], stop=stop, **config_kwargs, **kwargs
+ [self._convert_input(input)],
+ stop=stop,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ **kwargs,
)
return llm_result.generations[0][0].text
diff --git a/libs/langchain/langchain/schema/retriever.py b/libs/langchain/langchain/schema/retriever.py
index 55a1acb08..5da50e149 100644
--- a/libs/langchain/langchain/schema/retriever.py
+++ b/libs/langchain/langchain/schema/retriever.py
@@ -108,10 +108,12 @@ class BaseRetriever(Serializable, Runnable[str, List[Document]], ABC):
self, input: str, config: Optional[RunnableConfig] = None
) -> List[Document]:
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
- return self.get_relevant_documents(input, **config_kwargs)
+ return self.get_relevant_documents(
+ input,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ )
async def ainvoke(
self,
@@ -124,10 +126,12 @@ class BaseRetriever(Serializable, Runnable[str, List[Document]], ABC):
return await super().ainvoke(input, config)
config = config or {}
- config_kwargs: Dict = {
- k: config.get(k) for k in ("callbacks", "tags", "metadata")
- }
- return await self.aget_relevant_documents(input, **config_kwargs)
+ return await self.aget_relevant_documents(
+ input,
+ callbacks=config.get("callbacks"),
+ tags=config.get("tags"),
+ metadata=config.get("metadata"),
+ )
@abstractmethod
def _get_relevant_documents(
From 033b874701ac7936e807ae34ae68e829c4a67e6b Mon Sep 17 00:00:00 2001
From: Julien Salinas
Date: Tue, 22 Aug 2023 09:26:37 +0200
Subject: [PATCH 028/143] Remove some deprecated text generation parameters.
---
libs/langchain/langchain/llms/nlpcloud.py | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/libs/langchain/langchain/llms/nlpcloud.py b/libs/langchain/langchain/llms/nlpcloud.py
index 1420595ee..d908e374e 100644
--- a/libs/langchain/langchain/llms/nlpcloud.py
+++ b/libs/langchain/langchain/llms/nlpcloud.py
@@ -28,8 +28,6 @@ class NLPCloud(LLM):
"""Language to use (multilingual addon)"""
temperature: float = 0.7
"""What sampling temperature to use."""
- min_length: int = 1
- """The minimum number of tokens to generate in the completion."""
max_length: int = 256
"""The maximum number of tokens to generate in the completion."""
length_no_input: bool = True
@@ -46,14 +44,8 @@ class NLPCloud(LLM):
"""The number of highest probability tokens to keep for top-k filtering."""
repetition_penalty: float = 1.0
"""Penalizes repeated tokens. 1.0 means no penalty."""
- length_penalty: float = 1.0
- """Exponential penalty to the length."""
- do_sample: bool = True
- """Whether to use sampling (True) or greedy decoding."""
num_beams: int = 1
"""Number of beams for beam search."""
- early_stopping: bool = False
- """Whether to stop beam search at num_beams sentences."""
num_return_sequences: int = 1
"""How many completions to generate for each prompt."""
@@ -91,7 +83,6 @@ class NLPCloud(LLM):
"""Get the default parameters for calling NLPCloud API."""
return {
"temperature": self.temperature,
- "min_length": self.min_length,
"max_length": self.max_length,
"length_no_input": self.length_no_input,
"remove_input": self.remove_input,
@@ -100,10 +91,7 @@ class NLPCloud(LLM):
"top_p": self.top_p,
"top_k": self.top_k,
"repetition_penalty": self.repetition_penalty,
- "length_penalty": self.length_penalty,
- "do_sample": self.do_sample,
"num_beams": self.num_beams,
- "early_stopping": self.early_stopping,
"num_return_sequences": self.num_return_sequences,
}
From 4d0b7bb8e16b51d450da174559171abcdcce0a31 Mon Sep 17 00:00:00 2001
From: Julien Salinas
Date: Tue, 22 Aug 2023 09:28:22 +0200
Subject: [PATCH 029/143] Remove Dolphin and GPT-J from the embeddings docs.
These models are no longer offered.
---
.../integrations/text_embedding/nlp_cloud.ipynb | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/docs/extras/integrations/text_embedding/nlp_cloud.ipynb b/docs/extras/integrations/text_embedding/nlp_cloud.ipynb
index 6cf97d943..73ae71fe0 100644
--- a/docs/extras/integrations/text_embedding/nlp_cloud.ipynb
+++ b/docs/extras/integrations/text_embedding/nlp_cloud.ipynb
@@ -9,13 +9,9 @@
"\n",
"NLP Cloud is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. \n",
"\n",
- "The [embeddings](https://docs.nlpcloud.com/#embeddings) endpoint offers several models:\n",
+ "The [embeddings](https://docs.nlpcloud.com/#embeddings) endpoint offers the following model:\n",
"\n",
- "* `paraphrase-multilingual-mpnet-base-v2`: Paraphrase Multilingual MPNet Base V2 is a very fast model based on Sentence Transformers that is perfectly suited for embeddings extraction in more than 50 languages (see the full list here).\n",
- "\n",
- "* `gpt-j`: GPT-J returns advanced embeddings. It might return better results than Sentence Transformers based models (see above) but it is also much slower.\n",
- "\n",
- "* `dolphin`: Dolphin returns advanced embeddings. It might return better results than Sentence Transformers based models (see above) but it is also much slower. It natively understands the following languages: Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, French, German, Hungarian, Italian, Japanese, Polish, Portuguese, Romanian, Russian, Serbian, Slovenian, Spanish, Swedish, and Ukrainian."
+ "* `paraphrase-multilingual-mpnet-base-v2`: Paraphrase Multilingual MPNet Base V2 is a very fast model based on Sentence Transformers that is perfectly suited for embeddings extraction in more than 50 languages (see the full list here)."
]
},
{
@@ -84,7 +80,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3.11.2 64-bit",
"language": "python",
"name": "python3"
},
@@ -98,7 +94,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.16"
+ "version": "3.11.2"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+ }
}
},
"nbformat": 4,
From 3c4f32c8b8c8ef6ff35e97d4b5014d4f4d2ea084 Mon Sep 17 00:00:00 2001
From: Aashish Saini
<141953346+AashishSainiShorthillsAI@users.noreply.github.com>
Date: Tue, 22 Aug 2023 20:04:05 +0530
Subject: [PATCH 030/143] Replacing Exception type from ValueError to
ImportError (#9588)
I have restructured the code to ensure uniform handling of ImportError.
In place of previously used ValueError, I've adopted the standard
practice of raising ImportError with explanatory messages. This
modification enhances code readability and clarifies that any problems
stem from module importation.
@eyurtsev , @baskaryan
Thanks
---
libs/langchain/langchain/vectorstores/meilisearch.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/langchain/langchain/vectorstores/meilisearch.py b/libs/langchain/langchain/vectorstores/meilisearch.py
index cb9f5d984..313ae6816 100644
--- a/libs/langchain/langchain/vectorstores/meilisearch.py
+++ b/libs/langchain/langchain/vectorstores/meilisearch.py
@@ -21,7 +21,7 @@ def _create_client(
try:
import meilisearch
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import meilisearch python package. "
"Please install it with `pip install meilisearch`."
)
From fba29f203adfe35d14a03511c88661ddd1c76263 Mon Sep 17 00:00:00 2001
From: toddkim95 <42592581+toddkim95@users.noreply.github.com>
Date: Tue, 22 Aug 2023 23:36:24 +0900
Subject: [PATCH 031/143] Add support for polars (#9610)
### Description
Polars is a DataFrame interface on top of an OLAP Query Engine
implemented in Rust.
Polars is faster to read than pandas, so I'm looking forward to seeing
it added to the document loader.
### Dependencies
polars (https://pola-rs.github.io/polars-book/user-guide/)
---------
Co-authored-by: Bagatur
---
.../document_loaders/polars_dataframe.ipynb | 225 ++++++++++++++++++
.../langchain/document_loaders/__init__.py | 2 +
.../langchain/document_loaders/dataframe.py | 39 ++-
.../document_loaders/polars_dataframe.py | 32 +++
.../langchain/document_loaders/xorbits.py | 22 +-
.../document_loaders/test_polars_dataframe.py | 48 ++++
6 files changed, 339 insertions(+), 29 deletions(-)
create mode 100644 docs/extras/integrations/document_loaders/polars_dataframe.ipynb
create mode 100644 libs/langchain/langchain/document_loaders/polars_dataframe.py
create mode 100644 libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py
diff --git a/docs/extras/integrations/document_loaders/polars_dataframe.ipynb b/docs/extras/integrations/document_loaders/polars_dataframe.ipynb
new file mode 100644
index 000000000..52936f165
--- /dev/null
+++ b/docs/extras/integrations/document_loaders/polars_dataframe.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "213a38a2",
+ "metadata": {},
+ "source": [
+ "# Polars DataFrame\n",
+ "\n",
+ "This notebook goes over how to load data from a [polars](https://pola-rs.github.io/polars-book/user-guide/) DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "f6a7a9e4-80d6-486a-b2e3-636c568aa97c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install polars"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "79331964",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import polars as pl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "e487044c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pl.read_csv(\"example_data/mlb_teams_2012.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "ac273ca1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
- " \n",
- " \n",
- "\n",
- " \n",
- " \n",
- " \n",
- " "
- ],
- "text/plain": [
- "AtlasProject: <{'id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'owner': '9c29afbb-a002-4d49-958e-ecf5ae1351ac', 'project_name': 'test_index_1677255228.136989', 'creator': 'auth0|63efc4b5462246f4d9a6ecf2', 'description': 'A description for your project', 'opensearch_index_id': 'f61fb8dd-0abf-4f31-9130-41870e443902', 'is_public': True, 'project_fields': ['atlas_id', 'text'], 'unique_id_field': 'atlas_id', 'modality': 'text', 'total_datums_in_project': 508, 'created_timestamp': '2023-02-24T16:13:50.313363+00:00', 'atlas_indices': [{'id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'project_id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'index_name': 'test_index_1677255228.136989_index', 'indexed_field': 'text', 'created_timestamp': '2023-02-24T16:13:52.957101+00:00', 'updated_timestamp': '2023-02-24T16:14:03.469621+00:00', 'atoms': ['charchunk', 'document'], 'colorable_fields': [], 'embedders': [{'id': '7ec0868a-4eed-4414-a482-25cce9803e1b', 'atlas_index_id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'ready': True, 'model_name': 'NomicEmbed', 'hyperparameters': {'norm': 'both', 'batch_size': 20, 'polymerize_by': 'charchunk', 'dataset_buffer_size': 1000}}], 'nearest_neighbor_indices': [{'id': '86f8e3ff-e07c-4678-a4d7-144db4b0301d', 'index_name': 'NomicOrganize', 'ready': True, 'hyperparameters': {'dim': 384, 'space': 'l2'}, 'atom_strategies': ['document']}], 'projections': [{'id': 'db996d77-8981-48a0-897a-ff2c22bbf541', 'projection_name': 'NomicProject', 'ready': True, 'hyperparameters': {'spread': 1.0, 'n_epochs': 50, 'n_neighbors': 15}, 'atom_strategies': ['document'], 'created_timestamp': '2023-02-24T16:13:52.979561+00:00', 'updated_timestamp': '2023-02-24T16:14:03.466309+00:00'}]}], 'insert_update_delete_lock': False}>"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"db.project"
]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here is a map with the result of this code. This map displays the texts of the State of the Union.\n",
+ "https://atlas.nomic.ai/map/3e4de075-89ff-486a-845c-36c23f30bb67/d8ce2284-8edb-4050-8b9b-9bb543d7f647"
+ ]
}
],
"metadata": {
From a9c86774daa784f71b8daf849ae950b8ff5a5a27 Mon Sep 17 00:00:00 2001
From: Joshua Sundance Bailey
<84336755+joshuasundance-swca@users.noreply.github.com>
Date: Wed, 23 Aug 2023 21:23:21 -0400
Subject: [PATCH 096/143] Anthropic: Allow the use of kwargs consistent with
ChatOpenAI. (#9515)
- Description: ~~Creates a new root_validator in `_AnthropicCommon` that
allows the use of `model_name` and `max_tokens` keyword arguments.~~
Adds pydantic field aliases to support `model_name` and `max_tokens` as
keyword arguments. Ultimately, this makes `ChatAnthropic` more
consistent with `ChatOpenAI`, making the two classes more
interchangeable for the developer.
- Issue: https://github.com/langchain-ai/langchain/issues/9510
---------
Co-authored-by: Bagatur
---
libs/langchain/langchain/chat_models/anthropic.py | 6 ++++++
libs/langchain/langchain/llms/anthropic.py | 11 +++++++++--
.../chat_models/test_anthropic_2.py | 12 ++++++++++++
.../tests/integration_tests/llms/test_anthropic.py | 14 +++++++++++++-
.../tests/unit_tests/chat_models/test_openai.py | 12 +++++++++---
5 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/libs/langchain/langchain/chat_models/anthropic.py b/libs/langchain/langchain/chat_models/anthropic.py
index ef1da6319..4d00eae4d 100644
--- a/libs/langchain/langchain/chat_models/anthropic.py
+++ b/libs/langchain/langchain/chat_models/anthropic.py
@@ -36,6 +36,12 @@ class ChatAnthropic(BaseChatModel, _AnthropicCommon):
model = ChatAnthropic(model="", anthropic_api_key="my-api-key")
"""
+ class Config:
+ """Configuration for this pydantic object."""
+
+ allow_population_by_field_name = True
+ arbitrary_types_allowed = True
+
@property
def lc_secrets(self) -> Dict[str, str]:
return {"anthropic_api_key": "ANTHROPIC_API_KEY"}
diff --git a/libs/langchain/langchain/llms/anthropic.py b/libs/langchain/langchain/llms/anthropic.py
index afaea04b5..63664e07a 100644
--- a/libs/langchain/langchain/llms/anthropic.py
+++ b/libs/langchain/langchain/llms/anthropic.py
@@ -21,10 +21,10 @@ from langchain.utils.utils import build_extra_kwargs
class _AnthropicCommon(BaseLanguageModel):
client: Any = None #: :meta private:
async_client: Any = None #: :meta private:
- model: str = "claude-2"
+ model: str = Field(default="claude-2", alias="model_name")
"""Model name to use."""
- max_tokens_to_sample: int = 256
+ max_tokens_to_sample: int = Field(default=256, alias="max_tokens")
"""Denotes the number of tokens to predict per generation."""
temperature: Optional[float] = None
@@ -144,6 +144,7 @@ class Anthropic(LLM, _AnthropicCommon):
import anthropic
from langchain.llms import Anthropic
+
model = Anthropic(model="", anthropic_api_key="my-api-key")
# Simplest invocation, automatically wrapped with HUMAN_PROMPT
@@ -157,6 +158,12 @@ class Anthropic(LLM, _AnthropicCommon):
response = model(prompt)
"""
+ class Config:
+ """Configuration for this pydantic object."""
+
+ allow_population_by_field_name = True
+ arbitrary_types_allowed = True
+
@root_validator()
def raise_warning(cls, values: Dict) -> Dict:
"""Raise warning that this class is deprecated."""
diff --git a/libs/langchain/tests/integration_tests/chat_models/test_anthropic_2.py b/libs/langchain/tests/integration_tests/chat_models/test_anthropic_2.py
index 7447ec03e..54b604527 100644
--- a/libs/langchain/tests/integration_tests/chat_models/test_anthropic_2.py
+++ b/libs/langchain/tests/integration_tests/chat_models/test_anthropic_2.py
@@ -8,6 +8,18 @@ from langchain.chat_models import ChatAnthropic
os.environ["ANTHROPIC_API_KEY"] = "foo"
+@pytest.mark.requires("anthropic")
+def test_anthropic_model_name_param() -> None:
+ llm = ChatAnthropic(model_name="foo")
+ assert llm.model == "foo"
+
+
+@pytest.mark.requires("anthropic")
+def test_anthropic_model_param() -> None:
+ llm = ChatAnthropic(model="foo")
+ assert llm.model == "foo"
+
+
@pytest.mark.requires("anthropic")
def test_anthropic_model_kwargs() -> None:
llm = ChatAnthropic(model_kwargs={"foo": "bar"})
diff --git a/libs/langchain/tests/integration_tests/llms/test_anthropic.py b/libs/langchain/tests/integration_tests/llms/test_anthropic.py
index 3604f6196..f68053b2a 100644
--- a/libs/langchain/tests/integration_tests/llms/test_anthropic.py
+++ b/libs/langchain/tests/integration_tests/llms/test_anthropic.py
@@ -9,6 +9,18 @@ from langchain.schema import LLMResult
from tests.unit_tests.callbacks.fake_callback_handler import FakeCallbackHandler
+@pytest.mark.requires("anthropic")
+def test_anthropic_model_name_param() -> None:
+ llm = Anthropic(model_name="foo")
+ assert llm.model == "foo"
+
+
+@pytest.mark.requires("anthropic")
+def test_anthropic_model_param() -> None:
+ llm = Anthropic(model="foo")
+ assert llm.model == "foo"
+
+
def test_anthropic_call() -> None:
"""Test valid call to anthropic."""
llm = Anthropic(model="claude-instant-1")
@@ -24,7 +36,7 @@ def test_anthropic_streaming() -> None:
assert isinstance(generator, Generator)
for token in generator:
- assert isinstance(token["completion"], str)
+ assert isinstance(token, str)
def test_anthropic_streaming_callback() -> None:
diff --git a/libs/langchain/tests/unit_tests/chat_models/test_openai.py b/libs/langchain/tests/unit_tests/chat_models/test_openai.py
index b417d82e9..c23372472 100644
--- a/libs/langchain/tests/unit_tests/chat_models/test_openai.py
+++ b/libs/langchain/tests/unit_tests/chat_models/test_openai.py
@@ -6,9 +6,7 @@ from unittest.mock import MagicMock, patch
import pytest
from langchain.adapters.openai import convert_dict_to_message
-from langchain.chat_models.openai import (
- ChatOpenAI,
-)
+from langchain.chat_models.openai import ChatOpenAI
from langchain.schema.messages import (
AIMessage,
FunctionMessage,
@@ -17,6 +15,14 @@ from langchain.schema.messages import (
)
+@pytest.mark.requires("openai")
+def test_openai_model_param() -> None:
+ llm = ChatOpenAI(model="foo")
+ assert llm.model_name == "foo"
+ llm = ChatOpenAI(model_name="foo")
+ assert llm.model_name == "foo"
+
+
def test_function_message_dict_to_function_message() -> None:
content = json.dumps({"result": "Example #1"})
name = "test_function"
From c215481531a8f36b6c196b213db8d4ec2f0c5306 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=20=E6=96=B9=E7=91=9E?=
Date: Thu, 24 Aug 2023 09:26:29 +0800
Subject: [PATCH 097/143] Update default index type and metric type for MyScale
vector store (#9353)
We update the default index type from `IVFFLAT` to `MSTG`, a new vector
index type developed by MyScale.
---
libs/langchain/langchain/vectorstores/myscale.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libs/langchain/langchain/vectorstores/myscale.py b/libs/langchain/langchain/vectorstores/myscale.py
index 7bbb20dcf..ef4db34ea 100644
--- a/libs/langchain/langchain/vectorstores/myscale.py
+++ b/libs/langchain/langchain/vectorstores/myscale.py
@@ -46,7 +46,7 @@ class MyScaleSettings(BaseSettings):
table (str) : Table name to operate on.
Defaults to 'vector_table'.
metric (str) : Metric to compute distance,
- supported are ('l2', 'cosine', 'ip'). Defaults to 'cosine'.
+ supported are ('L2', 'Cosine', 'IP'). Defaults to 'Cosine'.
column_map (Dict) : Column type map to project column name onto langchain
semantics. Must have keys: `text`, `id`, `vector`,
must be same size to number of columns. For example:
@@ -69,7 +69,7 @@ class MyScaleSettings(BaseSettings):
username: Optional[str] = None
password: Optional[str] = None
- index_type: str = "IVFFLAT"
+ index_type: str = "MSTG"
index_param: Optional[Dict[str, str]] = None
column_map: Dict[str, str] = {
@@ -81,7 +81,7 @@ class MyScaleSettings(BaseSettings):
database: str = "default"
table: str = "langchain"
- metric: str = "cosine"
+ metric: str = "Cosine"
def __getitem__(self, item: str) -> Any:
return getattr(self, item)
From b88dfcb42a52eab78ccdcd38c1f1833c2acf3caf Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Wed, 23 Aug 2023 21:41:38 -0400
Subject: [PATCH 098/143] Add indexing support (#9614)
This PR introduces a persistence layer to help with indexing workflows
into
vectorstores.
The indexing code helps users to:
1. Avoid writing duplicated content into the vectorstore
2. Avoid over-writing content if it's unchanged
Importantly, this keeps on working even if the content being written is
derived
via a set of transformations from some source content (e.g., indexing
children
documents that were derived from parent documents by chunking.)
The two main components are:
1. Persistence layer that keeps track of which keys were updated and
when.
Keeping track of the timestamp of updates allows us to clean up old
content
safely, and with minimal complexity.
2. HashedDocument which is used to hash the contents (including
metadata) of
the documents. We rely on the hashes for identifying duplicates.
The indexing code works with **ANY** document loader. To add
transformations
to the documents, users for now can add a custom document loader
that composes an existing loader together with document transformers.
---------
Co-authored-by: Bagatur
---
.../modules/data_connection/indexing.ipynb | 916 ++++++++++++++++++
libs/langchain/langchain/indexes/__init__.py | 27 +-
libs/langchain/langchain/indexes/_api.py | 346 +++++++
.../langchain/indexes/_sql_record_manager.py | 265 +++++
libs/langchain/langchain/indexes/base.py | 95 ++
.../tests/unit_tests/indexes/test_api.py | 13 +
.../indexes/test_hashed_document.py | 50 +
.../tests/unit_tests/indexes/test_indexing.py | 474 +++++++++
.../indexes/test_sql_record_manager.py | 276 ++++++
9 files changed, 2460 insertions(+), 2 deletions(-)
create mode 100644 docs/extras/modules/data_connection/indexing.ipynb
create mode 100644 libs/langchain/langchain/indexes/_api.py
create mode 100644 libs/langchain/langchain/indexes/_sql_record_manager.py
create mode 100644 libs/langchain/langchain/indexes/base.py
create mode 100644 libs/langchain/tests/unit_tests/indexes/test_api.py
create mode 100644 libs/langchain/tests/unit_tests/indexes/test_hashed_document.py
create mode 100644 libs/langchain/tests/unit_tests/indexes/test_indexing.py
create mode 100644 libs/langchain/tests/unit_tests/indexes/test_sql_record_manager.py
diff --git a/docs/extras/modules/data_connection/indexing.ipynb b/docs/extras/modules/data_connection/indexing.ipynb
new file mode 100644
index 000000000..28914efaa
--- /dev/null
+++ b/docs/extras/modules/data_connection/indexing.ipynb
@@ -0,0 +1,916 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0fe57ac5-31c5-4dbb-b96c-78dead32e1bd",
+ "metadata": {},
+ "source": [
+ "# Indexing\n",
+ "\n",
+ "Here, we will look at a basic indexing workflow using the LangChain indexing API. \n",
+ "\n",
+ "The indexing API lets you load and keep in sync documents from any source into a vector store. Specifically, it helps:\n",
+ "\n",
+ "* Avoid writing duplicated content into the vector store\n",
+ "* Avoid re-writing unchanged content\n",
+ "* Avoid re-computing embeddings over unchanged content\n",
+ "\n",
+ "All of which should save you time and money, as well as improve your vector search results.\n",
+ "\n",
+ "Crucially, the indexing API will work even with documents that have gone through several \n",
+ "transformation steps (e.g., via text chunking) with respect to the original source documents.\n",
+ "\n",
+ "## How it works\n",
+ "\n",
+ "LangChain indexing makes use of a record manager (`RecordManager`) that keeps track of document writes into the vector store.\n",
+ "\n",
+ "When indexing content, hashes are computed for each document, and the following information is stored in the record manager: \n",
+ "\n",
+ "- the document hash (hash of both page content and metadata)\n",
+ "- write time\n",
+ "- the source id -- each document should include information in its metadata to allow us to determine the ultimate source of this document\n",
+ "\n",
+ "## Deletion modes\n",
+ "\n",
+ "When indexing documents into a vector store, it's possible that some existing documents in the vector store should be deleted. In certain situations you may want to remove any existing documents that are derived from the same sources as the new documents being indexed. In others you may want to delete all existing documents wholesale. The indexing API deletion modes let you pick the behavior you want:\n",
+ "\n",
+ "| Delete Mode | De-Duplicates Content | Parallelizable | Cleans Up Deleted Source Docs | Cleans Up Mutations of Source Docs and/or Derived Docs | Clean Up Timing |\n",
+ "|-------------|-----------------------|---------------|----------------------------------|----------------------------------------------------|---------------------|\n",
+ "| None | ✅ | ✅ | ❌ | ❌ | - |\n",
+ "| Incremental | ✅ | ✅ | ❌ | ✅ | Continuously |\n",
+ "| Full | ✅ | ❌ | ✅ | ✅ | At end of indexing |\n",
+ "\n",
+ "\n",
+ "`None` does not do any automatic clean up, allowing the user to manually do clean up of old content. \n",
+ "\n",
+ "`incremental` and `full` offer the following automated clean up:\n",
+ "\n",
+ "* If the content of the source document or derived documents has **changed**, both `incremental` and `full` modes will clean up (delete) previous versions of the content.\n",
+ "* If the source document has been **deleted** (meaning it is not included in the documents currently being indexed), the `full` delete mode will delete it from the vector store correctly, but the `incremental` mode will not.\n",
+ "\n",
+ "When content is mutated (e.g., the source PDF file was revised) there will be a period of time during indexing when both the new and old versions may be returned to the user. This happens after the new content was written, but before the old version was deleted.\n",
+ "\n",
+ "* `incremental` indexing minimizes this period of time as it is able to do clean up continuously, as it writes.\n",
+ "* `full` mode does the clean up after all batches have been written.\n",
+ "\n",
+ "## Requirements\n",
+ "\n",
+ "1. Do not use with a store that has been pre-populated with content independently of the indexing API, as the record manager will not know that records have been inserted previously.\n",
+ "2. Only works with LangChain ``VectorStore``'s that support:\n",
+ " * document addition by id (`add_documents` method with `ids` argument)\n",
+ " * delete by id (`delete` method with `ids` argument)\n",
+ " \n",
+ "## Caution\n",
+ "\n",
+ "The record manager relies on a time-based mechanism to determine what content can be cleaned up (when using `full` or `incremental` delete modes).\n",
+ "\n",
+ "If two tasks run back to back, and the first task finishes before the clock time changes, then the second task may not be able to clean up content.\n",
+ "\n",
+ "This is unlikely to be an issue in actual settings for the following reasons:\n",
+ "\n",
+ "1. The RecordManager uses higher resolution timestamps.\n",
+ "2. The data would need to change between the first and the second task runs, which becomes unlikely if the time interval between the tasks is small.\n",
+ "3. Indexing tasks typically take more than a few ms."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ec2109b4-cbcc-44eb-9dac-3f7345f971dc",
+ "metadata": {},
+ "source": [
+ "## Quickstart"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "15f7263e-c82e-4914-874f-9699ea4de93e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.embeddings import OpenAIEmbeddings\n",
+ "from langchain.indexes import SQLRecordManager, index\n",
+ "from langchain.schema import Document\n",
+ "from langchain.vectorstores import ElasticsearchStore"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f81201ab-d997-433c-9f18-ceea70e61cbd",
+ "metadata": {},
+ "source": [
+ "Initialize a vector store and set up the embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "4ffc9659-91c0-41e0-ae4b-f7ff0d97292d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "collection_name = \"test_index\"\n",
+ "\n",
+ "embedding = OpenAIEmbeddings()\n",
+ "\n",
+ "vectorstore = ElasticsearchStore(\n",
+ " es_url=\"http://localhost:9200\", index_name=\"test_index\", embedding=embedding\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9b7564f-2334-428b-b513-13045a08b56c",
+ "metadata": {},
+ "source": [
+ "Initialize a record manager with an appropriate namespace.\n",
+ "\n",
+ "**Suggestion** Use a namespace that takes into account both the vector store and the collection name in the vector store; e.g., 'redis/my_docs', 'chromadb/my_docs' or 'postgres/my_docs'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "498cc80e-c339-49ee-893b-b18d06346ef8",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "namespace = f\"elasticsearch/{collection_name}\"\n",
+ "record_manager = SQLRecordManager(\n",
+ " namespace, db_url=\"sqlite:///record_manager_cache.sql\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "835c2c19-68ec-4086-9066-f7ba40877fd5",
+ "metadata": {},
+ "source": [
+ "Create a schema before using the record manager"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "a4be2da3-3a5c-468a-a824-560157290f7f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "record_manager.create_schema()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f07c6bd-6ada-4b17-a8c5-fe5e4a5278fd",
+ "metadata": {},
+ "source": [
+ "Let's index some test documents"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "bbfdf314-14f9-4799-8fb6-d42de4d51287",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "doc1 = Document(page_content=\"kitty\", metadata={\"source\": \"kitty.txt\"})\n",
+ "doc2 = Document(page_content=\"doggy\", metadata={\"source\": \"doggy.txt\"})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c7d572be-a913-4511-ab64-2864a252458a",
+ "metadata": {},
+ "source": [
+ "Indexing into an empty vectorstore"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "67d2a5c8-f2bd-489a-b58e-2c7ba7fefe6f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _clear():\n",
+ " \"\"\"Hacky helper method to clear content. See the `full` mode section to understand why it works.\"\"\"\n",
+ " index([], record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5e92e76-f23f-4a61-8a2d-f16baf288700",
+ "metadata": {},
+ "source": [
+ "### ``None`` deletion mode\n",
+ "\n",
+ "This mode does not do automatic clean up of old versions of content; however, it still takes care of content de-duplication."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "e2288cee-1738-4054-af72-23b5c5be8840",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_clear()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "b253483b-5be0-4151-b732-ca93db4457b1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " [doc1, doc1, doc1, doc1, doc1],\n",
+ " record_manager,\n",
+ " vectorstore,\n",
+ " delete_mode=None,\n",
+ " source_id_key=\"source\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "7abaf351-bf5a-4d9e-95cd-4e3ecbfc1a84",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_clear()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "55b6873c-5907-4fa6-84ca-df6cdf1810f0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " [doc1, doc2], record_manager, vectorstore, delete_mode=None, source_id_key=\"source\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7be3e55a-5fe9-4f40-beff-577c2aa5e76a",
+ "metadata": {},
+ "source": [
+ "Second time around all content will be skipped"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "59d74ca1-2e3d-4b4c-ad88-a4907aa20081",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " [doc1, doc2], record_manager, vectorstore, delete_mode=None, source_id_key=\"source\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "237a809e-575d-4f02-870e-5906a3643f30",
+ "metadata": {},
+ "source": [
+ "### ``\"incremental\"`` deletion mode"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "6bc91073-0ab4-465a-9302-e7f4bbd2285c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_clear()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "4a551091-6d46-4cdd-9af9-8672e5866a0a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " [doc1, doc2],\n",
+ " record_manager,\n",
+ " vectorstore,\n",
+ " delete_mode=\"incremental\",\n",
+ " source_id_key=\"source\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d0604ab8-318c-4706-959b-3907af438630",
+ "metadata": {},
+ "source": [
+ "Indexing again should result in both documents getting **skipped** -- also skipping the embedding operation!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "81785863-391b-4578-a6f6-63b3e5285488",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " [doc1, doc2],\n",
+ " record_manager,\n",
+ " vectorstore,\n",
+ " delete_mode=\"incremental\",\n",
+ " source_id_key=\"source\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b205c1ba-f069-4a4e-af93-dc98afd5c9e6",
+ "metadata": {},
+ "source": [
+ "If we provide no documents with incremental indexing mode, nothing will change"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "1f73ca85-7478-48ab-976c-17b00beec7bd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " [], record_manager, vectorstore, delete_mode=\"incremental\", source_id_key=\"source\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b8c4ac96-8d60-4ade-8a94-e76ccb536442",
+ "metadata": {},
+ "source": [
+ "If we mutate a document, the new version will be written and all old versions sharing the same source will be deleted."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "27d05bcb-d96d-42eb-88a8-54b33d6cfcdc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "changed_doc_2 = Document(page_content=\"puppy\", metadata={\"source\": \"doggy.txt\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "3809e379-5962-4267-add9-b10f43e24c66",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 1}"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " [changed_doc_2],\n",
+ " record_manager,\n",
+ " vectorstore,\n",
+ " delete_mode=\"incremental\",\n",
+ " source_id_key=\"source\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bc75b9c-784a-4eb6-b5d6-688e3fbd4658",
+ "metadata": {},
+ "source": [
+ "### ``\"full\"`` deletion mode\n",
+ "\n",
+ "In `full` mode the user should pass the `full` universe of content that should be indexed into the indexing function.\n",
+ "\n",
+ "Any documents that are not passed into the indexing function and are present in the vectorstore will be deleted!\n",
+ "\n",
+ "This behavior is useful to handle deletions of source documents."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "38a14a3d-11c7-43e2-b7f1-08e487961bb5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_clear()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "46b5d7b6-ce91-47d2-a9d0-f390e77d847f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "all_docs = [doc1, doc2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "06954765-6155-40a0-b95e-33ef87754c8d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(all_docs, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "887c45c6-4363-4389-ac56-9cdad682b4c8",
+ "metadata": {},
+ "source": [
+ "Say someone deleted the first doc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "35270e4e-9b03-4486-95de-e819ca5e469f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "del all_docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7d835a6a-f468-4d79-9a3d-47db187edbb8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='doggy', metadata={'source': 'doggy.txt'})]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "all_docs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d940bcb4-cf6d-4c21-a565-e7f53f6dacf1",
+ "metadata": {},
+ "source": [
+ "Using full mode will clean up the deleted content as well"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "1b660eae-3bed-434d-a6f5-2aec96e5f0d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 0, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(all_docs, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a7ecdc9-df3c-4601-b2f3-50fdffc6e5f9",
+ "metadata": {},
+ "source": [
+ "## Source "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4002a4ac-02dd-4599-9b23-9b59f54237c8",
+ "metadata": {},
+ "source": [
+ "The metadata attribute contains a field called `source`. This source should be pointing at the *ultimate* provenance associated with the given document.\n",
+ "\n",
+ "For example, if these documents are representing chunks of some parent document, the `source` for both documents should be the same and reference the parent document.\n",
+ "\n",
+ "In general, `source` should always be specified. Only use `None` if you **never** intend to use `incremental` mode, and for some reason can't specify the `source` field correctly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "184d3051-7fd1-4db2-a1d5-218ac0e1e641",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.text_splitter import CharacterTextSplitter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "11318248-ad2a-4ef0-bd9b-9d4dab97caba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "doc1 = Document(\n",
+ " page_content=\"kitty kitty kitty kitty kitty\", metadata={\"source\": \"kitty.txt\"}\n",
+ ")\n",
+ "doc2 = Document(page_content=\"doggy doggy the doggy\", metadata={\"source\": \"doggy.txt\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "2cbf0902-d17b-44c9-8983-e8d0e831f909",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='kitty kit', metadata={'source': 'kitty.txt'}),\n",
+ " Document(page_content='tty kitty ki', metadata={'source': 'kitty.txt'}),\n",
+ " Document(page_content='tty kitty', metadata={'source': 'kitty.txt'}),\n",
+ " Document(page_content='doggy doggy', metadata={'source': 'doggy.txt'}),\n",
+ " Document(page_content='the doggy', metadata={'source': 'doggy.txt'})]"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "new_docs = CharacterTextSplitter(\n",
+ " separator=\"t\", keep_separator=True, chunk_size=12, chunk_overlap=2\n",
+ ").split_documents([doc1, doc2])\n",
+ "new_docs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "0f9d9bc2-ea85-48ab-b4a2-351c8708b1d4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_clear()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "58781d81-f273-4aeb-8df6-540236826d00",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 5, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " new_docs,\n",
+ " record_manager,\n",
+ " vectorstore,\n",
+ " delete_mode=\"incremental\",\n",
+ " source_id_key=\"source\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "11b81cb6-5f04-499b-b125-1abb22d353bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "changed_doggy_docs = [\n",
+ " Document(page_content=\"woof woof\", metadata={\"source\": \"doggy.txt\"}),\n",
+ " Document(page_content=\"woof woof woof\", metadata={\"source\": \"doggy.txt\"}),\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab1c0915-3f9e-42ac-bdb5-3017935c6e7f",
+ "metadata": {},
+ "source": [
+ "This should delete the old versions of documents associated with `doggy.txt` source and replace them with the new versions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "fec71cb5-6757-4b92-a306-62509f6e867d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 2}"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(\n",
+ " changed_doggy_docs,\n",
+ " record_manager,\n",
+ " vectorstore,\n",
+ " delete_mode=\"incremental\",\n",
+ " source_id_key=\"source\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "876f5ab6-4b25-423e-8cff-f5a7a014395b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='tty kitty', metadata={'source': 'kitty.txt'}),\n",
+ " Document(page_content='tty kitty ki', metadata={'source': 'kitty.txt'}),\n",
+ " Document(page_content='kitty kit', metadata={'source': 'kitty.txt'})]"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "vectorstore.similarity_search(\"dog\", k=30)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c0af4d24-d735-4e5d-ad9b-a2e8b281f9f1",
+ "metadata": {},
+ "source": [
+ "## Using with Loaders\n",
+ "\n",
+ "Indexing can accept either an iterable of documents or else any loader.\n",
+ "\n",
+ "**Attention** The loader **MUST** set source keys correctly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "08b68357-27c0-4f07-a51d-61c986aeb359",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders.base import BaseLoader\n",
+ "\n",
+ "\n",
+ "class MyCustomLoader(BaseLoader):\n",
+ " def lazy_load(self):\n",
+ " text_splitter = CharacterTextSplitter(\n",
+ " separator=\"t\", keep_separator=True, chunk_size=12, chunk_overlap=2\n",
+ " )\n",
+ " docs = [\n",
+ " Document(page_content=\"woof woof\", metadata={\"source\": \"doggy.txt\"}),\n",
+ " Document(page_content=\"woof woof woof\", metadata={\"source\": \"doggy.txt\"}),\n",
+ " ]\n",
+ " yield from text_splitter.split_documents(docs)\n",
+ "\n",
+ " def load(self):\n",
+ " return list(self.lazy_load())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "5dae8e11-c0d6-4fc6-aa0e-68f8d92b5087",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_clear()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "d8d72f76-6d6e-4a7c-8fea-9bdec05af05b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = MyCustomLoader()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "945c45cc-5a8d-4bd7-9f36-4ebd4a50e08b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='woof woof', metadata={'source': 'doggy.txt'}),\n",
+ " Document(page_content='woof woof woof', metadata={'source': 'doggy.txt'})]"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "dcb1ba71-db49-4140-ab4a-c5d64fc2578a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index(loader, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "441159c1-dd84-48d7-8599-37a65c9fb589",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='woof woof', metadata={'source': 'doggy.txt'}),\n",
+ " Document(page_content='woof woof woof', metadata={'source': 'doggy.txt'})]"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "vectorstore.similarity_search(\"dog\", k=30)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/langchain/langchain/indexes/__init__.py b/libs/langchain/langchain/indexes/__init__.py
index d0c878915..792cf30c8 100644
--- a/libs/langchain/langchain/indexes/__init__.py
+++ b/libs/langchain/langchain/indexes/__init__.py
@@ -1,5 +1,28 @@
-"""**Index** utilities."""
+"""Code to support various indexing workflows.
+
+Provides code to:
+
+* Create knowledge graphs from data.
+
+* Support indexing workflows from LangChain data loaders to vectorstores.
+
+For indexing workflows, this code is used to avoid writing duplicated content
+into the vectorstore and to avoid over-writing content if it's unchanged.
+
+Importantly, this keeps on working even if the content being written is derived
+via a set of transformations from some source content (e.g., indexing children
+documents that were derived from parent documents by chunking.)
+"""
+from langchain.indexes._api import IndexingResult, index
+from langchain.indexes._sql_record_manager import SQLRecordManager
from langchain.indexes.graph import GraphIndexCreator
from langchain.indexes.vectorstore import VectorstoreIndexCreator
-__all__ = ["GraphIndexCreator", "VectorstoreIndexCreator"]
+__all__ = [
+ # Keep sorted
+ "GraphIndexCreator",
+ "index",
+ "IndexingResult",
+ "SQLRecordManager",
+ "VectorstoreIndexCreator",
+]
diff --git a/libs/langchain/langchain/indexes/_api.py b/libs/langchain/langchain/indexes/_api.py
new file mode 100644
index 000000000..47b9d33ea
--- /dev/null
+++ b/libs/langchain/langchain/indexes/_api.py
@@ -0,0 +1,346 @@
+"""Module contains logic for indexing documents into vector stores."""
+from __future__ import annotations
+
+import hashlib
+import json
+import uuid
+from itertools import islice
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ Literal,
+ Optional,
+ Sequence,
+ TypedDict,
+ TypeVar,
+ Union,
+ cast,
+)
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.indexes.base import NAMESPACE_UUID, RecordManager
+from langchain.pydantic_v1 import root_validator
+from langchain.schema import Document
+from langchain.vectorstores.base import VectorStore
+
+T = TypeVar("T")
+
+
+def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
+ """Hashes a string and returns the corresponding UUID."""
+ hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
+ return uuid.uuid5(NAMESPACE_UUID, hash_value)
+
+
+def _hash_nested_dict_to_uuid(data: dict) -> uuid.UUID:
+ """Hashes a nested dictionary and returns the corresponding UUID."""
+ serialized_data = json.dumps(data, sort_keys=True)
+ hash_value = hashlib.sha1(serialized_data.encode("utf-8")).hexdigest()
+ return uuid.uuid5(NAMESPACE_UUID, hash_value)
+
+
+class _HashedDocument(Document):
+ """A hashed document with a unique ID."""
+
+ uid: str
+ hash_: str
+ """The hash of the document including content and metadata."""
+ content_hash: str
+ """The hash of the document content."""
+ metadata_hash: str
+ """The hash of the document metadata."""
+
+ @root_validator(pre=True)
+ def calculate_hashes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+ """Root validator to calculate content and metadata hash."""
+ content = values.get("page_content", "")
+ metadata = values.get("metadata", {})
+
+ forbidden_keys = ("hash_", "content_hash", "metadata_hash")
+
+ for key in forbidden_keys:
+ if key in metadata:
+ raise ValueError(
+ f"Metadata cannot contain key {key} as it "
+ f"is reserved for internal use."
+ )
+
+ content_hash = str(_hash_string_to_uuid(content))
+
+ try:
+ metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
+ except Exception as e:
+ raise ValueError(
+ f"Failed to hash metadata: {e}. "
+ f"Please use a dict that can be serialized using json."
+ )
+
+ values["content_hash"] = content_hash
+ values["metadata_hash"] = metadata_hash
+ values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))
+
+ _uid = values.get("uid", None)
+
+ if _uid is None:
+ values["uid"] = values["hash_"]
+ return values
+
+ def to_document(self) -> Document:
+ """Return a Document object."""
+ return Document(
+ page_content=self.page_content,
+ metadata=self.metadata,
+ )
+
+ @classmethod
+ def from_document(
+ cls, document: Document, *, uid: Optional[str] = None
+ ) -> _HashedDocument:
+ """Create a HashedDocument from a Document."""
+ return cls(
+ uid=uid,
+ page_content=document.page_content,
+ metadata=document.metadata,
+ )
+
+
+def _batch(size: int, iterable: Iterable[T]) -> Iterator[List[T]]:
+ """Utility batching function."""
+ it = iter(iterable)
+ while True:
+ chunk = list(islice(it, size))
+ if not chunk:
+ return
+ yield chunk
+
+
+def _get_source_id_assigner(
+ source_id_key: Union[str, Callable[[Document], str], None],
+) -> Callable[[Document], Union[str, None]]:
+ """Return a function that extracts the source id from a document."""
+ if source_id_key is None:
+ return lambda doc: None
+ elif isinstance(source_id_key, str):
+ return lambda doc: doc.metadata[source_id_key]
+ elif callable(source_id_key):
+ return source_id_key
+ else:
+ raise ValueError(
+ f"source_id_key should be either None, a string or a callable. "
+ f"Got {source_id_key} of type {type(source_id_key)}."
+ )
+
+
+def _deduplicate_in_order(
+ hashed_documents: Iterable[_HashedDocument],
+) -> Iterator[_HashedDocument]:
+ """Deduplicate a list of hashed documents while preserving order."""
+ seen = set()
+
+ for hashed_doc in hashed_documents:
+ if hashed_doc.hash_ not in seen:
+ seen.add(hashed_doc.hash_)
+ yield hashed_doc
+
+
+# PUBLIC API
+
+
+class IndexingResult(TypedDict):
+ """A detailed breakdown of the result of the indexing operation."""
+
+ num_added: int
+ """Number of added documents."""
+ num_updated: int
+ """Number of updated documents because they were not up to date."""
+ num_deleted: int
+ """Number of deleted documents."""
+ num_skipped: int
+ """Number of skipped documents because they were already up to date."""
+
+
+def index(
+ docs_source: Union[BaseLoader, Iterable[Document]],
+ record_manager: RecordManager,
+ vector_store: VectorStore,
+ *,
+ batch_size: int = 100,
+ delete_mode: Literal["incremental", "full", None] = None,
+ source_id_key: Union[str, Callable[[Document], str], None] = None,
+) -> IndexingResult:
+ """Index data from the loader into the vector store.
+
+ Indexing functionality uses a manager to keep track of which documents
+ are in the vector store.
+
+ This allows us to keep track of which documents were updated, and which
+ documents were deleted, which documents should be skipped.
+
+ For the time being, documents are indexed using their hashes, and users
+ are not able to specify the uid of the document.
+
+ IMPORTANT:
+ if delete_mode is set to "full", the loader should be returning
+ the entire dataset, and not just a subset of the dataset.
+ Otherwise, the clean up will remove documents that it is not
+ supposed to.
+
+ Args:
+ docs_source: Data loader or iterable of documents to index.
+ record_manager: Timestamped set to keep track of which documents were
+ updated.
+ vector_store: Vector store to index the documents into.
+ batch_size: Batch size to use when indexing.
+ delete_mode: How to handle clean up of documents.
+ - Incremental: Cleans up all documents that haven't been updated AND
+ that are associated with source ids that were seen
+ during indexing.
+ Clean up is done continuously during indexing helping
+ to minimize the probability of users seeing duplicated
+ content.
+ - Full: Delete all documents that have not been returned by the loader.
+ Clean up runs after all documents have been indexed.
+ This means that users may see duplicated content during indexing.
+ - None: Do not delete any documents.
+ source_id_key: Optional key that helps identify the original source
+ of the document.
+
+ Returns:
+ Indexing result which contains information about how many documents
+ were added, updated, deleted, or skipped.
+ """
+ if delete_mode not in {"incremental", "full", None}:
+ raise ValueError(
+ f"delete_mode should be one of 'incremental', 'full' or None. "
+ f"Got {delete_mode}."
+ )
+
+ if delete_mode == "incremental" and source_id_key is None:
+ raise ValueError("Source id key is required when delete mode is incremental.")
+
+ # Check that the Vectorstore has required methods implemented
+ methods = ["delete", "add_documents"]
+
+ for method in methods:
+ if not hasattr(vector_store, method):
+ raise ValueError(
+ f"Vectorstore {vector_store} does not have required method {method}"
+ )
+
+ if type(vector_store).delete == VectorStore.delete:
+ # Checking if the vectorstore has overridden the default delete method
+ # implementation which just raises a NotImplementedError
+ raise ValueError("Vectorstore has not implemented the delete method")
+
+ if isinstance(docs_source, BaseLoader):
+ try:
+ doc_iterator = docs_source.lazy_load()
+ except NotImplementedError:
+ doc_iterator = iter(docs_source.load())
+ else:
+ doc_iterator = iter(docs_source)
+
+ source_id_assigner = _get_source_id_assigner(source_id_key)
+
+ # Mark when the update started.
+ index_start_dt = record_manager.get_time()
+ num_added = 0
+ num_skipped = 0
+ num_updated = 0
+ num_deleted = 0
+
+ for doc_batch in _batch(batch_size, doc_iterator):
+ hashed_docs = list(
+ _deduplicate_in_order(
+ [_HashedDocument.from_document(doc) for doc in doc_batch]
+ )
+ )
+
+ source_ids: Sequence[Optional[str]] = [
+ source_id_assigner(doc) for doc in hashed_docs
+ ]
+
+ if delete_mode == "incremental":
+ # If the delete mode is incremental, source ids are required.
+ for source_id, hashed_doc in zip(source_ids, hashed_docs):
+ if source_id is None:
+ raise ValueError(
+ "Source ids are required when delete mode is incremental. "
+ f"Document that starts with "
+ f"content: {hashed_doc.page_content[:100]} was not assigned "
+ f"as source id."
+ )
+ # source ids cannot be None after for loop above.
+ source_ids = cast(Sequence[str], source_ids) # type: ignore[assignment]
+
+ exists_batch = record_manager.exists([doc.uid for doc in hashed_docs])
+
+ # Filter out documents that already exist in the record store.
+ uids = []
+ docs_to_index = []
+ for doc, hashed_doc, doc_exists in zip(doc_batch, hashed_docs, exists_batch):
+ if doc_exists:
+ # Must be updated to refresh timestamp.
+ record_manager.update([hashed_doc.uid], time_at_least=index_start_dt)
+ num_skipped += 1
+ continue
+ uids.append(hashed_doc.uid)
+ docs_to_index.append(doc)
+
+ # Be pessimistic and assume that any vector store write may fail.
+ # First write to vector store
+ if docs_to_index:
+ vector_store.add_documents(docs_to_index, ids=uids)
+ num_added += len(docs_to_index)
+
+ # And only then update the record store.
+ # Update ALL records, even if they already exist since we want to refresh
+ # their timestamp.
+ record_manager.update(
+ [doc.uid for doc in hashed_docs],
+ group_ids=source_ids,
+ time_at_least=index_start_dt,
+ )
+
+ # If source IDs are provided, we can do the deletion incrementally!
+ if delete_mode == "incremental":
+ # Get the uids of the documents that were not returned by the loader.
+
+ # mypy isn't good enough to determine that source ids cannot be None
+ # here due to a check that's happening above, so we check again.
+ for source_id in source_ids:
+ if source_id is None:
+ raise AssertionError("Source ids cannot be None here.")
+
+ _source_ids = cast(Sequence[str], source_ids)
+
+ uids_to_delete = record_manager.list_keys(
+ group_ids=_source_ids, before=index_start_dt
+ )
+ if uids_to_delete:
+ # First delete from vector store.
+ vector_store.delete(uids_to_delete)
+ # Then delete from record store.
+ record_manager.delete_keys(uids_to_delete)
+ num_deleted += len(uids_to_delete)
+
+ if delete_mode == "full":
+ uids_to_delete = record_manager.list_keys(before=index_start_dt)
+
+ if uids_to_delete:
+ # First delete from vector store.
+ vector_store.delete(uids_to_delete)
+ # Then delete from record store.
+ record_manager.delete_keys(uids_to_delete)
+ num_deleted = len(uids_to_delete)
+
+ return {
+ "num_added": num_added,
+ "num_updated": num_updated,
+ "num_skipped": num_skipped,
+ "num_deleted": num_deleted,
+ }
diff --git a/libs/langchain/langchain/indexes/_sql_record_manager.py b/libs/langchain/langchain/indexes/_sql_record_manager.py
new file mode 100644
index 000000000..9cad02ef9
--- /dev/null
+++ b/libs/langchain/langchain/indexes/_sql_record_manager.py
@@ -0,0 +1,265 @@
+"""Implementation of a record management layer in SQLAlchemy.
+
+The management layer uses SQLAlchemy to track upserted records.
+
+Currently, this layer only works with SQLite; however, should be adaptable
+to other SQL implementations with minimal effort.
+
+Currently, includes an implementation that uses SQLAlchemy which should
+allow it to work with a variety of SQL as a backend.
+
+* Each key is associated with an updated_at field.
+* This field is updated whenever the key is updated.
+* Keys can be listed based on the updated at field.
+* Keys can be deleted.
+"""
+import contextlib
+import uuid
+from typing import Any, Dict, Generator, List, Optional, Sequence
+
+from sqlalchemy import (
+ Column,
+ Engine,
+ Float,
+ Index,
+ String,
+ UniqueConstraint,
+ and_,
+ create_engine,
+ text,
+)
+from sqlalchemy.dialects.sqlite import insert
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import Session, sessionmaker
+
+from langchain.indexes.base import RecordManager
+
+Base = declarative_base()
+
+
+class UpsertionRecord(Base): # type: ignore[valid-type,misc]
+ """Table used to keep track of when a key was last updated."""
+
+ # ATTENTION:
+ # Prior to modifying this table, please determine whether
+ # we should create migrations for this table to make sure
+ # users do not experience data loss.
+ __tablename__ = "upsertion_record"
+
+ uuid = Column(
+ String,
+ index=True,
+ default=lambda: str(uuid.uuid4()),
+ primary_key=True,
+ nullable=False,
+ )
+ key = Column(String, index=True)
+ # Using a non-normalized representation to handle `namespace` attribute.
+ # If the need arises, this attribute can be pulled into a separate Collection
+ # table at some time later.
+ namespace = Column(String, index=True, nullable=False)
+ group_id = Column(String, index=True, nullable=True)
+
+ # The timestamp associated with the last record upsertion.
+ updated_at = Column(Float, index=True)
+
+ __table_args__ = (
+ UniqueConstraint("key", "namespace", name="uix_key_namespace"),
+ Index("ix_key_namespace", "key", "namespace"),
+ )
+
+
+class SQLRecordManager(RecordManager):
+ """A SQL Alchemy based implementation of the record manager."""
+
+ def __init__(
+ self,
+ namespace: str,
+ *,
+ engine: Optional[Engine] = None,
+ db_url: Optional[str] = None,
+ engine_kwargs: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ """Initialize the SQLRecordManager.
+
+ This class serves as a manager persistence layer that uses an SQL
+ backend to track upserted records. You should specify either a db_url
+ to create an engine or provide an existing engine.
+
+ Args:
+ namespace: The namespace associated with this record manager.
+ engine: An already existing SQL Alchemy engine.
+ Default is None.
+ db_url: A database connection string used to create
+ an SQL Alchemy engine. Default is None.
+ engine_kwargs: Additional keyword arguments
+ to be passed when creating the engine. Default is an empty dictionary.
+
+ Raises:
+ ValueError: If both db_url and engine are provided or neither.
+ AssertionError: If something unexpected happens during engine configuration.
+ """
+ super().__init__(namespace=namespace)
+ if db_url is None and engine is None:
+ raise ValueError("Must specify either db_url or engine")
+ if db_url is not None and engine is not None:
+ raise ValueError("Must specify either db_url or engine, not both")
+
+ if db_url:
+ _kwargs = engine_kwargs or {}
+ _engine = create_engine(db_url, **_kwargs)
+ elif engine:
+ _engine = engine
+ else:
+ raise AssertionError("Something went wrong with configuration of engine.")
+
+ self.engine = _engine
+ self.session_factory = sessionmaker(bind=self.engine)
+
+ def create_schema(self) -> None:
+ """Create the database schema."""
+ Base.metadata.create_all(self.engine)
+
+ @contextlib.contextmanager
+ def _make_session(self) -> Generator[Session, None, None]:
+ """Create a session and close it after use."""
+ session = self.session_factory()
+ try:
+ yield session
+ finally:
+ session.close()
+
+ def get_time(self) -> float:
+ """Get the current server time as a timestamp.
+
+ Please note it's critical that time is obtained from the server since
+ we want a monotonic clock.
+ """
+ with self._make_session() as session:
+ # * SQLite specific implementation, can be changed based on dialect.
+ # * For SQLite, unlike unixepoch it will work with older versions of SQLite.
+ # ----
+ # julianday('now'): Julian day number for the current date and time.
+ # The Julian day is a continuous count of days, starting from a
+ # reference date (Julian day number 0).
+ # 2440587.5 - constant represents the Julian day number for January 1, 1970
+ # 86400.0 - constant represents the number of seconds
+ # in a day (24 hours * 60 minutes * 60 seconds)
+ query = text("SELECT (julianday('now') - 2440587.5) * 86400.0;")
+ dt = session.execute(query).scalar()
+ if not isinstance(dt, float):
+ raise AssertionError(f"Unexpected type for datetime: {type(dt)}")
+ return dt
+
+ def update(
+ self,
+ keys: Sequence[str],
+ *,
+ group_ids: Optional[Sequence[Optional[str]]] = None,
+ time_at_least: Optional[float] = None,
+ ) -> None:
+ """Upsert records into the SQLite database."""
+ if group_ids is None:
+ group_ids = [None] * len(keys)
+
+ if len(keys) != len(group_ids):
+ raise ValueError(
+ f"Number of keys ({len(keys)}) does not match number of "
+ f"group_ids ({len(group_ids)})"
+ )
+
+ # Get the current time from the server.
+ # This makes an extra round trip to the server, should not be a big deal
+ # if the batch size is large enough.
+ # Getting the time here helps us compare it against the time_at_least
+ # and raise an error if there is a time sync issue.
+ # Here, we're just being extra careful to minimize the chance of
+ # data loss due to incorrectly deleting records.
+ update_time = self.get_time()
+
+ if time_at_least and update_time < time_at_least:
+ # Safeguard against time sync issues
+ raise AssertionError(f"Time sync issue: {update_time} < {time_at_least}")
+
+ records_to_upsert = [
+ {
+ "key": key,
+ "namespace": self.namespace,
+ "updated_at": update_time,
+ "group_id": group_id,
+ }
+ for key, group_id in zip(keys, group_ids)
+ ]
+
+ with self._make_session() as session:
+ # Note: uses SQLite insert to make on_conflict_do_update work.
+ # This code needs to be generalized a bit to work with more dialects.
+ insert_stmt = insert(UpsertionRecord).values(records_to_upsert)
+ stmt = insert_stmt.on_conflict_do_update( # type: ignore[attr-defined]
+ [UpsertionRecord.key, UpsertionRecord.namespace],
+ set_=dict(
+ # attr-defined type ignore
+ updated_at=insert_stmt.excluded.updated_at, # type: ignore
+ group_id=insert_stmt.excluded.group_id, # type: ignore
+ ),
+ )
+ session.execute(stmt)
+ session.commit()
+
+ def exists(self, keys: Sequence[str]) -> List[bool]:
+ """Check if the given keys exist in the SQLite database."""
+ with self._make_session() as session:
+ records = (
+ # mypy does not recognize .all()
+ session.query(UpsertionRecord.key) # type: ignore[attr-defined]
+ .filter(
+ and_(
+ UpsertionRecord.key.in_(keys),
+ UpsertionRecord.namespace == self.namespace,
+ )
+ )
+ .all()
+ )
+ found_keys = set(r.key for r in records)
+ return [k in found_keys for k in keys]
+
+ def list_keys(
+ self,
+ *,
+ before: Optional[float] = None,
+ after: Optional[float] = None,
+ group_ids: Optional[Sequence[str]] = None,
+ ) -> List[str]:
+ """List records in the SQLite database based on the provided date range."""
+ with self._make_session() as session:
+ query = session.query(UpsertionRecord).filter(
+ UpsertionRecord.namespace == self.namespace
+ )
+
+ # mypy does not recognize .all() or .filter()
+ if after:
+ query = query.filter( # type: ignore[attr-defined]
+ UpsertionRecord.updated_at > after
+ )
+ if before:
+ query = query.filter( # type: ignore[attr-defined]
+ UpsertionRecord.updated_at < before
+ )
+ if group_ids:
+ query = query.filter( # type: ignore[attr-defined]
+ UpsertionRecord.group_id.in_(group_ids)
+ )
+ records = query.all() # type: ignore[attr-defined]
+ return [r.key for r in records]
+
+ def delete_keys(self, keys: Sequence[str]) -> None:
+ """Delete records from the SQLite database."""
+ with self._make_session() as session:
+ # mypy does not recognize .delete()
+ session.query(UpsertionRecord).filter(
+ and_(
+ UpsertionRecord.key.in_(keys),
+ UpsertionRecord.namespace == self.namespace,
+ )
+ ).delete() # type: ignore[attr-defined]
+ session.commit()
diff --git a/libs/langchain/langchain/indexes/base.py b/libs/langchain/langchain/indexes/base.py
new file mode 100644
index 000000000..128455253
--- /dev/null
+++ b/libs/langchain/langchain/indexes/base.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+import uuid
+from abc import ABC, abstractmethod
+from typing import List, Optional, Sequence
+
+NAMESPACE_UUID = uuid.UUID(int=1984)
+
+
+class RecordManager(ABC):
+ """An abstract base class representing the interface for a record manager."""
+
+ def __init__(
+ self,
+ namespace: str,
+ ) -> None:
+ """Initialize the record manager.
+
+ Args:
+ namespace (str): The namespace for the record manager.
+ """
+ self.namespace = namespace
+
+ @abstractmethod
+ def create_schema(self) -> None:
+ """Create the database schema for the record manager."""
+
+ @abstractmethod
+ def get_time(self) -> float:
+ """Get the current server time as a high resolution timestamp!
+
+ It's important to get this from the server to ensure a monotonic clock,
+ otherwise there may be data loss when cleaning up old documents!
+
+ Returns:
+ The current server time as a float timestamp.
+ """
+
+ @abstractmethod
+ def update(
+ self,
+ keys: Sequence[str],
+ *,
+ group_ids: Optional[Sequence[Optional[str]]] = None,
+ time_at_least: Optional[float] = None,
+ ) -> None:
+ """Upsert records into the database.
+
+ Args:
+ keys: A list of record keys to upsert.
+ group_ids: A list of group IDs corresponding to the keys.
+ time_at_least: if provided, updates should only happen if the
+ updated_at field is at least this time.
+
+ Raises:
+ ValueError: If the length of keys doesn't match the length of group_ids.
+ """
+
+ @abstractmethod
+ def exists(self, keys: Sequence[str]) -> List[bool]:
+ """Check if the provided keys exist in the database.
+
+ Args:
+ keys: A list of keys to check.
+
+ Returns:
+ A list of boolean values indicating the existence of each key.
+ """
+
+ @abstractmethod
+ def list_keys(
+ self,
+ *,
+ before: Optional[float] = None,
+ after: Optional[float] = None,
+ group_ids: Optional[Sequence[str]] = None,
+ ) -> List[str]:
+ """List records in the database based on the provided filters.
+
+ Args:
+ before: Filter to list records updated before this time.
+ after: Filter to list records updated after this time.
+ group_ids: Filter to list records with specific group IDs.
+
+ Returns:
+ A list of keys for the matching records.
+ """
+
+ @abstractmethod
+ def delete_keys(self, keys: Sequence[str]) -> None:
+ """Delete specified records from the database.
+
+ Args:
+ keys: A list of keys to delete.
+ """
diff --git a/libs/langchain/tests/unit_tests/indexes/test_api.py b/libs/langchain/tests/unit_tests/indexes/test_api.py
new file mode 100644
index 000000000..3104084d0
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/indexes/test_api.py
@@ -0,0 +1,13 @@
+from langchain.indexes import __all__
+
+
+def test_all() -> None:
+ """Use to catch obvious breaking changes."""
+ assert __all__ == sorted(__all__, key=str.lower)
+ assert __all__ == [
+ "GraphIndexCreator",
+ "index",
+ "IndexingResult",
+ "SQLRecordManager",
+ "VectorstoreIndexCreator",
+ ]
diff --git a/libs/langchain/tests/unit_tests/indexes/test_hashed_document.py b/libs/langchain/tests/unit_tests/indexes/test_hashed_document.py
new file mode 100644
index 000000000..24bbd1152
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/indexes/test_hashed_document.py
@@ -0,0 +1,50 @@
+import pytest
+
+from langchain.indexes._api import _HashedDocument
+from langchain.schema import Document
+
+
+def test_hashed_document_hashing() -> None:
+ hashed_document = _HashedDocument(
+ uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
+ )
+ assert isinstance(hashed_document.hash_, str)
+
+
+def test_hashing_with_missing_content() -> None:
+ """Check that ValueError is raised if page_content is missing."""
+ with pytest.raises(ValueError):
+ _HashedDocument(
+ metadata={"key": "value"},
+ )
+
+
+def test_uid_auto_assigned_to_hash() -> None:
+ """Test uid is auto-assigned to the hashed_document hash."""
+ hashed_document = _HashedDocument(
+ page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
+ )
+ assert hashed_document.uid == hashed_document.hash_
+
+
+def test_to_document() -> None:
+ """Test to_document method."""
+ hashed_document = _HashedDocument(
+ page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
+ )
+ doc = hashed_document.to_document()
+ assert isinstance(doc, Document)
+ assert doc.page_content == "Lorem ipsum dolor sit amet"
+ assert doc.metadata == {"key": "value"}
+
+
+def test_from_document() -> None:
+ """Test from document class method."""
+ document = Document(
+ page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
+ )
+
+ hashed_document = _HashedDocument.from_document(document)
+ # hash should be deterministic
+ assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
+ assert hashed_document.uid == hashed_document.hash_
diff --git a/libs/langchain/tests/unit_tests/indexes/test_indexing.py b/libs/langchain/tests/unit_tests/indexes/test_indexing.py
new file mode 100644
index 000000000..70ebf4fcb
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/indexes/test_indexing.py
@@ -0,0 +1,474 @@
+from datetime import datetime
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Type
+from unittest.mock import patch
+
+import pytest
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.embeddings.base import Embeddings
+from langchain.indexes import index
+from langchain.indexes._sql_record_manager import SQLRecordManager
+from langchain.schema import Document
+from langchain.vectorstores.base import VST, VectorStore
+
+
+class ToyLoader(BaseLoader):
+ """Toy loader that always returns the same documents."""
+
+ def __init__(self, documents: Sequence[Document]) -> None:
+ """Initialize with the documents to return."""
+ self.documents = documents
+
+ def lazy_load(
+ self,
+ ) -> Iterator[Document]:
+ yield from self.documents
+
+ def load(self) -> List[Document]:
+ """Load the documents from the source."""
+ return list(self.lazy_load())
+
+
+class InMemoryVectorStore(VectorStore):
+ """In-memory implementation of VectorStore using a dictionary."""
+
+ def __init__(self) -> None:
+ """Vector store interface for testing things in memory."""
+ self.store: Dict[str, Document] = {}
+
+ def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
+ """Delete the given documents from the store using their IDs."""
+ if ids:
+ for _id in ids:
+ self.store.pop(_id, None)
+
+ def add_documents( # type: ignore
+ self,
+ documents: Sequence[Document],
+ *,
+ ids: Optional[Sequence[str]] = None,
+ **kwargs: Any,
+ ) -> None:
+ """Add the given documents to the store (insert behavior)."""
+ if ids and len(ids) != len(documents):
+ raise ValueError(
+ f"Expected {len(ids)} ids, got {len(documents)} documents."
+ )
+
+ if not ids:
+ raise NotImplementedError("This is not implemented yet.")
+
+ for _id, document in zip(ids, documents):
+ if _id in self.store:
+ raise ValueError(
+ f"Document with uid {_id} already exists in the store."
+ )
+ self.store[_id] = document
+
+ def add_texts(
+ self,
+ texts: Iterable[str],
+ metadatas: Optional[List[dict]] = None,
+ **kwargs: Any,
+ ) -> List[str]:
+ """Add the given texts to the store (insert behavior)."""
+ raise NotImplementedError()
+
+ @classmethod
+ def from_texts(
+ cls: Type[VST],
+ texts: List[str],
+ embedding: Embeddings,
+ metadatas: Optional[List[dict]] = None,
+ **kwargs: Any,
+ ) -> VST:
+ """Create a vector store from a list of texts."""
+ raise NotImplementedError()
+
+ def similarity_search(
+ self, query: str, k: int = 4, **kwargs: Any
+ ) -> List[Document]:
+ """Find the most similar documents to the given query."""
+ raise NotImplementedError()
+
+
+@pytest.fixture
+def record_manager() -> SQLRecordManager:
+ """Timestamped set fixture."""
+ record_manager = SQLRecordManager("kittens", db_url="sqlite:///:memory:")
+ record_manager.create_schema()
+ return record_manager
+
+
+@pytest.fixture
+def vector_store() -> InMemoryVectorStore:
+ """Vector store fixture."""
+ return InMemoryVectorStore()
+
+
+def test_indexing_same_content(
+ record_manager: SQLRecordManager, vector_store: InMemoryVectorStore
+) -> None:
+ """Indexing some content to confirm it gets added only once."""
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="This is a test document.",
+ ),
+ Document(
+ page_content="This is another document.",
+ ),
+ ]
+ )
+
+ assert index(loader, record_manager, vector_store) == {
+ "num_added": 2,
+ "num_deleted": 0,
+ "num_skipped": 0,
+ "num_updated": 0,
+ }
+
+ assert len(list(vector_store.store)) == 2
+
+ for _ in range(2):
+ # Run the indexing again
+ assert index(loader, record_manager, vector_store) == {
+ "num_added": 0,
+ "num_deleted": 0,
+ "num_skipped": 2,
+ "num_updated": 0,
+ }
+
+
+def test_index_simple_delete_full(
+ record_manager: SQLRecordManager, vector_store: InMemoryVectorStore
+) -> None:
+ """Test indexing with the full deletion strategy."""
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="This is a test document.",
+ ),
+ Document(
+ page_content="This is another document.",
+ ),
+ ]
+ )
+
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
+ ):
+ assert index(loader, record_manager, vector_store, delete_mode="full") == {
+ "num_added": 2,
+ "num_deleted": 0,
+ "num_skipped": 0,
+ "num_updated": 0,
+ }
+
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
+ ):
+ assert index(loader, record_manager, vector_store, delete_mode="full") == {
+ "num_added": 0,
+ "num_deleted": 0,
+ "num_skipped": 2,
+ "num_updated": 0,
+ }
+
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="mutated document 1",
+ ),
+ Document(
+ page_content="This is another document.", # <-- Same as original
+ ),
+ ]
+ )
+
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ assert index(loader, record_manager, vector_store, delete_mode="full") == {
+ "num_added": 1,
+ "num_deleted": 1,
+ "num_skipped": 1,
+ "num_updated": 0,
+ }
+
+ doc_texts = set(
+ # Ignoring type since doc should be in the store and not a None
+ vector_store.store.get(uid).page_content # type: ignore
+ for uid in vector_store.store
+ )
+ assert doc_texts == {"mutated document 1", "This is another document."}
+
+ # Attempt to index again verify that nothing changes
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ assert index(loader, record_manager, vector_store, delete_mode="full") == {
+ "num_added": 0,
+ "num_deleted": 0,
+ "num_skipped": 2,
+ "num_updated": 0,
+ }
+
+
+def test_incremental_fails_with_bad_source_ids(
+ record_manager: SQLRecordManager, vector_store: InMemoryVectorStore
+) -> None:
+ """Test indexing with incremental deletion strategy."""
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="This is a test document.",
+ metadata={"source": "1"},
+ ),
+ Document(
+ page_content="This is another document.",
+ metadata={"source": "2"},
+ ),
+ Document(
+ page_content="This is yet another document.",
+ metadata={"source": None},
+ ),
+ ]
+ )
+
+ with pytest.raises(ValueError):
+ # Should raise an error because no source id function was specified
+ index(loader, record_manager, vector_store, delete_mode="incremental")
+
+ with pytest.raises(ValueError):
+ # Should raise an error because one document has a source id of None
+ index(
+ loader,
+ record_manager,
+ vector_store,
+ delete_mode="incremental",
+ source_id_key="source",
+ )
+
+
+def test_no_delete(
+ record_manager: SQLRecordManager, vector_store: InMemoryVectorStore
+) -> None:
+ """Test indexing without a deletion strategy."""
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="This is a test document.",
+ metadata={"source": "1"},
+ ),
+ Document(
+ page_content="This is another document.",
+ metadata={"source": "2"},
+ ),
+ ]
+ )
+
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ assert index(
+ loader,
+ record_manager,
+ vector_store,
+ delete_mode=None,
+ source_id_key="source",
+ ) == {
+ "num_added": 2,
+ "num_deleted": 0,
+ "num_skipped": 0,
+ "num_updated": 0,
+ }
+
+ # If we add the same content twice it should be skipped
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ assert index(
+ loader,
+ record_manager,
+ vector_store,
+ delete_mode=None,
+ source_id_key="source",
+ ) == {
+ "num_added": 0,
+ "num_deleted": 0,
+ "num_skipped": 2,
+ "num_updated": 0,
+ }
+
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="mutated content",
+ metadata={"source": "1"},
+ ),
+ Document(
+ page_content="This is another document.",
+ metadata={"source": "2"},
+ ),
+ ]
+ )
+
+ # Should result in no updates or deletions!
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ assert index(
+ loader,
+ record_manager,
+ vector_store,
+ delete_mode=None,
+ source_id_key="source",
+ ) == {
+ "num_added": 1,
+ "num_deleted": 0,
+ "num_skipped": 1,
+ "num_updated": 0,
+ }
+
+
+def test_incremental_delete(
+ record_manager: SQLRecordManager, vector_store: InMemoryVectorStore
+) -> None:
+ """Test indexing with incremental deletion strategy."""
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="This is a test document.",
+ metadata={"source": "1"},
+ ),
+ Document(
+ page_content="This is another document.",
+ metadata={"source": "2"},
+ ),
+ ]
+ )
+
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ assert index(
+ loader,
+ record_manager,
+ vector_store,
+ delete_mode="incremental",
+ source_id_key="source",
+ ) == {
+ "num_added": 2,
+ "num_deleted": 0,
+ "num_skipped": 0,
+ "num_updated": 0,
+ }
+
+ doc_texts = set(
+ # Ignoring type since doc should be in the store and not a None
+ vector_store.store.get(uid).page_content # type: ignore
+ for uid in vector_store.store
+ )
+ assert doc_texts == {"This is another document.", "This is a test document."}
+
+ # Attempt to index again and verify that nothing changes
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ assert index(
+ loader,
+ record_manager,
+ vector_store,
+ delete_mode="incremental",
+ source_id_key="source",
+ ) == {
+ "num_added": 0,
+ "num_deleted": 0,
+ "num_skipped": 2,
+ "num_updated": 0,
+ }
+
+ # Create 2 documents from the same source all with mutated content
+ loader = ToyLoader(
+ documents=[
+ Document(
+ page_content="mutated document 1",
+ metadata={"source": "1"},
+ ),
+ Document(
+ page_content="mutated document 2",
+ metadata={"source": "1"},
+ ),
+ Document(
+ page_content="This is another document.", # <-- Same as original
+ metadata={"source": "2"},
+ ),
+ ]
+ )
+
+ # Index the mutated docs: source "1" gains two new docs and its stale doc is deleted
+ with patch.object(
+ record_manager, "get_time", return_value=datetime(2021, 1, 3).timestamp()
+ ):
+ assert index(
+ loader,
+ record_manager,
+ vector_store,
+ delete_mode="incremental",
+ source_id_key="source",
+ ) == {
+ "num_added": 2,
+ "num_deleted": 1,
+ "num_skipped": 1,
+ "num_updated": 0,
+ }
+
+ doc_texts = set(
+ # Ignoring type since doc should be in the store and not a None
+ vector_store.store.get(uid).page_content # type: ignore
+ for uid in vector_store.store
+ )
+ assert doc_texts == {
+ "mutated document 1",
+ "mutated document 2",
+ "This is another document.",
+ }
+
+
+def test_indexing_with_no_docs(
+ record_manager: SQLRecordManager, vector_store: VectorStore
+) -> None:
+ """Check edge case when loader returns no new docs."""
+ loader = ToyLoader(documents=[])
+
+ assert index(loader, record_manager, vector_store, delete_mode="full") == {
+ "num_added": 0,
+ "num_deleted": 0,
+ "num_skipped": 0,
+ "num_updated": 0,
+ }
+
+
+def test_deduplication(
+ record_manager: SQLRecordManager, vector_store: VectorStore
+) -> None:
+ """Check that duplicate documents are only indexed once."""
+ docs = [
+ Document(
+ page_content="This is a test document.",
+ metadata={"source": "1"},
+ ),
+ Document(
+ page_content="This is a test document.",
+ metadata={"source": "1"},
+ ),
+ ]
+
+ # Should result in only a single document being added
+ assert index(docs, record_manager, vector_store, delete_mode="full") == {
+ "num_added": 1,
+ "num_deleted": 0,
+ "num_skipped": 0,
+ "num_updated": 0,
+ }
diff --git a/libs/langchain/tests/unit_tests/indexes/test_sql_record_manager.py b/libs/langchain/tests/unit_tests/indexes/test_sql_record_manager.py
new file mode 100644
index 000000000..d7d95212a
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/indexes/test_sql_record_manager.py
@@ -0,0 +1,276 @@
+from datetime import datetime
+from unittest.mock import patch
+
+import pytest
+
+from langchain.indexes._sql_record_manager import SQLRecordManager, UpsertionRecord
+
+
+@pytest.fixture()
+def manager() -> SQLRecordManager:
+ """Initialize the test database and return the SQLRecordManager instance."""
+ # Initialize and return the SQLRecordManager instance
+ record_manager = SQLRecordManager("kittens", db_url="sqlite:///:memory:")
+ record_manager.create_schema()
+ return record_manager
+
+
+def test_update(manager: SQLRecordManager) -> None:
+ """Test updating records in the database."""
+ # no keys should be present in the set
+ read_keys = manager.list_keys()
+ assert read_keys == []
+ # Insert records
+ keys = ["key1", "key2", "key3"]
+ manager.update(keys)
+ # Retrieve the records
+ read_keys = manager.list_keys()
+ assert read_keys == ["key1", "key2", "key3"]
+
+
+def test_update_timestamp(manager: SQLRecordManager) -> None:
+ """Test updating records in the database."""
+ # Insert a key while the record manager's clock is patched to a fixed time
+ with patch.object(
+ manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
+ ):
+ manager.update(["key1"])
+
+ with manager._make_session() as session:
+ records = (
+ session.query(UpsertionRecord)
+ .filter(UpsertionRecord.namespace == manager.namespace)
+ .all() # type: ignore[attr-defined]
+ )
+
+ assert [
+ {
+ "key": record.key,
+ "namespace": record.namespace,
+ "updated_at": record.updated_at,
+ "group_id": record.group_id,
+ }
+ for record in records
+ ] == [
+ {
+ "group_id": None,
+ "key": "key1",
+ "namespace": "kittens",
+ "updated_at": datetime(2021, 1, 2, 0, 0).timestamp(),
+ }
+ ]
+
+ with patch.object(
+ manager, "get_time", return_value=datetime(2023, 1, 2).timestamp()
+ ):
+ manager.update(["key1"])
+
+ with manager._make_session() as session:
+ records = (
+ session.query(UpsertionRecord)
+ .filter(UpsertionRecord.namespace == manager.namespace)
+ .all() # type: ignore[attr-defined]
+ )
+
+ assert [
+ {
+ "key": record.key,
+ "namespace": record.namespace,
+ "updated_at": record.updated_at,
+ "group_id": record.group_id,
+ }
+ for record in records
+ ] == [
+ {
+ "group_id": None,
+ "key": "key1",
+ "namespace": "kittens",
+ "updated_at": datetime(2023, 1, 2, 0, 0).timestamp(),
+ }
+ ]
+
+ with patch.object(
+ manager, "get_time", return_value=datetime(2023, 2, 2).timestamp()
+ ):
+ manager.update(["key1"], group_ids=["group1"])
+
+ with manager._make_session() as session:
+ records = (
+ session.query(UpsertionRecord)
+ .filter(UpsertionRecord.namespace == manager.namespace)
+ .all() # type: ignore[attr-defined]
+ )
+
+ assert [
+ {
+ "key": record.key,
+ "namespace": record.namespace,
+ "updated_at": record.updated_at,
+ "group_id": record.group_id,
+ }
+ for record in records
+ ] == [
+ {
+ "group_id": "group1",
+ "key": "key1",
+ "namespace": "kittens",
+ "updated_at": datetime(2023, 2, 2, 0, 0).timestamp(),
+ }
+ ]
+
+
+def test_update_with_group_ids(manager: SQLRecordManager) -> None:
+ """Test updating records in the database."""
+ # no keys should be present in the set
+ read_keys = manager.list_keys()
+ assert read_keys == []
+ # Insert records
+ keys = ["key1", "key2", "key3"]
+ manager.update(keys)
+ # Retrieve the records
+ read_keys = manager.list_keys()
+ assert read_keys == ["key1", "key2", "key3"]
+
+
+def test_exists(manager: SQLRecordManager) -> None:
+ """Test checking if keys exist in the database."""
+ # Insert records
+ keys = ["key1", "key2", "key3"]
+ manager.update(keys)
+ # Check if the keys exist in the database
+ exists = manager.exists(keys)
+ assert len(exists) == len(keys)
+ assert exists == [True, True, True]
+
+ exists = manager.exists(["key1", "key4"])
+ assert len(exists) == 2
+ assert exists == [True, False]
+
+
+def test_list_keys(manager: SQLRecordManager) -> None:
+ """Test listing keys based on the provided date range."""
+ # Insert records
+ assert manager.list_keys() == []
+ with manager._make_session() as session:
+ # Add some keys with explicit updated_ats
+ session.add(
+ UpsertionRecord(
+ key="key1",
+ updated_at=datetime(2021, 1, 1).timestamp(),
+ namespace="kittens",
+ )
+ )
+ session.add(
+ UpsertionRecord(
+ key="key2",
+ updated_at=datetime(2022, 1, 1).timestamp(),
+ namespace="kittens",
+ )
+ )
+ session.add(
+ UpsertionRecord(
+ key="key3",
+ updated_at=datetime(2023, 1, 1).timestamp(),
+ namespace="kittens",
+ )
+ )
+ session.add(
+ UpsertionRecord(
+ key="key4",
+ group_id="group1",
+ updated_at=datetime(2024, 1, 1).timestamp(),
+ namespace="kittens",
+ )
+ )
+ # Insert keys from a different namespace, these should not be visible!
+ session.add(
+ UpsertionRecord(
+ key="key1",
+ updated_at=datetime(2021, 1, 1).timestamp(),
+ namespace="puppies",
+ )
+ )
+ session.add(
+ UpsertionRecord(
+ key="key5",
+ updated_at=datetime(2021, 1, 1).timestamp(),
+ namespace="puppies",
+ )
+ )
+ session.commit()
+
+ # Retrieve all keys
+ assert manager.list_keys() == ["key1", "key2", "key3", "key4"]
+
+ # Retrieve keys updated after a certain date
+ assert manager.list_keys(after=datetime(2022, 2, 1).timestamp()) == ["key3", "key4"]
+
+ # Retrieve keys updated before a certain date
+ assert manager.list_keys(before=datetime(2022, 2, 1).timestamp()) == [
+ "key1",
+ "key2",
+ ]
+
+ # Retrieve keys updated before a date that precedes every record
+ assert manager.list_keys(before=datetime(2019, 2, 1).timestamp()) == []
+
+ # Retrieve keys in a time range
+ assert manager.list_keys(
+ before=datetime(2022, 2, 1).timestamp(),
+ after=datetime(2021, 11, 1).timestamp(),
+ ) == ["key2"]
+
+ assert manager.list_keys(group_ids=["group1", "group2"]) == ["key4"]
+
+ # Test multiple filters
+ assert (
+ manager.list_keys(
+ group_ids=["group1", "group2"], before=datetime(2019, 1, 1).timestamp()
+ )
+ == []
+ )
+ assert manager.list_keys(
+ group_ids=["group1", "group2"], after=datetime(2019, 1, 1).timestamp()
+ ) == ["key4"]
+
+
+def test_namespace_is_used(manager: SQLRecordManager) -> None:
+ """Verify that namespace is taken into account for all operations."""
+ assert manager.namespace == "kittens"
+ with manager._make_session() as session:
+ # Add some keys in two different namespaces
+ session.add(UpsertionRecord(key="key1", namespace="kittens"))
+ session.add(UpsertionRecord(key="key2", namespace="kittens"))
+ session.add(UpsertionRecord(key="key1", namespace="puppies"))
+ session.add(UpsertionRecord(key="key3", namespace="puppies"))
+ session.commit()
+
+ assert manager.list_keys() == ["key1", "key2"]
+ manager.delete_keys(["key1"])
+ assert manager.list_keys() == ["key2"]
+ manager.update(["key3"], group_ids=["group3"])
+
+ with manager._make_session() as session:
+ results = session.query(UpsertionRecord).all()
+
+ assert sorted([(r.namespace, r.key, r.group_id) for r in results]) == [
+ ("kittens", "key2", None),
+ ("kittens", "key3", "group3"),
+ ("puppies", "key1", None),
+ ("puppies", "key3", None),
+ ]
+
+
+def test_delete_keys(manager: SQLRecordManager) -> None:
+ """Test deleting keys from the database."""
+ # Insert records
+ keys = ["key1", "key2", "key3"]
+ manager.update(keys)
+
+ # Delete some keys
+ keys_to_delete = ["key1", "key2"]
+ manager.delete_keys(keys_to_delete)
+
+ # Check if the deleted keys are no longer in the database
+ remaining_keys = manager.list_keys()
+ assert remaining_keys == ["key3"]
From 9e1dbd4b490d423b8ed4fc699975d91cef2e7cfc Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Wed, 23 Aug 2023 22:51:49 -0400
Subject: [PATCH 099/143] x
---
.../langchain/schema/runnable/_locals.py | 21 ++++++++++++++-----
.../langchain/schema/runnable/config.py | 11 +++++++---
2 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/_locals.py b/libs/langchain/langchain/schema/runnable/_locals.py
index 755a709fc..5b2f8e758 100644
--- a/libs/langchain/langchain/schema/runnable/_locals.py
+++ b/libs/langchain/langchain/schema/runnable/_locals.py
@@ -1,16 +1,27 @@
from __future__ import annotations
-from typing import Any, AsyncIterator, Dict, Iterator, Mapping, Optional, Union
-
-from langchain.callbacks.manager import (
- AsyncCallbackManagerForChainRun,
- CallbackManagerForChainRun,
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ AsyncIterator,
+ Dict,
+ Iterator,
+ Mapping,
+ Optional,
+ Union,
)
+
from langchain.load.serializable import Serializable
from langchain.schema.runnable.base import Input, Output, Runnable
from langchain.schema.runnable.config import RunnableConfig
from langchain.schema.runnable.passthrough import RunnablePassthrough
+if TYPE_CHECKING:
+ from langchain.callbacks.manager import (
+ AsyncCallbackManagerForChainRun,
+ CallbackManagerForChainRun,
+ )
+
class PutLocalVar(RunnablePassthrough):
key: Union[str, Mapping[str, str]]
diff --git a/libs/langchain/langchain/schema/runnable/config.py b/libs/langchain/langchain/schema/runnable/config.py
index ce4e11861..a431fb635 100644
--- a/libs/langchain/langchain/schema/runnable/config.py
+++ b/libs/langchain/langchain/schema/runnable/config.py
@@ -3,10 +3,11 @@ from __future__ import annotations
from concurrent.futures import Executor, ThreadPoolExecutor
from contextlib import contextmanager
from copy import deepcopy
-from typing import Any, Dict, Generator, List, Optional, TypedDict
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, TypedDict
-from langchain.callbacks.base import BaseCallbackManager, Callbacks
-from langchain.callbacks.manager import AsyncCallbackManager, CallbackManager
+if TYPE_CHECKING:
+ from langchain.callbacks.base import BaseCallbackManager, Callbacks
+ from langchain.callbacks.manager import AsyncCallbackManager, CallbackManager
class RunnableConfig(TypedDict, total=False):
@@ -87,6 +88,8 @@ def patch_config(
def get_callback_manager_for_config(config: RunnableConfig) -> CallbackManager:
+ from langchain.callbacks.manager import CallbackManager
+
return CallbackManager.configure(
inheritable_callbacks=config.get("callbacks"),
inheritable_tags=config.get("tags"),
@@ -97,6 +100,8 @@ def get_callback_manager_for_config(config: RunnableConfig) -> CallbackManager:
def get_async_callback_manager_for_config(
config: RunnableConfig,
) -> AsyncCallbackManager:
+ from langchain.callbacks.manager import AsyncCallbackManager
+
return AsyncCallbackManager.configure(
inheritable_callbacks=config.get("callbacks"),
inheritable_tags=config.get("tags"),
From 25f2c82ae81a4b5f2bfe722ef69f9b57222c5944 Mon Sep 17 00:00:00 2001
From: seamusp
Date: Wed, 23 Aug 2023 22:36:54 -0700
Subject: [PATCH 100/143] docs:misc fixes (#9671)
Improve internal consistency in LangChain documentation
- Change occurrences of eg and eg. to e.g.
- Fix headers containing unnecessary capital letters.
- Change instances of "few shot" to "few-shot".
- Add periods to end of sentences where missing.
- Minor spelling and grammar fixes.
---
docs/api_reference/conf.py | 2 +-
.../docs/get_started/quickstart.mdx | 8 ++--
.../prompt_templates/few_shot_examples.mdx | 2 +-
.../prompts/prompt_templates/index.mdx | 2 +-
.../prompts/prompt_templates/partial.mdx | 2 +-
.../prompt_templates/prompt_composition.mdx | 4 +-
.../guides/expression_language/cookbook.ipynb | 2 +-
.../integrations/callbacks/promptlayer.ipynb | 2 +-
.../integrations/document_loaders/git.ipynb | 2 +-
.../extras/integrations/providers/datadog.mdx | 2 +-
docs/extras/modules/callbacks/tags.mdx | 2 +-
.../custom_example_selector.md | 2 +-
.../connecting_to_a_feature_store.ipynb | 16 ++++----
.../custom_prompt_template.ipynb | 8 ++--
.../few_shot_examples_chat.ipynb | 10 ++---
.../prompt_templates/format_output.mdx | 33 ++++++++-------
.../prompts/prompt_templates/formats.mdx | 2 +-
.../prompt_serialization.ipynb | 16 ++++----
.../prompt_templates/prompts_pipelining.ipynb | 40 +++++--------------
.../modules/callbacks/get_started.mdx | 6 +--
.../modules/chains/popular/sqlite.mdx | 6 +--
.../document_transformers/get_started.mdx | 2 +-
.../model_io/models/chat/how_to/prompts.mdx | 2 +-
.../prompt_templates/few_shot_examples.mdx | 14 +++----
.../prompts/prompt_templates/partial.mdx | 4 +-
25 files changed, 85 insertions(+), 106 deletions(-)
diff --git a/docs/api_reference/conf.py b/docs/api_reference/conf.py
index 3f1772a6b..25d11a075 100644
--- a/docs/api_reference/conf.py
+++ b/docs/api_reference/conf.py
@@ -156,7 +156,7 @@ html_context = {
html_static_path = ["_static"]
# These paths are either relative to html_static_path
-# or fully qualified paths (eg. https://...)
+# or fully qualified paths (e.g. https://...)
html_css_files = [
"css/custom.css",
]
diff --git a/docs/docs_skeleton/docs/get_started/quickstart.mdx b/docs/docs_skeleton/docs/get_started/quickstart.mdx
index 8cf778c94..0db8c03ee 100644
--- a/docs/docs_skeleton/docs/get_started/quickstart.mdx
+++ b/docs/docs_skeleton/docs/get_started/quickstart.mdx
@@ -107,7 +107,7 @@ import PromptTemplateChatModel from "@snippets/get_started/quickstart/prompt_tem
However, the advantages of using these over raw string formatting are several.
-You can "partial" out variables - eg you can format only some of the variables at a time.
+You can "partial" out variables - e.g. you can format only some of the variables at a time.
You can compose them together, easily combining different templates into a single prompt.
For explanations of these functionalities, see the [section on prompts](/docs/modules/model_io/prompts) for more detail.
@@ -121,12 +121,12 @@ Let's take a look at this below:
ChatPromptTemplates can also include other things besides ChatMessageTemplates - see the [section on prompts](/docs/modules/model_io/prompts) for more detail.
-## Output Parsers
+## Output parsers
OutputParsers convert the raw output of an LLM into a format that can be used downstream.
There are few main type of OutputParsers, including:
-- Convert text from LLM -> structured information (eg JSON)
+- Convert text from LLM -> structured information (e.g. JSON)
- Convert a ChatMessage into just a string
- Convert the extra information returned from a call besides the message (like OpenAI function invocation) into a string.
@@ -149,7 +149,7 @@ import LLMChain from "@snippets/get_started/quickstart/llm_chain.mdx"
-## Next Steps
+## Next steps
This is it!
We've now gone over how to create the core building block of LangChain applications - the LLMChains.
diff --git a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx
index 3c5dfe3ec..ad02a01b7 100644
--- a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx
+++ b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx
@@ -1,6 +1,6 @@
# Few-shot prompt templates
-In this tutorial, we'll learn how to create a prompt template that uses few shot examples. A few shot prompt template can be constructed from either a set of examples, or from an Example Selector object.
+In this tutorial, we'll learn how to create a prompt template that uses few-shot examples. A few-shot prompt template can be constructed from either a set of examples, or from an Example Selector object.
import Example from "@snippets/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx"
diff --git a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/index.mdx b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/index.mdx
index 655577800..c64e73865 100644
--- a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/index.mdx
+++ b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/index.mdx
@@ -6,7 +6,7 @@ sidebar_position: 0
Prompt templates are pre-defined recipes for generating prompts for language models.
-A template may include instructions, few shot examples, and specific context and
+A template may include instructions, few-shot examples, and specific context and
questions appropriate for a given task.
LangChain provides tooling to create and work with prompt templates.
diff --git a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/partial.mdx b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/partial.mdx
index b76431dfc..ac2edea53 100644
--- a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/partial.mdx
+++ b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/partial.mdx
@@ -1,6 +1,6 @@
# Partial prompt templates
-Like other methods, it can make sense to "partial" a prompt template - eg pass in a subset of the required values, as to create a new prompt template which expects only the remaining subset of values.
+Like other methods, it can make sense to "partial" a prompt template - e.g. pass in a subset of the required values, so as to create a new prompt template which expects only the remaining subset of values.
LangChain supports this in two ways:
1. Partial formatting with string values.
diff --git a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/prompt_composition.mdx b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/prompt_composition.mdx
index 439e26ea3..c1b96c107 100644
--- a/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/prompt_composition.mdx
+++ b/docs/docs_skeleton/docs/modules/model_io/prompts/prompt_templates/prompt_composition.mdx
@@ -2,8 +2,8 @@
This notebook goes over how to compose multiple prompts together. This can be useful when you want to reuse parts of prompts. This can be done with a PipelinePrompt. A PipelinePrompt consists of two main parts:
-- Final prompt: This is the final prompt that is returned
-- Pipeline prompts: This is a list of tuples, consisting of a string name and a prompt template. Each prompt template will be formatted and then passed to future prompt templates as a variable with the same name.
+- Final prompt: The final prompt that is returned
+- Pipeline prompts: A list of tuples, consisting of a string name and a prompt template. Each prompt template will be formatted and then passed to future prompt templates as a variable with the same name.
import Example from "@snippets/modules/model_io/prompts/prompt_templates/prompt_composition.mdx"
diff --git a/docs/extras/guides/expression_language/cookbook.ipynb b/docs/extras/guides/expression_language/cookbook.ipynb
index d7aad4291..04b74164d 100644
--- a/docs/extras/guides/expression_language/cookbook.ipynb
+++ b/docs/extras/guides/expression_language/cookbook.ipynb
@@ -1318,7 +1318,7 @@
"source": [
"template = \"\"\"Write some python code to solve the user's problem. \n",
"\n",
- "Return only python code in Markdown format, eg:\n",
+ "Return only python code in Markdown format, e.g.:\n",
"\n",
"```python\n",
"....\n",
diff --git a/docs/extras/integrations/callbacks/promptlayer.ipynb b/docs/extras/integrations/callbacks/promptlayer.ipynb
index f6d7cd976..2f3b5b9f8 100644
--- a/docs/extras/integrations/callbacks/promptlayer.ipynb
+++ b/docs/extras/integrations/callbacks/promptlayer.ipynb
@@ -11,7 +11,7 @@
"\n",
"[PromptLayer](https://promptlayer.com) is a an LLM observability platform that lets you visualize requests, version prompts, and track usage. In this guide we will go over how to setup the `PromptLayerCallbackHandler`. \n",
"\n",
- "While PromptLayer does have LLMs that integrate directly with LangChain (eg [`PromptLayerOpenAI`](https://python.langchain.com/docs/integrations/llms/promptlayer_openai)), this callback is the recommended way to integrate PromptLayer with LangChain.\n",
+ "While PromptLayer does have LLMs that integrate directly with LangChain (e.g. [`PromptLayerOpenAI`](https://python.langchain.com/docs/integrations/llms/promptlayer_openai)), this callback is the recommended way to integrate PromptLayer with LangChain.\n",
"\n",
"See [our docs](https://docs.promptlayer.com/languages/langchain) for more information."
]
diff --git a/docs/extras/integrations/document_loaders/git.ipynb b/docs/extras/integrations/document_loaders/git.ipynb
index 54d5df439..47382c564 100644
--- a/docs/extras/integrations/document_loaders/git.ipynb
+++ b/docs/extras/integrations/document_loaders/git.ipynb
@@ -173,7 +173,7 @@
"source": [
"from langchain.document_loaders import GitLoader\n",
"\n",
- "# eg. loading only python files\n",
+ "# e.g. loading only python files\n",
"loader = GitLoader(\n",
" repo_path=\"./example_data/test_repo1/\",\n",
" file_filter=lambda file_path: file_path.endswith(\".py\"),\n",
diff --git a/docs/extras/integrations/providers/datadog.mdx b/docs/extras/integrations/providers/datadog.mdx
index 59bd069c5..aee4d5e24 100644
--- a/docs/extras/integrations/providers/datadog.mdx
+++ b/docs/extras/integrations/providers/datadog.mdx
@@ -52,7 +52,7 @@ Note that using `ddtrace-run` or `patch_all()` will also enable the `requests` a
from ddtrace import config, patch
# Note: be sure to configure the integration before calling ``patch()``!
-# eg. config.langchain["logs_enabled"] = True
+# e.g. config.langchain["logs_enabled"] = True
patch(langchain=True)
diff --git a/docs/extras/modules/callbacks/tags.mdx b/docs/extras/modules/callbacks/tags.mdx
index f8bcc42da..87d9b7beb 100644
--- a/docs/extras/modules/callbacks/tags.mdx
+++ b/docs/extras/modules/callbacks/tags.mdx
@@ -1,3 +1,3 @@
# Tags
-You can add tags to your callbacks by passing a `tags` argument to the `call()`/`run()`/`apply()` methods. This is useful for filtering your logs, eg. if you want to log all requests made to a specific LLMChain, you can add a tag, and then filter your logs by that tag. You can pass tags to both constructor and request callbacks, see the examples above for details. These tags are then passed to the `tags` argument of the "start" callback methods, ie. `on_llm_start`, `on_chat_model_start`, `on_chain_start`, `on_tool_start`.
+You can add tags to your callbacks by passing a `tags` argument to the `call()`/`run()`/`apply()` methods. This is useful for filtering your logs, e.g. if you want to log all requests made to a specific LLMChain, you can add a tag, and then filter your logs by that tag. You can pass tags to both constructor and request callbacks, see the examples above for details. These tags are then passed to the `tags` argument of the "start" callback methods, i.e. `on_llm_start`, `on_chat_model_start`, `on_chain_start`, `on_tool_start`.
diff --git a/docs/extras/modules/model_io/prompts/example_selectors/custom_example_selector.md b/docs/extras/modules/model_io/prompts/example_selectors/custom_example_selector.md
index 15f070a0f..d9bff1559 100644
--- a/docs/extras/modules/model_io/prompts/example_selectors/custom_example_selector.md
+++ b/docs/extras/modules/model_io/prompts/example_selectors/custom_example_selector.md
@@ -5,7 +5,7 @@ In this tutorial, we'll create a custom example selector that selects every alte
An `ExampleSelector` must implement two methods:
1. An `add_example` method which takes in an example and adds it into the ExampleSelector
-2. A `select_examples` method which takes in input variables (which are meant to be user input) and returns a list of examples to use in the few shot prompt.
+2. A `select_examples` method which takes in input variables (which are meant to be user input) and returns a list of examples to use in the few-shot prompt.
Let's implement a custom `ExampleSelector` that just selects two examples at random.
diff --git a/docs/extras/modules/model_io/prompts/prompt_templates/connecting_to_a_feature_store.ipynb b/docs/extras/modules/model_io/prompts/prompt_templates/connecting_to_a_feature_store.ipynb
index a549ce2bf..768c43e92 100644
--- a/docs/extras/modules/model_io/prompts/prompt_templates/connecting_to_a_feature_store.ipynb
+++ b/docs/extras/modules/model_io/prompts/prompt_templates/connecting_to_a_feature_store.ipynb
@@ -35,7 +35,7 @@
"source": [
"### Load Feast Store\n",
"\n",
- "Again, this should be set up according to the instructions in the Feast README"
+ "Again, this should be set up according to the instructions in the Feast README."
]
},
{
@@ -160,7 +160,7 @@
"source": [
"### Use in a chain\n",
"\n",
- "We can now use this in a chain, successfully creating a chain that achieves personalization backed by a feature store"
+ "We can now use this in a chain, successfully creating a chain that achieves personalization backed by a feature store."
]
},
{
@@ -243,7 +243,7 @@
"tags": []
},
"source": [
- "### Define and Load Features\n",
+ "### Define and load features\n",
"\n",
"We will use the user_transaction_counts Feature View from the [Tecton tutorial](https://docs.tecton.ai/docs/tutorials/tecton-fundamentals) as part of a Feature Service. For simplicity, we are only using a single Feature View; however, more sophisticated applications may require more feature views to retrieve the features needed for its prompt.\n",
"\n",
@@ -394,7 +394,7 @@
"source": [
"### Use in a chain\n",
"\n",
- "We can now use this in a chain, successfully creating a chain that achieves personalization backed by the Tecton Feature Platform"
+ "We can now use this in a chain, successfully creating a chain that achieves personalization backed by the Tecton Feature Platform."
]
},
{
@@ -460,7 +460,7 @@
"source": [
"## Featureform\n",
"\n",
- "Finally, we will use [Featureform](https://github.com/featureform/featureform) an open-source and enterprise-grade feature store to run the same example. Featureform allows you to work with your infrastructure like Spark or locally to define your feature transformations."
+ "Finally, we will use [Featureform](https://github.com/featureform/featureform), an open-source and enterprise-grade feature store, to run the same example. Featureform allows you to work with your infrastructure like Spark or locally to define your feature transformations."
]
},
{
@@ -564,7 +564,7 @@
"source": [
"### Use in a chain\n",
"\n",
- "We can now use this in a chain, successfully creating a chain that achieves personalization backed by the Featureform Feature Platform"
+ "We can now use this in a chain, successfully creating a chain that achieves personalization backed by the Featureform Feature Platform."
]
},
{
@@ -605,7 +605,7 @@
"source": [
"## AzureML Managed Feature Store\n",
"\n",
- "We will use [AzureML Managed Feature Store](https://learn.microsoft.com/en-us/azure/machine-learning/concept-what-is-managed-feature-store) to run the below example. "
+ "We will use [AzureML Managed Feature Store](https://learn.microsoft.com/en-us/azure/machine-learning/concept-what-is-managed-feature-store) to run the example below. "
]
},
{
@@ -768,7 +768,7 @@
"source": [
"### Use in a chain\n",
"\n",
- "We can now use this in a chain, successfully creating a chain that achieves personalization backed by the AzureML Managed Feature Store"
+ "We can now use this in a chain, successfully creating a chain that achieves personalization backed by the AzureML Managed Feature Store."
]
},
{
diff --git a/docs/extras/modules/model_io/prompts/prompt_templates/custom_prompt_template.ipynb b/docs/extras/modules/model_io/prompts/prompt_templates/custom_prompt_template.ipynb
index c5044265a..7c9141a59 100644
--- a/docs/extras/modules/model_io/prompts/prompt_templates/custom_prompt_template.ipynb
+++ b/docs/extras/modules/model_io/prompts/prompt_templates/custom_prompt_template.ipynb
@@ -11,9 +11,7 @@
"\n",
"## Why are custom prompt templates needed?\n",
"\n",
- "LangChain provides a set of default prompt templates that can be used to generate prompts for a variety of tasks. However, there may be cases where the default prompt templates do not meet your needs. For example, you may want to create a prompt template with specific dynamic instructions for your language model. In such cases, you can create a custom prompt template.\n",
- "\n",
- "Take a look at the current set of default prompt templates [here](/docs/modules/model_io/prompts/prompt_templates/)."
+ "LangChain provides a set of [default prompt templates](/docs/modules/model_io/prompts/prompt_templates/) that can be used to generate prompts for a variety of tasks. However, there may be cases where the default prompt templates do not meet your needs. For example, you may want to create a prompt template with specific dynamic instructions for your language model. In such cases, you can create a custom prompt template."
]
},
{
@@ -21,7 +19,7 @@
"id": "5d56ce86",
"metadata": {},
"source": [
- "## Creating a Custom Prompt Template\n",
+ "## Creating a custom prompt template\n",
"\n",
"There are essentially two distinct prompt templates available - string prompt templates and chat prompt templates. String prompt templates provides a simple prompt in string format, while chat prompt templates produces a more structured prompt to be used with a chat API.\n",
"\n",
@@ -29,7 +27,7 @@
"\n",
"To create a custom string prompt template, there are two requirements:\n",
"1. It has an input_variables attribute that exposes what input variables the prompt template expects.\n",
- "2. It exposes a format method that takes in keyword arguments corresponding to the expected input_variables and returns the formatted prompt.\n",
+ "2. It defines a format method that takes in keyword arguments corresponding to the expected input_variables and returns the formatted prompt.\n",
"\n",
"We will create a custom prompt template that takes in the function name as input and formats the prompt to provide the source code of the function. To achieve this, let's first create a function that will return the source code of a function given its name."
]
diff --git a/docs/extras/modules/model_io/prompts/prompt_templates/few_shot_examples_chat.ipynb b/docs/extras/modules/model_io/prompts/prompt_templates/few_shot_examples_chat.ipynb
index 6e88c0157..c6050f326 100644
--- a/docs/extras/modules/model_io/prompts/prompt_templates/few_shot_examples_chat.ipynb
+++ b/docs/extras/modules/model_io/prompts/prompt_templates/few_shot_examples_chat.ipynb
@@ -5,9 +5,9 @@
"id": "bb0735c0",
"metadata": {},
"source": [
- "# Few shot examples for chat models\n",
+ "# Few-shot examples for chat models\n",
"\n",
- "This notebook covers how to use few shot examples in chat models. There does not appear to be solid consensus on how best to do few shot prompting, and the optimal prompt compilation will likely vary by model. Because of this, we provide few-shot prompt templates like the [FewShotChatMessagePromptTemplate](https://api.python.langchain.com/en/latest/prompts/langchain.prompts.few_shot.FewShotChatMessagePromptTemplate.html) as a flexible starting point, and you can modify or replace them as you see fit.\n",
+ "This notebook covers how to use few-shot examples in chat models. There does not appear to be solid consensus on how best to do few-shot prompting, and the optimal prompt compilation will likely vary by model. Because of this, we provide few-shot prompt templates like the [FewShotChatMessagePromptTemplate](https://api.python.langchain.com/en/latest/prompts/langchain.prompts.few_shot.FewShotChatMessagePromptTemplate.html) as a flexible starting point, and you can modify or replace them as you see fit.\n",
"\n",
"The goal of few-shot prompt templates are to dynamically select examples based on an input, and then format the examples in a final prompt to provide for the model.\n",
"\n",
@@ -133,7 +133,7 @@
"source": [
"final_prompt = ChatPromptTemplate.from_messages(\n",
" [\n",
- " (\"system\", \"You are wonderous wizard of math.\"),\n",
+ " (\"system\", \"You are a wondrous wizard of math.\"),\n",
" few_shot_prompt,\n",
" (\"human\", \"{input}\"),\n",
" ]\n",
@@ -172,7 +172,7 @@
"id": "70ab7114-f07f-46be-8874-3705a25aba5f",
"metadata": {},
"source": [
- "## Dynamic Few-shot Prompting\n",
+ "## Dynamic few-shot prompting\n",
"\n",
"Sometimes you may want to condition which examples are shown based on the input. For this, you can replace the `examples` with an `example_selector`. The other components remain the same as above! To review, the dynamic few-shot prompt template would look like:\n",
"\n",
@@ -357,7 +357,7 @@
"source": [
"final_prompt = ChatPromptTemplate.from_messages(\n",
" [\n",
- " (\"system\", \"You are wonderous wizard of math.\"),\n",
+ " (\"system\", \"You are a wondrous wizard of math.\"),\n",
" few_shot_prompt,\n",
" (\"human\", \"{input}\"),\n",
" ]\n",
diff --git a/docs/extras/modules/model_io/prompts/prompt_templates/format_output.mdx b/docs/extras/modules/model_io/prompts/prompt_templates/format_output.mdx
index 1be52b93c..38904076e 100644
--- a/docs/extras/modules/model_io/prompts/prompt_templates/format_output.mdx
+++ b/docs/extras/modules/model_io/prompts/prompt_templates/format_output.mdx
@@ -1,6 +1,6 @@
# Format template output
-The output of the format method is available as string, list of messages and `ChatPromptValue`
+The output of the format method is available as a string, a list of messages, and a `ChatPromptValue`.
As string:
@@ -26,22 +26,7 @@ output_2 = chat_prompt.format_prompt(input_language="English", output_language="
assert output == output_2
```
-As `ChatPromptValue`
-
-
-```python
-chat_prompt.format_prompt(input_language="English", output_language="French", text="I love programming.")
-```
-
-
-
-```
- ChatPromptValue(messages=[SystemMessage(content='You are a helpful assistant that translates English to French.', additional_kwargs={}), HumanMessage(content='I love programming.', additional_kwargs={})])
-```
-
-
-
-As list of Message objects
+As list of Message objects:
```python
@@ -57,3 +42,17 @@ chat_prompt.format_prompt(input_language="English", output_language="French", te
+As `ChatPromptValue`:
+
+
+```python
+chat_prompt.format_prompt(input_language="English", output_language="French", text="I love programming.")
+```
+
+
+
+```
+ ChatPromptValue(messages=[SystemMessage(content='You are a helpful assistant that translates English to French.', additional_kwargs={}), HumanMessage(content='I love programming.', additional_kwargs={})])
+```
+
+
diff --git a/docs/extras/modules/model_io/prompts/prompt_templates/formats.mdx b/docs/extras/modules/model_io/prompts/prompt_templates/formats.mdx
index 6abe8cbca..05ab55eae 100644
--- a/docs/extras/modules/model_io/prompts/prompt_templates/formats.mdx
+++ b/docs/extras/modules/model_io/prompts/prompt_templates/formats.mdx
@@ -1,4 +1,4 @@
-# Template Formats
+# Template formats
`PromptTemplate` by default uses Python f-string as its template format. However, it can also use other formats like `jinja2`, specified through the `template_format` argument.
diff --git a/docs/extras/modules/model_io/prompts/prompt_templates/prompt_serialization.ipynb b/docs/extras/modules/model_io/prompts/prompt_templates/prompt_serialization.ipynb
index 5317fe1c2..28bc342fe 100644
--- a/docs/extras/modules/model_io/prompts/prompt_templates/prompt_serialization.ipynb
+++ b/docs/extras/modules/model_io/prompts/prompt_templates/prompt_serialization.ipynb
@@ -11,7 +11,7 @@
"\n",
"At a high level, the following design principles are applied to serialization:\n",
"\n",
- "1. Both JSON and YAML are supported. We want to support serialization methods that are human readable on disk, and YAML and JSON are two of the most popular methods for that. Note that this rule applies to prompts. For other assets, like Examples, different serialization methods may be supported.\n",
+ "1. Both JSON and YAML are supported. We want to support serialization methods that are human readable on disk, and YAML and JSON are two of the most popular methods for that. Note that this rule applies to prompts. For other assets, like examples, different serialization methods may be supported.\n",
"\n",
"2. We support specifying everything in one file, or storing different components (templates, examples, etc) in different files and referencing them. For some cases, storing everything in file makes the most sense, but for others it is preferrable to split up some of the assets (long templates, large examples, reusable components). LangChain supports both.\n",
"\n",
@@ -144,7 +144,7 @@
"id": "d788a83c",
"metadata": {},
"source": [
- "### Loading Template from a File\n",
+ "### Loading template from a file\n",
"This shows an example of storing the template in a separate file and then referencing it in the config. Notice that the key changes from `template` to `template_path`."
]
},
@@ -214,7 +214,7 @@
"source": [
"## FewShotPromptTemplate\n",
"\n",
- "This section covers examples for loading few shot prompt templates."
+ "This section covers examples for loading few-shot prompt templates."
]
},
{
@@ -282,7 +282,7 @@
"metadata": {},
"source": [
"### Loading from YAML\n",
- "This shows an example of loading a few shot example from YAML."
+ "This shows an example of loading a few-shot example from YAML."
]
},
{
@@ -419,7 +419,7 @@
"metadata": {},
"source": [
"### Loading from JSON\n",
- "This shows an example of loading a few shot example from JSON."
+ "This shows an example of loading a few-shot example from JSON."
]
},
{
@@ -484,7 +484,7 @@
"id": "9d23faf4",
"metadata": {},
"source": [
- "### Examples in the Config\n",
+ "### Examples in the config\n",
"This shows an example of referencing the examples directly in the config."
]
},
@@ -553,7 +553,7 @@
"id": "2e86139e",
"metadata": {},
"source": [
- "### Example Prompt from a File\n",
+ "### Example prompt from a file\n",
"This shows an example of loading the PromptTemplate that is used to format the examples from a separate file. Note that the key changes from `example_prompt` to `example_prompt_path`."
]
},
@@ -637,7 +637,7 @@
"id": "c6e3f9fe",
"metadata": {},
"source": [
- "## PromptTempalte with OutputParser\n",
+ "## PromptTemplate with OutputParser\n",
"This shows an example of loading a prompt along with an OutputParser from a file."
]
},
diff --git a/docs/extras/modules/model_io/prompts/prompt_templates/prompts_pipelining.ipynb b/docs/extras/modules/model_io/prompts/prompt_templates/prompts_pipelining.ipynb
index 594a404e5..74316f704 100644
--- a/docs/extras/modules/model_io/prompts/prompt_templates/prompts_pipelining.ipynb
+++ b/docs/extras/modules/model_io/prompts/prompt_templates/prompts_pipelining.ipynb
@@ -5,9 +5,9 @@
"id": "4de4e022",
"metadata": {},
"source": [
- "# Prompt Pipelining\n",
+ "# Prompt pipelining\n",
"\n",
- "The idea behind prompt pipelining is to expose a user friendly interface for composing different parts of prompts together. You can do this with either string prompts or chat prompts. Constructing prompts this way allows for easy reuse of components."
+ "The idea behind prompt pipelining is to provide a user-friendly interface for composing different parts of prompts together. You can do this with either string prompts or chat prompts. Constructing prompts this way allows for easy reuse of components."
]
},
{
@@ -15,26 +15,17 @@
"id": "c3190650",
"metadata": {},
"source": [
- "## String Prompt Pipelining\n",
+ "## String prompt pipelining\n",
"\n",
"When working with string prompts, each template is joined togther. You can work with either prompts directly or strings (the first element in the list needs to be a prompt)."
]
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "69b17f05",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/harrisonchase/.pyenv/versions/3.9.1/envs/langchain/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.12) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"from langchain.prompts import PromptTemplate"
]
@@ -160,7 +151,7 @@
"id": "4e4f6a8a",
"metadata": {},
"source": [
- "## Chat Prompt Pipelining"
+ "## Chat prompt pipelining"
]
},
{
@@ -173,19 +164,10 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "2a180f75",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/harrisonchase/.pyenv/versions/3.9.1/envs/langchain/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.10) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate\n",
"from langchain.schema import HumanMessage, AIMessage, SystemMessage"
@@ -214,8 +196,8 @@
"id": "30656ef8",
"metadata": {},
"source": [
- "You can then easily create a pipeline combining it with other messages OR message templates.\n",
- "Use a `Message` when there is no variables to be formatted, use a `MessageTemplate` when there are variables to be formatted. You can also use just a string -> note that this will automatically get inferred as a HumanMessagePromptTemplate."
+ "You can then easily create a pipeline combining it with other messages *or* message templates.\n",
+ "Use a `Message` when there are no variables to be formatted, and use a `MessageTemplate` when there are variables to be formatted. You can also use just a string (note: this will automatically get inferred as a HumanMessagePromptTemplate)."
]
},
{
@@ -270,7 +252,7 @@
"id": "850357c0",
"metadata": {},
"source": [
- "You can also use it in an LLMChain, just like before"
+ "You can also use it in an LLMChain, just like before."
]
},
{
diff --git a/docs/snippets/modules/callbacks/get_started.mdx b/docs/snippets/modules/callbacks/get_started.mdx
index 7e4974da9..9596e2a37 100644
--- a/docs/snippets/modules/callbacks/get_started.mdx
+++ b/docs/snippets/modules/callbacks/get_started.mdx
@@ -130,10 +130,10 @@ chain.run(number=2, callbacks=[handler])
The `callbacks` argument is available on most objects throughout the API (Chains, Models, Tools, Agents, etc.) in two different places:
-- **Constructor callbacks**: defined in the constructor, eg. `LLMChain(callbacks=[handler], tags=['a-tag'])`, which will be used for all calls made on that object, and will be scoped to that object only, eg. if you pass a handler to the `LLMChain` constructor, it will not be used by the Model attached to that chain.
-- **Request callbacks**: defined in the `run()`/`apply()` methods used for issuing a request, eg. `chain.run(input, callbacks=[handler])`, which will be used for that specific request only, and all sub-requests that it contains (eg. a call to an LLMChain triggers a call to a Model, which uses the same handler passed in the `call()` method).
+- **Constructor callbacks**: defined in the constructor, e.g. `LLMChain(callbacks=[handler], tags=['a-tag'])`, which will be used for all calls made on that object, and will be scoped to that object only, e.g. if you pass a handler to the `LLMChain` constructor, it will not be used by the Model attached to that chain.
+- **Request callbacks**: defined in the `run()`/`apply()` methods used for issuing a request, e.g. `chain.run(input, callbacks=[handler])`, which will be used for that specific request only, and all sub-requests that it contains (e.g. a call to an LLMChain triggers a call to a Model, which uses the same handler passed in the `call()` method).
-The `verbose` argument is available on most objects throughout the API (Chains, Models, Tools, Agents, etc.) as a constructor argument, eg. `LLMChain(verbose=True)`, and it is equivalent to passing a `ConsoleCallbackHandler` to the `callbacks` argument of that object and all child objects. This is useful for debugging, as it will log all events to the console.
+The `verbose` argument is available on most objects throughout the API (Chains, Models, Tools, Agents, etc.) as a constructor argument, e.g. `LLMChain(verbose=True)`, and it is equivalent to passing a `ConsoleCallbackHandler` to the `callbacks` argument of that object and all child objects. This is useful for debugging, as it will log all events to the console.
### When do you want to use each of these?
diff --git a/docs/snippets/modules/chains/popular/sqlite.mdx b/docs/snippets/modules/chains/popular/sqlite.mdx
index 01024a5e4..a709328d1 100644
--- a/docs/snippets/modules/chains/popular/sqlite.mdx
+++ b/docs/snippets/modules/chains/popular/sqlite.mdx
@@ -628,7 +628,7 @@ local_chain("How many customers are there?")
-Even this relatively large model will most likely fail to generate more complicated SQL by itself. However, you can log its inputs and outputs so that you can hand-correct them and use the corrected examples for few shot prompt examples later. In practice, you could log any executions of your chain that raise exceptions (as shown in the example below) or get direct user feedback in cases where the results are incorrect (but did not raise an exception).
+Even this relatively large model will most likely fail to generate more complicated SQL by itself. However, you can log its inputs and outputs so that you can hand-correct them and use the corrected examples for few-shot prompt examples later. In practice, you could log any executions of your chain that raise exceptions (as shown in the example below) or get direct user feedback in cases where the results are incorrect (but did not raise an exception).
```bash
@@ -878,7 +878,7 @@ YAML_EXAMPLES = """
"""
```
-Now that you have some examples (with manually corrected output SQL), you can do few shot prompt seeding the usual way:
+Now that you have some examples (with manually corrected output SQL), you can do few-shot prompt seeding the usual way:
```python
@@ -925,7 +925,7 @@ few_shot_prompt = FewShotPromptTemplate(
-The model should do better now with this few shot prompt, especially for inputs similar to the examples you have seeded it with.
+The model should do better now with this few-shot prompt, especially for inputs similar to the examples you have seeded it with.
```python
diff --git a/docs/snippets/modules/data_connection/document_transformers/get_started.mdx b/docs/snippets/modules/data_connection/document_transformers/get_started.mdx
index faafa4500..266aa29e5 100644
--- a/docs/snippets/modules/data_connection/document_transformers/get_started.mdx
+++ b/docs/snippets/modules/data_connection/document_transformers/get_started.mdx
@@ -4,7 +4,7 @@ In addition to controlling which characters you can split on, you can also contr
- `length_function`: how the length of chunks is calculated. Defaults to just counting number of characters, but it's pretty common to pass a token counter here.
- `chunk_size`: the maximum size of your chunks (as measured by the length function).
-- `chunk_overlap`: the maximum overlap between chunks. It can be nice to have some overlap to maintain some continuity between chunks (eg do a sliding window).
+- `chunk_overlap`: the maximum overlap between chunks. It can be nice to have some overlap to maintain some continuity between chunks (e.g. do a sliding window).
- `add_start_index`: whether to include the starting position of each chunk within the original document in the metadata.
diff --git a/docs/snippets/modules/model_io/models/chat/how_to/prompts.mdx b/docs/snippets/modules/model_io/models/chat/how_to/prompts.mdx
index b29643512..a02c7b4e2 100644
--- a/docs/snippets/modules/model_io/models/chat/how_to/prompts.mdx
+++ b/docs/snippets/modules/model_io/models/chat/how_to/prompts.mdx
@@ -34,7 +34,7 @@ chat(chat_prompt.format_prompt(input_language="English", output_language="French
-If you wanted to construct the MessagePromptTemplate more directly, you could create a PromptTemplate outside and then pass it in, eg:
+If you wanted to construct the MessagePromptTemplate more directly, you could create a PromptTemplate outside and then pass it in, e.g.:
```python
diff --git a/docs/snippets/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx b/docs/snippets/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx
index e14aafd2f..4e20db3e3 100644
--- a/docs/snippets/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx
+++ b/docs/snippets/modules/model_io/prompts/prompt_templates/few_shot_examples.mdx
@@ -1,13 +1,13 @@
### Use Case
-In this tutorial, we'll configure few shot examples for self-ask with search.
+In this tutorial, we'll configure few-shot examples for self-ask with search.
## Using an example set
### Create the example set
-To get started, create a list of few shot examples. Each example should be a dictionary with the keys being the input variables and the values being the values for those input variables.
+To get started, create a list of few-shot examples. Each example should be a dictionary with the keys being the input variables and the values being the values for those input variables.
```python
from langchain.prompts.few_shot import FewShotPromptTemplate
@@ -69,9 +69,9 @@ So the final answer is: No
]
```
-### Create a formatter for the few shot examples
+### Create a formatter for the few-shot examples
-Configure a formatter that will format the few shot examples into a string. This formatter should be a `PromptTemplate` object.
+Configure a formatter that will format the few-shot examples into a string. This formatter should be a `PromptTemplate` object.
```python
@@ -98,7 +98,7 @@ print(example_prompt.format(**examples[0]))
### Feed examples and formatter to `FewShotPromptTemplate`
-Finally, create a `FewShotPromptTemplate` object. This object takes in the few shot examples and the formatter for the few shot examples.
+Finally, create a `FewShotPromptTemplate` object. This object takes in the few-shot examples and the formatter for the few-shot examples.
```python
@@ -171,7 +171,7 @@ print(prompt.format(input="Who was the father of Mary Ball Washington?"))
We will reuse the example set and the formatter from the previous section. However, instead of feeding the examples directly into the `FewShotPromptTemplate` object, we will feed them into an `ExampleSelector` object.
-In this tutorial, we will use the `SemanticSimilarityExampleSelector` class. This class selects few shot examples based on their similarity to the input. It uses an embedding model to compute the similarity between the input and the few shot examples, as well as a vector store to perform the nearest neighbor search.
+In this tutorial, we will use the `SemanticSimilarityExampleSelector` class. This class selects few-shot examples based on their similarity to the input. It uses an embedding model to compute the similarity between the input and the few-shot examples, as well as a vector store to perform the nearest neighbor search.
```python
@@ -224,7 +224,7 @@ for example in selected_examples:
### Feed example selector into `FewShotPromptTemplate`
-Finally, create a `FewShotPromptTemplate` object. This object takes in the example selector and the formatter for the few shot examples.
+Finally, create a `FewShotPromptTemplate` object. This object takes in the example selector and the formatter for the few-shot examples.
```python
diff --git a/docs/snippets/modules/model_io/prompts/prompt_templates/partial.mdx b/docs/snippets/modules/model_io/prompts/prompt_templates/partial.mdx
index b791a220f..120340f37 100644
--- a/docs/snippets/modules/model_io/prompts/prompt_templates/partial.mdx
+++ b/docs/snippets/modules/model_io/prompts/prompt_templates/partial.mdx
@@ -1,4 +1,4 @@
-## Partial With Strings
+## Partial with strings
One common use case for wanting to partial a prompt template is if you get some of the variables before others. For example, suppose you have a prompt template that requires two variables, `foo` and `baz`. If you get the `foo` value early on in the chain, but the `baz` value later, it can be annoying to wait until you have both variables in the same place to pass them to the prompt template. Instead, you can partial the prompt template with the `foo` value, and then pass the partialed prompt template along and just use that. Below is an example of doing this:
@@ -40,7 +40,7 @@ print(prompt.format(bar="baz"))
-## Partial With Functions
+## Partial with functions
The other common use is to partial with a function. The use case for this is when you have a variable you know that you always want to fetch in a common way. A prime example of this is with date or time. Imagine you have a prompt which you always want to have the current date. You can't hard code it in the prompt, and passing it along with the other input variables is a bit annoying. In this case, it's very handy to be able to partial the prompt with a function that always returns the current date.
From 5990651070d82a1f09b6856a54ac6c2540dc48c3 Mon Sep 17 00:00:00 2001
From: Patrick Loeber <50772274+patrickloeber@users.noreply.github.com>
Date: Thu, 24 Aug 2023 07:51:19 +0200
Subject: [PATCH 101/143] Add new document_loader:
AssemblyAIAudioTranscriptLoader (#9667)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This PR adds a new document loader `AssemblyAIAudioTranscriptLoader`
that transcribes audio files with the [AssemblyAI
API](https://www.assemblyai.com) and loads the transcribed text into
documents.
- Add new document_loader with class `AssemblyAIAudioTranscriptLoader`
- Add optional dependency `assemblyai`
- Add unit tests (using a Mock client)
- Add docs notebook
This is the equivalent to the JS integration already available in
LangChain.js. See the [LangChain JS docs AssemblyAI
page](https://js.langchain.com/docs/modules/data_connection/document_loaders/integrations/web_loaders/assemblyai_audio_transcription).
At its simplest, you can use the loader to get a transcript back from an
audio file like this:
```python
from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader
loader = AssemblyAIAudioTranscriptLoader(file_path="./testfile.mp3")
docs = loader.load()
```
To use it, it needs the `assemblyai` python package installed, and the
environment variable `ASSEMBLYAI_API_KEY` set with your API key.
Alternatively, the API key can also be passed as an argument.
Twitter handles to shout out if so kindly 🙇
[@AssemblyAI](https://twitter.com/AssemblyAI) and
[@patloeber](https://twitter.com/patloeber)
---------
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev
---
.../document_loaders/assemblyai.ipynb | 224 ++++++++++++++++++
.../langchain/document_loaders/__init__.py | 2 +
.../langchain/document_loaders/assemblyai.py | 111 +++++++++
libs/langchain/poetry.lock | 104 +++++++-
libs/langchain/pyproject.toml | 2 +
.../document_loaders/test_assemblyai.py | 50 ++++
6 files changed, 491 insertions(+), 2 deletions(-)
create mode 100644 docs/extras/integrations/document_loaders/assemblyai.ipynb
create mode 100644 libs/langchain/langchain/document_loaders/assemblyai.py
create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py
diff --git a/docs/extras/integrations/document_loaders/assemblyai.ipynb b/docs/extras/integrations/document_loaders/assemblyai.ipynb
new file mode 100644
index 000000000..33fdef929
--- /dev/null
+++ b/docs/extras/integrations/document_loaders/assemblyai.ipynb
@@ -0,0 +1,224 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# AssemblyAI Audio Transcripts\n",
+ "\n",
+ "The `AssemblyAIAudioTranscriptLoader` transcribes audio files with the [AssemblyAI API](https://www.assemblyai.com) and loads the transcribed text into documents.\n",
+ "\n",
+ "To use it, you should have the `assemblyai` python package installed, and the\n",
+ "environment variable `ASSEMBLYAI_API_KEY` set with your API key. Alternatively, the API key can also be passed as an argument.\n",
+ "\n",
+ "More info about AssemblyAI:\n",
+ "\n",
+ "- [Website](https://www.assemblyai.com/)\n",
+ "- [Get a Free API key](https://www.assemblyai.com/dashboard/signup)\n",
+ "- [AssemblyAI API Docs](https://www.assemblyai.com/docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Installation\n",
+ "\n",
+ "First, you need to install the `assemblyai` python package.\n",
+ "\n",
+ "You can find more info about it inside the [assemblyai-python-sdk GitHub repo](https://github.com/AssemblyAI/assemblyai-python-sdk)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install assemblyai"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Example\n",
+ "\n",
+ "The `AssemblyAIAudioTranscriptLoader` needs at least the `file_path` argument. Audio files can be specified as a URL or a local file path."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader\n",
+ "\n",
+ "audio_file = \"https://storage.googleapis.com/aai-docs-samples/nbc.mp3\"\n",
+ "# or a local file path: audio_file = \"./nbc.mp3\"\n",
+ "\n",
+ "loader = AssemblyAIAudioTranscriptLoader(file_path=audio_file)\n",
+ "\n",
+ "docs = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note: Calling `loader.load()` blocks until the transcription is finished."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The transcribed text is available in the `page_content`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs[0].page_content"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "\"Load time, a new president and new congressional makeup. Same old ...\"\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `metadata` contains the full JSON response with more meta information:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs[0].metadata"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "{'language_code': <LanguageCode.en_us: 'en_us'>,\n",
+ " 'audio_url': 'https://storage.googleapis.com/aai-docs-samples/nbc.mp3',\n",
+ " 'punctuate': True,\n",
+ " 'format_text': True,\n",
+ " ...\n",
+ "}\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Transcript Formats\n",
+ "\n",
+ "You can specify the `transcript_format` argument for different formats.\n",
+ "\n",
+ "Depending on the format, one or more documents are returned. These are the different `TranscriptFormat` options:\n",
+ "\n",
+ "- `TEXT`: One document with the transcription text\n",
+ "- `SENTENCES`: Multiple documents, splits the transcription by each sentence\n",
+ "- `PARAGRAPHS`: Multiple documents, splits the transcription by each paragraph\n",
+ "- `SUBTITLES_SRT`: One document with the transcript exported in SRT subtitles format\n",
+ "- `SUBTITLES_VTT`: One document with the transcript exported in VTT subtitles format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders.assemblyai import (\n",
+ " AssemblyAIAudioTranscriptLoader,\n",
+ " TranscriptFormat,\n",
+ ")\n",
+ "\n",
+ "loader = AssemblyAIAudioTranscriptLoader(\n",
+ " file_path=\"./your_file.mp3\",\n",
+ " transcript_format=TranscriptFormat.SENTENCES,\n",
+ ")\n",
+ "\n",
+ "docs = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Transcription Config\n",
+ "\n",
+ "You can also specify the `config` argument to use different audio intelligence models.\n",
+ "\n",
+ "Visit the [AssemblyAI API Documentation](https://www.assemblyai.com/docs) to get an overview of all available models!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import assemblyai as aai\n",
+ "\n",
+ "config = aai.TranscriptionConfig(speaker_labels=True,\n",
+ " auto_chapters=True,\n",
+ " entity_detection=True\n",
+ ")\n",
+ "\n",
+ "loader = AssemblyAIAudioTranscriptLoader(\n",
+ " file_path=\"./your_file.mp3\",\n",
+ " config=config\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Pass the API Key as an argument\n",
+ "\n",
+ "Besides setting the API key as the environment variable `ASSEMBLYAI_API_KEY`, it is also possible to pass it as an argument."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = AssemblyAIAudioTranscriptLoader(\n",
+ " file_path=\"./your_file.mp3\",\n",
+ " api_key=\"YOUR_KEY\"\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py
index 30f69659c..ab9d37191 100644
--- a/libs/langchain/langchain/document_loaders/__init__.py
+++ b/libs/langchain/langchain/document_loaders/__init__.py
@@ -31,6 +31,7 @@ from langchain.document_loaders.airtable import AirtableLoader
from langchain.document_loaders.apify_dataset import ApifyDatasetLoader
from langchain.document_loaders.arcgis_loader import ArcGISLoader
from langchain.document_loaders.arxiv import ArxivLoader
+from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader
from langchain.document_loaders.async_html import AsyncHtmlLoader
from langchain.document_loaders.azlyrics import AZLyricsLoader
from langchain.document_loaders.azure_blob_storage_container import (
@@ -219,6 +220,7 @@ __all__ = [
"ApifyDatasetLoader",
"ArcGISLoader",
"ArxivLoader",
+ "AssemblyAIAudioTranscriptLoader",
"AsyncHtmlLoader",
"AzureBlobStorageContainerLoader",
"AzureBlobStorageFileLoader",
diff --git a/libs/langchain/langchain/document_loaders/assemblyai.py b/libs/langchain/langchain/document_loaders/assemblyai.py
new file mode 100644
index 000000000..d7b7ecb9b
--- /dev/null
+++ b/libs/langchain/langchain/document_loaders/assemblyai.py
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+from enum import Enum
+from typing import TYPE_CHECKING, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+if TYPE_CHECKING:
+ import assemblyai
+
+
+class TranscriptFormat(Enum):
+ """Transcript format to use for the document loader."""
+
+ TEXT = "text"
+ """One document with the transcription text"""
+ SENTENCES = "sentences"
+ """Multiple documents, splits the transcription by each sentence"""
+ PARAGRAPHS = "paragraphs"
+ """Multiple documents, splits the transcription by each paragraph"""
+ SUBTITLES_SRT = "subtitles_srt"
+ """One document with the transcript exported in SRT subtitles format"""
+ SUBTITLES_VTT = "subtitles_vtt"
+ """One document with the transcript exported in VTT subtitles format"""
+
+
+class AssemblyAIAudioTranscriptLoader(BaseLoader):
+ """
+ Loader for AssemblyAI audio transcripts.
+
+ It uses the AssemblyAI API to transcribe audio files
+ and loads the transcribed text into one or more Documents,
+ depending on the specified format.
+
+ To use, you should have the ``assemblyai`` python package installed, and the
+ environment variable ``ASSEMBLYAI_API_KEY`` set with your API key.
+ Alternatively, the API key can also be passed as an argument.
+
+ Audio files can be specified via a URL or a local file path.
+ """
+
+ def __init__(
+ self,
+ file_path: str,
+ *,
+ transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
+ config: Optional[assemblyai.TranscriptionConfig] = None,
+ api_key: Optional[str] = None,
+ ):
+ """
+ Initializes the AssemblyAI AudioTranscriptLoader.
+
+ Args:
+ file_path: A URL or a local file path.
+ transcript_format: Transcript format to use.
+ See class ``TranscriptFormat`` for more info.
+ config: Transcription options and features. If ``None`` is given,
+ the Transcriber's default configuration will be used.
+ api_key: AssemblyAI API key.
+ """
+ try:
+ import assemblyai
+ except ImportError:
+ raise ImportError(
+ "Could not import assemblyai python package. "
+ "Please install it with `pip install assemblyai`."
+ )
+ if api_key is not None:
+ assemblyai.settings.api_key = api_key
+
+ self.file_path = file_path
+ self.transcript_format = transcript_format
+ self.transcriber = assemblyai.Transcriber(config=config)
+
+ def load(self) -> List[Document]:
+ """Transcribes the audio file and loads the transcript into documents.
+
+ It uses the AssemblyAI API to transcribe the audio file and blocks until
+ the transcription is finished.
+ """
+ transcript = self.transcriber.transcribe(self.file_path)
+ # This will raise a ValueError if no API key is set.
+
+ if transcript.error:
+ raise ValueError(f"Could not transcribe file: {transcript.error}")
+
+ if self.transcript_format == TranscriptFormat.TEXT:
+ return [
+ Document(
+ page_content=transcript.text, metadata=transcript.json_response
+ )
+ ]
+ elif self.transcript_format == TranscriptFormat.SENTENCES:
+ sentences = transcript.get_sentences()
+ return [
+ Document(page_content=s.text, metadata=s.dict(exclude={"text"}))
+ for s in sentences
+ ]
+ elif self.transcript_format == TranscriptFormat.PARAGRAPHS:
+ paragraphs = transcript.get_paragraphs()
+ return [
+ Document(page_content=p.text, metadata=p.dict(exclude={"text"}))
+ for p in paragraphs
+ ]
+ elif self.transcript_format == TranscriptFormat.SUBTITLES_SRT:
+ return [Document(page_content=transcript.export_subtitles_srt())]
+ elif self.transcript_format == TranscriptFormat.SUBTITLES_VTT:
+ return [Document(page_content=transcript.export_subtitles_vtt())]
+ else:
+ raise ValueError("Unknown transcript format.")
diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock
index 3ea9c4734..5b10733d9 100644
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
@@ -436,6 +436,26 @@ files = [
[package.dependencies]
feedparser = "*"
+[[package]]
+name = "assemblyai"
+version = "0.17.0"
+description = "AssemblyAI Python SDK"
+optional = true
+python-versions = ">=3.8"
+files = [
+ {file = "assemblyai-0.17.0-py3-none-any.whl", hash = "sha256:3bad8cc7545b5b831f243f1b2f01bc4cc0e8aad78babf44c8008f2293c540e36"},
+ {file = "assemblyai-0.17.0.tar.gz", hash = "sha256:6d5bbfbbaa626ed021c3d3dec0ca52b3ebf6e6ef277ac76a7a6aed52182d531e"},
+]
+
+[package.dependencies]
+httpx = ">=0.19.0"
+pydantic = ">=1.7.0,<1.10.7 || >1.10.7"
+typing-extensions = ">=3.7"
+websockets = ">=11.0"
+
+[package.extras]
+extras = ["pyaudio (>=0.2.13)"]
+
[[package]]
name = "asttokens"
version = "2.2.1"
@@ -3522,6 +3542,7 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
+ {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@@ -9857,6 +9878,85 @@ docs = ["Sphinx (>=6.0)", "sphinx-rtd-theme (>=1.1.0)"]
optional = ["python-socks", "wsaccel"]
test = ["websockets"]
+[[package]]
+name = "websockets"
+version = "11.0.3"
+description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
+optional = true
+python-versions = ">=3.7"
+files = [
+ {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac"},
+ {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d"},
+ {file = "websockets-11.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f"},
+ {file = "websockets-11.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564"},
+ {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11"},
+ {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca"},
+ {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54"},
+ {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4"},
+ {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526"},
+ {file = "websockets-11.0.3-cp310-cp310-win32.whl", hash = "sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69"},
+ {file = "websockets-11.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f"},
+ {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb"},
+ {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288"},
+ {file = "websockets-11.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d"},
+ {file = "websockets-11.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3"},
+ {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b"},
+ {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6"},
+ {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97"},
+ {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf"},
+ {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd"},
+ {file = "websockets-11.0.3-cp311-cp311-win32.whl", hash = "sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c"},
+ {file = "websockets-11.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8"},
+ {file = "websockets-11.0.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152"},
+ {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f"},
+ {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b"},
+ {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb"},
+ {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007"},
+ {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0"},
+ {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af"},
+ {file = "websockets-11.0.3-cp37-cp37m-win32.whl", hash = "sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f"},
+ {file = "websockets-11.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de"},
+ {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0"},
+ {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae"},
+ {file = "websockets-11.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99"},
+ {file = "websockets-11.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa"},
+ {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86"},
+ {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c"},
+ {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0"},
+ {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e"},
+ {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788"},
+ {file = "websockets-11.0.3-cp38-cp38-win32.whl", hash = "sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74"},
+ {file = "websockets-11.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f"},
+ {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8"},
+ {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd"},
+ {file = "websockets-11.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016"},
+ {file = "websockets-11.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61"},
+ {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b"},
+ {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd"},
+ {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7"},
+ {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1"},
+ {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311"},
+ {file = "websockets-11.0.3-cp39-cp39-win32.whl", hash = "sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128"},
+ {file = "websockets-11.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e"},
+ {file = "websockets-11.0.3-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf"},
+ {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5"},
+ {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998"},
+ {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b"},
+ {file = "websockets-11.0.3-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb"},
+ {file = "websockets-11.0.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20"},
+ {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931"},
+ {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9"},
+ {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280"},
+ {file = "websockets-11.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b"},
+ {file = "websockets-11.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82"},
+ {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c"},
+ {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d"},
+ {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4"},
+ {file = "websockets-11.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602"},
+ {file = "websockets-11.0.3-py3-none-any.whl", hash = "sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6"},
+ {file = "websockets-11.0.3.tar.gz", hash = "sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016"},
+]
+
[[package]]
name = "werkzeug"
version = "2.3.7"
@@ -10338,7 +10438,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
-extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
+extended-testing = ["amazon-textract-caller", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
javascript = ["esprima"]
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
@@ -10348,4 +10448,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
-content-hash = "83280a03c352011c2a51081a29aca67bb5c4c23054ad1b7be94f89d6ce52460b"
+content-hash = "fd56d0cf338f6efea449244f3e9e719ca6872dd4b3e136ccd67fd82912912cc2"
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index 296cbae35..60d8afb5a 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -127,6 +127,7 @@ xata = {version = "^1.0.0a7", optional = true}
xmltodict = {version = "^0.13.0", optional = true}
google-api-core = {version = "^2.11.1", optional = true}
markdownify = {version = "^0.11.6", optional = true}
+assemblyai = {version = "^0.17.0", optional = true}
[tool.poetry.group.test.dependencies]
@@ -299,6 +300,7 @@ all = [
# merge-conflicts
extended_testing = [
"amazon-textract-caller",
+ "assemblyai",
"beautifulsoup4",
"bibtexparser",
"cassio",
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py b/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py
new file mode 100644
index 000000000..a9b6112e7
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py
@@ -0,0 +1,50 @@
+import pytest
+from pytest_mock import MockerFixture
+
+from langchain.document_loaders.assemblyai import (
+ AssemblyAIAudioTranscriptLoader,
+ TranscriptFormat,
+)
+
+
+@pytest.mark.requires("assemblyai")
+def test_initialization() -> None:
+ loader = AssemblyAIAudioTranscriptLoader(
+ file_path="./testfile.mp3", api_key="api_key"
+ )
+ assert loader.file_path == "./testfile.mp3"
+ assert loader.transcript_format == TranscriptFormat.TEXT
+
+
+@pytest.mark.requires("assemblyai")
+def test_load(mocker: MockerFixture) -> None:
+ mocker.patch(
+ "assemblyai.Transcriber.transcribe",
+ return_value=mocker.MagicMock(
+ text="Test transcription text", json_response={"id": "1"}, error=None
+ ),
+ )
+
+ loader = AssemblyAIAudioTranscriptLoader(
+ file_path="./testfile.mp3", api_key="api_key"
+ )
+ docs = loader.load()
+ assert len(docs) == 1
+ assert docs[0].page_content == "Test transcription text"
+ assert docs[0].metadata == {"id": "1"}
+
+
+@pytest.mark.requires("assemblyai")
+def test_transcription_error(mocker: MockerFixture) -> None:
+ mocker.patch(
+ "assemblyai.Transcriber.transcribe",
+ return_value=mocker.MagicMock(error="Test error"),
+ )
+
+ loader = AssemblyAIAudioTranscriptLoader(
+ file_path="./testfile.mp3", api_key="api_key"
+ )
+
+ expected_error = "Could not transcribe file: Test error"
+ with pytest.raises(ValueError, match=expected_error):
+ loader.load()
From d0ff0db69829bb2324e4cf067b10a98bce5acf64 Mon Sep 17 00:00:00 2001
From: Kim Minjong
Date: Thu, 24 Aug 2023 14:58:14 +0900
Subject: [PATCH 102/143] Update ChatOpenAI._stream to respect finish_reason
(#9672)
Currently, ChatOpenAI._stream does not propagate finish_reason into
generation_info. This change makes it do so.
Same patch as https://github.com/langchain-ai/langchain/pull/9431 , but
also applies to _stream.
---
libs/langchain/langchain/chat_models/openai.py | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/libs/langchain/langchain/chat_models/openai.py b/libs/langchain/langchain/chat_models/openai.py
index c2c7880ed..7cb1947cf 100644
--- a/libs/langchain/langchain/chat_models/openai.py
+++ b/libs/langchain/langchain/chat_models/openai.py
@@ -307,10 +307,16 @@ class ChatOpenAI(BaseChatModel):
):
if len(chunk["choices"]) == 0:
continue
- delta = chunk["choices"][0]["delta"]
- chunk = _convert_delta_to_message_chunk(delta, default_chunk_class)
+ choice = chunk["choices"][0]
+ chunk = _convert_delta_to_message_chunk(
+ choice["delta"], default_chunk_class
+ )
+ finish_reason = choice.get("finish_reason")
+ generation_info = (
+ dict(finish_reason=finish_reason) if finish_reason is not None else None
+ )
default_chunk_class = chunk.__class__
- yield ChatGenerationChunk(message=chunk)
+ yield ChatGenerationChunk(message=chunk, generation_info=generation_info)
if run_manager:
run_manager.on_llm_new_token(chunk.content)
From c19888c12c826a2e966a5138284821fab27e1586 Mon Sep 17 00:00:00 2001
From: Leonid Ganeline
Date: Wed, 23 Aug 2023 23:17:05 -0700
Subject: [PATCH 103/143] :hourglass_flowing_sand: docstrings: `vectorstores`
consistency (#9349)
:hourglass_flowing_sand:
- updated the top-level descriptions to a consistent format;
- changed several `ValueError` to `ImportError` in the import cases;
- renamed several internal functions from "name" to "_name", so that
these functions are no longer shown on the top-level API Reference
page (the lists of classes/functions)
---
.../langchain/vectorstores/__init__.py | 28 +++++++++----------
.../vectorstores/_pgvector_data_models.py | 4 +++
.../vectorstores/alibabacloud_opensearch.py | 6 ++--
.../langchain/vectorstores/analyticdb.py | 3 +-
.../langchain/langchain/vectorstores/annoy.py | 5 ++--
.../langchain/langchain/vectorstores/atlas.py | 7 +++--
.../langchain/langchain/vectorstores/awadb.py | 5 ++--
.../langchain/vectorstores/azuresearch.py | 5 ++--
.../langchain/vectorstores/bageldb.py | 5 ++--
libs/langchain/langchain/vectorstores/base.py | 6 ++--
.../langchain/vectorstores/cassandra.py | 4 +--
.../langchain/vectorstores/chroma.py | 7 ++---
.../langchain/vectorstores/clarifai.py | 4 +--
.../langchain/vectorstores/clickhouse.py | 8 ++----
.../langchain/vectorstores/dashvector.py | 3 +-
.../langchain/vectorstores/deeplake.py | 9 +++---
.../langchain/langchain/vectorstores/dingo.py | 3 +-
.../langchain/vectorstores/docarray/base.py | 4 ++-
.../langchain/vectorstores/docarray/hnsw.py | 3 +-
.../vectorstores/docarray/in_memory.py | 2 +-
.../vectorstores/elastic_vector_search.py | 11 ++++----
.../langchain/vectorstores/elasticsearch.py | 13 ++++++---
.../langchain/langchain/vectorstores/faiss.py | 3 +-
.../langchain/vectorstores/hologres.py | 5 ++--
.../langchain/vectorstores/lancedb.py | 5 ++--
.../langchain/langchain/vectorstores/marqo.py | 3 +-
.../langchain/vectorstores/matching_engine.py | 3 +-
.../langchain/vectorstores/meilisearch.py | 3 +-
.../langchain/vectorstores/milvus.py | 10 +++----
.../langchain/vectorstores/mongodb_atlas.py | 2 +-
.../langchain/vectorstores/myscale.py | 11 ++++----
.../vectorstores/opensearch_vector_search.py | 11 ++++----
.../langchain/vectorstores/pgembedding.py | 12 +++++---
.../langchain/vectorstores/pgvector.py | 5 ++--
.../langchain/vectorstores/pinecone.py | 3 +-
.../langchain/vectorstores/qdrant.py | 7 ++---
.../langchain/langchain/vectorstores/redis.py | 8 ++----
.../langchain/vectorstores/rocksetdb.py | 3 +-
.../langchain/langchain/vectorstores/scann.py | 5 ++--
.../langchain/vectorstores/singlestoredb.py | 5 +---
.../langchain/vectorstores/sklearn.py | 10 +++----
.../langchain/vectorstores/starrocks.py | 6 ++--
.../langchain/vectorstores/supabase.py | 6 ++--
libs/langchain/langchain/vectorstores/tair.py | 3 +-
.../langchain/vectorstores/tigris.py | 6 ++--
.../langchain/vectorstores/typesense.py | 5 ++--
.../langchain/vectorstores/usearch.py | 4 +--
.../langchain/vectorstores/vectara.py | 5 ++--
.../langchain/vectorstores/weaviate.py | 5 ++--
libs/langchain/langchain/vectorstores/xata.py | 8 +++---
libs/langchain/langchain/vectorstores/zep.py | 9 +++---
.../langchain/vectorstores/zilliz.py | 4 +--
52 files changed, 150 insertions(+), 170 deletions(-)
diff --git a/libs/langchain/langchain/vectorstores/__init__.py b/libs/langchain/langchain/vectorstores/__init__.py
index d815c5a65..d8e3664e0 100644
--- a/libs/langchain/langchain/vectorstores/__init__.py
+++ b/libs/langchain/langchain/vectorstores/__init__.py
@@ -78,60 +78,60 @@ __all__ = [
"AlibabaCloudOpenSearchSettings",
"AnalyticDB",
"Annoy",
+ "Annoy",
+ "AtlasDB",
"AtlasDB",
"AwaDB",
"AzureSearch",
"Bagel",
"Cassandra",
"Chroma",
+ "Chroma",
+ "Clarifai",
"Clickhouse",
"ClickhouseSettings",
"DashVector",
"DeepLake",
+ "DeepLake",
"Dingo",
"DocArrayHnswSearch",
"DocArrayInMemorySearch",
- "ElasticVectorSearch",
"ElasticKnnSearch",
+ "ElasticVectorSearch",
"ElasticsearchStore",
"Epsilla",
"FAISS",
- "PGEmbedding",
"Hologres",
"LanceDB",
- "MatchingEngine",
"Marqo",
+ "MatchingEngine",
"Meilisearch",
"Milvus",
- "Zilliz",
- "SingleStoreDB",
- "Chroma",
- "Clarifai",
- "OpenSearchVectorSearch",
- "AtlasDB",
- "DeepLake",
- "Annoy",
"MongoDBAtlasVectorSearch",
"MyScale",
"MyScaleSettings",
"OpenSearchVectorSearch",
+ "OpenSearchVectorSearch",
+ "PGEmbedding",
+ "PGVector",
"Pinecone",
"Qdrant",
"Redis",
"Rockset",
- "ScaNN",
"SKLearnVectorStore",
+ "ScaNN",
+ "SingleStoreDB",
"SingleStoreDB",
"StarRocks",
"SupabaseVectorStore",
"Tair",
"Tigris",
"Typesense",
+ "USearch",
"Vectara",
"VectorStore",
"Weaviate",
"ZepVectorStore",
"Zilliz",
- "PGVector",
- "USearch",
+ "Zilliz",
]
diff --git a/libs/langchain/langchain/vectorstores/_pgvector_data_models.py b/libs/langchain/langchain/vectorstores/_pgvector_data_models.py
index f44bd2e35..1a4b60776 100644
--- a/libs/langchain/langchain/vectorstores/_pgvector_data_models.py
+++ b/libs/langchain/langchain/vectorstores/_pgvector_data_models.py
@@ -9,6 +9,8 @@ from langchain.vectorstores.pgvector import BaseModel
class CollectionStore(BaseModel):
+ """Collection store."""
+
__tablename__ = "langchain_pg_collection"
name = sqlalchemy.Column(sqlalchemy.String)
@@ -48,6 +50,8 @@ class CollectionStore(BaseModel):
class EmbeddingStore(BaseModel):
+ """Embedding store."""
+
__tablename__ = "langchain_pg_embedding"
collection_id = sqlalchemy.Column(
diff --git a/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py b/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py
index f8cf664cc..672994bc3 100644
--- a/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py
+++ b/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py
@@ -12,7 +12,7 @@ logger = logging.getLogger()
class AlibabaCloudOpenSearchSettings:
- """Alibaba Cloud Opensearch Client Configuration.
+ """`Alibaba Cloud Opensearch` client configuration.
Attribute:
endpoint (str) : The endpoint of opensearch instance, You can find it
@@ -90,7 +90,7 @@ def create_metadata(fields: Dict[str, Any]) -> Dict[str, Any]:
class AlibabaCloudOpenSearch(VectorStore):
- """Alibaba Cloud OpenSearch Vector Store"""
+ """`Alibaba Cloud OpenSearch` vector store."""
def __init__(
self,
@@ -102,7 +102,7 @@ class AlibabaCloudOpenSearch(VectorStore):
from alibabacloud_ha3engine import client, models
from alibabacloud_tea_util import models as util_models
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import alibaba cloud opensearch python package. "
"Please install it with `pip install alibabacloud-ha3engine`."
)
diff --git a/libs/langchain/langchain/vectorstores/analyticdb.py b/libs/langchain/langchain/vectorstores/analyticdb.py
index b565a65a2..22e8dd18a 100644
--- a/libs/langchain/langchain/vectorstores/analyticdb.py
+++ b/libs/langchain/langchain/vectorstores/analyticdb.py
@@ -1,4 +1,3 @@
-"""VectorStore wrapper around a Postgres/PGVector database."""
from __future__ import annotations
import logging
@@ -25,7 +24,7 @@ Base = declarative_base() # type: Any
class AnalyticDB(VectorStore):
- """VectorStore implementation using AnalyticDB.
+ """`AnalyticDB` (distributed PostgreSQL) vector store.
AnalyticDB is a distributed full postgresql syntax cloud-native database.
- `connection_string` is a postgres connection string.
diff --git a/libs/langchain/langchain/vectorstores/annoy.py b/libs/langchain/langchain/vectorstores/annoy.py
index f4b7cecf0..22c569781 100644
--- a/libs/langchain/langchain/vectorstores/annoy.py
+++ b/libs/langchain/langchain/vectorstores/annoy.py
@@ -1,4 +1,3 @@
-"""Wrapper around Annoy vector database."""
from __future__ import annotations
import os
@@ -26,7 +25,7 @@ def dependable_annoy_import() -> Any:
try:
import annoy
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import annoy python package. "
"Please install it with `pip install --user annoy` "
)
@@ -34,7 +33,7 @@ def dependable_annoy_import() -> Any:
class Annoy(VectorStore):
- """Wrapper around Annoy vector database.
+ """`Annoy` vector store.
To use, you should have the ``annoy`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/atlas.py b/libs/langchain/langchain/vectorstores/atlas.py
index 31bccb6b4..bdc86e17e 100644
--- a/libs/langchain/langchain/vectorstores/atlas.py
+++ b/libs/langchain/langchain/vectorstores/atlas.py
@@ -1,4 +1,3 @@
-"""Wrapper around Atlas by Nomic."""
from __future__ import annotations
import logging
@@ -15,7 +14,9 @@ logger = logging.getLogger(__name__)
class AtlasDB(VectorStore):
- """Wrapper around Atlas: Nomic's neural database and rhizomatic instrument.
+ """`Atlas` vector store.
+
+ Atlas is `Nomic`'s neural database and rhizomatic instrument.
To use, you should have the ``nomic`` python package installed.
@@ -61,7 +62,7 @@ class AtlasDB(VectorStore):
import nomic
from nomic import AtlasProject
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import nomic python package. "
"Please install it with `pip install nomic`."
)
diff --git a/libs/langchain/langchain/vectorstores/awadb.py b/libs/langchain/langchain/vectorstores/awadb.py
index 44317b937..7f5daf171 100644
--- a/libs/langchain/langchain/vectorstores/awadb.py
+++ b/libs/langchain/langchain/vectorstores/awadb.py
@@ -1,4 +1,3 @@
-"""Wrapper around AwaDB for embedding vectors"""
from __future__ import annotations
import logging
@@ -20,7 +19,7 @@ DEFAULT_TOPN = 4
class AwaDB(VectorStore):
- """Interface implemented by AwaDB vector stores."""
+ """`AwaDB` vector store."""
_DEFAULT_TABLE_NAME = "langchain_awadb"
@@ -50,7 +49,7 @@ class AwaDB(VectorStore):
try:
import awadb
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import awadb python package. "
"Please install it with `pip install awadb`."
)
diff --git a/libs/langchain/langchain/vectorstores/azuresearch.py b/libs/langchain/langchain/vectorstores/azuresearch.py
index 5f2a45e93..32b6d03f1 100644
--- a/libs/langchain/langchain/vectorstores/azuresearch.py
+++ b/libs/langchain/langchain/vectorstores/azuresearch.py
@@ -1,4 +1,3 @@
-"""Wrapper around Azure Cognitive Search."""
from __future__ import annotations
import base64
@@ -177,7 +176,7 @@ def _get_search_client(
class AzureSearch(VectorStore):
- """Azure Cognitive Search vector store."""
+ """`Azure Cognitive Search` vector store."""
def __init__(
self,
@@ -526,7 +525,7 @@ class AzureSearch(VectorStore):
class AzureSearchVectorStoreRetriever(BaseRetriever):
- """Retriever that uses Azure Search to find similar documents."""
+ """Retriever that uses `Azure Cognitive Search`."""
vectorstore: AzureSearch
"""Azure Search instance used to find similar documents."""
diff --git a/libs/langchain/langchain/vectorstores/bageldb.py b/libs/langchain/langchain/vectorstores/bageldb.py
index ab0e7868c..5b4590933 100644
--- a/libs/langchain/langchain/vectorstores/bageldb.py
+++ b/libs/langchain/langchain/vectorstores/bageldb.py
@@ -1,4 +1,3 @@
-"""BagelDB integration"""
from __future__ import annotations
import uuid
@@ -43,7 +42,7 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
class Bagel(VectorStore):
- """Wrapper around BagelDB.ai vector store.
+ """``BagelDB.ai`` vector store.
To use, you should have the ``betabageldb`` python package installed.
@@ -70,7 +69,7 @@ class Bagel(VectorStore):
import bagel
import bagel.config
except ImportError:
- raise ValueError("Please install bagel `pip install betabageldb`.")
+ raise ImportError("Please install bagel `pip install betabageldb`.")
if client is not None:
self._client_settings = client_settings
self._client = client
diff --git a/libs/langchain/langchain/vectorstores/base.py b/libs/langchain/langchain/vectorstores/base.py
index 5bc624add..2c9ce74ee 100644
--- a/libs/langchain/langchain/vectorstores/base.py
+++ b/libs/langchain/langchain/vectorstores/base.py
@@ -1,5 +1,3 @@
-"""Interface for vector stores."""
-
from __future__ import annotations
import asyncio
@@ -37,7 +35,7 @@ VST = TypeVar("VST", bound="VectorStore")
class VectorStore(ABC):
- """Interface for vector stores."""
+ """Interface for vector store."""
@abstractmethod
def add_texts(
@@ -520,7 +518,7 @@ class VectorStore(ABC):
class VectorStoreRetriever(BaseRetriever):
- """Retriever class for VectorStore."""
+ """Base Retriever class for VectorStore."""
vectorstore: VectorStore
"""VectorStore to use for retrieval."""
diff --git a/libs/langchain/langchain/vectorstores/cassandra.py b/libs/langchain/langchain/vectorstores/cassandra.py
index d844a89e1..cc6541b5f 100644
--- a/libs/langchain/langchain/vectorstores/cassandra.py
+++ b/libs/langchain/langchain/vectorstores/cassandra.py
@@ -1,4 +1,3 @@
-"""Wrapper around Cassandra vector-store capabilities, based on cassIO."""
from __future__ import annotations
import typing
@@ -19,8 +18,9 @@ CVST = TypeVar("CVST", bound="Cassandra")
class Cassandra(VectorStore):
- """Wrapper around Cassandra embeddings platform.
+ """`Cassandra` vector store.
+ It wraps the Cassandra vector-store capabilities and is based on cassIO.
There is no notion of a default table name, since each embedding
function implies its own vector dimension, which is part of the schema.
diff --git a/libs/langchain/langchain/vectorstores/chroma.py b/libs/langchain/langchain/vectorstores/chroma.py
index 9457419c5..76469357a 100644
--- a/libs/langchain/langchain/vectorstores/chroma.py
+++ b/libs/langchain/langchain/vectorstores/chroma.py
@@ -1,4 +1,3 @@
-"""Wrapper around ChromaDB embeddings platform."""
from __future__ import annotations
import logging
@@ -50,7 +49,7 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
class Chroma(VectorStore):
- """Wrapper around ChromaDB embeddings platform.
+ """`ChromaDB` vector store.
To use, you should have the ``chromadb`` python package installed.
@@ -76,12 +75,12 @@ class Chroma(VectorStore):
client: Optional[chromadb.Client] = None,
relevance_score_fn: Optional[Callable[[float], float]] = None,
) -> None:
- """Initialize with Chroma client."""
+ """Initialize with a Chroma client."""
try:
import chromadb
import chromadb.config
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import chromadb python package. "
"Please install it with `pip install chromadb`."
)
diff --git a/libs/langchain/langchain/vectorstores/clarifai.py b/libs/langchain/langchain/vectorstores/clarifai.py
index 9db4408b2..51d01118f 100644
--- a/libs/langchain/langchain/vectorstores/clarifai.py
+++ b/libs/langchain/langchain/vectorstores/clarifai.py
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
class Clarifai(VectorStore):
- """Wrapper around Clarifai AI platform's vector store.
+ """`Clarifai AI` vector store.
To use, you should have the ``clarifai`` python package installed.
@@ -55,7 +55,7 @@ class Clarifai(VectorStore):
from clarifai.auth.helper import DEFAULT_BASE, ClarifaiAuthHelper
from clarifai.client import create_stub
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import clarifai python package. "
"Please install it with `pip install clarifai`."
)
diff --git a/libs/langchain/langchain/vectorstores/clickhouse.py b/libs/langchain/langchain/vectorstores/clickhouse.py
index c0916ec44..68de42963 100644
--- a/libs/langchain/langchain/vectorstores/clickhouse.py
+++ b/libs/langchain/langchain/vectorstores/clickhouse.py
@@ -1,5 +1,3 @@
-"""Wrapper around open source ClickHouse VectorSearch capability."""
-
from __future__ import annotations
import json
@@ -33,7 +31,7 @@ def has_mul_sub_str(s: str, *args: Any) -> bool:
class ClickhouseSettings(BaseSettings):
- """ClickHouse Client Configuration
+ """`ClickHouse` client configuration.
Attribute:
clickhouse_host (str) : An URL to connect to MyScale backend.
@@ -101,7 +99,7 @@ class ClickhouseSettings(BaseSettings):
class Clickhouse(VectorStore):
- """Wrapper around ClickHouse vector database
+ """`ClickHouse VectorSearch` vector store.
You need a `clickhouse-connect` python package, and a valid account
to connect to ClickHouse.
@@ -130,7 +128,7 @@ class Clickhouse(VectorStore):
try:
from clickhouse_connect import get_client
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import clickhouse connect python package. "
"Please install it with `pip install clickhouse-connect`."
)
diff --git a/libs/langchain/langchain/vectorstores/dashvector.py b/libs/langchain/langchain/vectorstores/dashvector.py
index 714c76c00..f3559384a 100644
--- a/libs/langchain/langchain/vectorstores/dashvector.py
+++ b/libs/langchain/langchain/vectorstores/dashvector.py
@@ -1,4 +1,3 @@
-"""Wrapper around DashVector vector database."""
from __future__ import annotations
import logging
@@ -23,7 +22,7 @@ logger = logging.getLogger(__name__)
class DashVector(VectorStore):
- """Wrapper around DashVector vector database.
+ """`DashVector` vector store.
To use, you should have the ``dashvector`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/deeplake.py b/libs/langchain/langchain/vectorstores/deeplake.py
index c04d71ace..ed7601832 100644
--- a/libs/langchain/langchain/vectorstores/deeplake.py
+++ b/libs/langchain/langchain/vectorstores/deeplake.py
@@ -1,4 +1,3 @@
-"""Wrapper around Activeloop Deep Lake."""
from __future__ import annotations
import logging
@@ -24,9 +23,9 @@ logger = logging.getLogger(__name__)
class DeepLake(VectorStore):
- """Wrapper around Deep Lake, a data lake for deep learning applications.
+ """`Activeloop Deep Lake` vector store.
- We integrated deeplake's similarity search and filtering for fast prototyping,
+ We integrated deeplake's similarity search and filtering for fast prototyping.
Now, it supports Tensor Query Language (TQL) for production use cases
over billion rows.
@@ -126,7 +125,7 @@ class DeepLake(VectorStore):
self.verbose = verbose
if _DEEPLAKE_INSTALLED is False:
- raise ValueError(
+ raise ImportError(
"Could not import deeplake python package. "
"Please install it with `pip install deeplake[enterprise]`."
)
@@ -135,7 +134,7 @@ class DeepLake(VectorStore):
kwargs.get("runtime") == {"tensor_db": True}
and version_compare(deeplake.__version__, "3.6.7") == -1
):
- raise ValueError(
+ raise ImportError(
"To use tensor_db option you need to update deeplake to `3.6.7`. "
f"Currently installed deeplake version is {deeplake.__version__}. "
)
diff --git a/libs/langchain/langchain/vectorstores/dingo.py b/libs/langchain/langchain/vectorstores/dingo.py
index 44765cdd3..e6d3419c7 100644
--- a/libs/langchain/langchain/vectorstores/dingo.py
+++ b/libs/langchain/langchain/vectorstores/dingo.py
@@ -1,4 +1,3 @@
-"""Wrapper around the Dingo vector database."""
from __future__ import annotations
import logging
@@ -16,7 +15,7 @@ logger = logging.getLogger(__name__)
class Dingo(VectorStore):
- """Wrapper around Dingo vector database.
+ """`Dingo` vector store.
To use, you should have the ``dingodb`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/docarray/base.py b/libs/langchain/langchain/vectorstores/docarray/base.py
index 2e693bcde..d3b048610 100644
--- a/libs/langchain/langchain/vectorstores/docarray/base.py
+++ b/libs/langchain/langchain/vectorstores/docarray/base.py
@@ -33,6 +33,8 @@ def _check_docarray_import() -> None:
class DocArrayIndex(VectorStore, ABC):
+ """Base class for `DocArray` based vector stores."""
+
def __init__(
self,
doc_index: "BaseDocIndex",
@@ -67,7 +69,7 @@ class DocArrayIndex(VectorStore, ABC):
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
- """Run more texts through the embeddings and add to the vectorstore.
+ """Embed texts and add to the vector store.
Args:
texts: Iterable of strings to add to the vectorstore.
diff --git a/libs/langchain/langchain/vectorstores/docarray/hnsw.py b/libs/langchain/langchain/vectorstores/docarray/hnsw.py
index 26ed4afac..e1fdc6f07 100644
--- a/libs/langchain/langchain/vectorstores/docarray/hnsw.py
+++ b/libs/langchain/langchain/vectorstores/docarray/hnsw.py
@@ -1,4 +1,3 @@
-"""Wrapper around Hnswlib store."""
from __future__ import annotations
from typing import Any, List, Literal, Optional
@@ -11,7 +10,7 @@ from langchain.vectorstores.docarray.base import (
class DocArrayHnswSearch(DocArrayIndex):
- """Wrapper around HnswLib storage.
+ """`HnswLib` storage using `DocArray` package.
To use it, you should have the ``docarray`` package with version >=0.32.0 installed.
You can install it with `pip install "langchain[docarray]"`.
diff --git a/libs/langchain/langchain/vectorstores/docarray/in_memory.py b/libs/langchain/langchain/vectorstores/docarray/in_memory.py
index 77570c0b0..9c3b3dcf3 100644
--- a/libs/langchain/langchain/vectorstores/docarray/in_memory.py
+++ b/libs/langchain/langchain/vectorstores/docarray/in_memory.py
@@ -11,7 +11,7 @@ from langchain.vectorstores.docarray.base import (
class DocArrayInMemorySearch(DocArrayIndex):
- """Wrapper around in-memory storage for exact search.
+ """In-memory `DocArray` storage for exact search.
To use it, you should have the ``docarray`` package with version >=0.32.0 installed.
You can install it with `pip install "langchain[docarray]"`.
diff --git a/libs/langchain/langchain/vectorstores/elastic_vector_search.py b/libs/langchain/langchain/vectorstores/elastic_vector_search.py
index f3b8c37e0..978da1d5c 100644
--- a/libs/langchain/langchain/vectorstores/elastic_vector_search.py
+++ b/libs/langchain/langchain/vectorstores/elastic_vector_search.py
@@ -53,9 +53,9 @@ def _default_script_query(query_vector: List[float], filter: Optional[dict]) ->
@deprecated("0.0.265", alternative="ElasticsearchStore class.", pending=True)
class ElasticVectorSearch(VectorStore):
- """Wrapper around Elasticsearch as a vector database.
+ """[DEPRECATED] `Elasticsearch` vector store.
- To connect to an Elasticsearch instance that does not require
+ To connect to an `Elasticsearch` instance that does not require
login credentials, pass the Elasticsearch URL and index name along with the
embedding object to the constructor.
@@ -340,11 +340,10 @@ class ElasticVectorSearch(VectorStore):
class ElasticKnnSearch(VectorStore):
- """
- ElasticKnnSearch is a class for performing k-nearest neighbor
- (k-NN) searches on text data using Elasticsearch.
+ """[DEPRECATED] `Elasticsearch` with k-nearest neighbor search
+ (`k-NN`) vector store.
- This class is used to create an Elasticsearch index of text data that
+ It creates an Elasticsearch index of text data that
can be searched using k-NN search. The text data is transformed into
vector embeddings using a provided embedding model, and these embeddings
are stored in the Elasticsearch index.
diff --git a/libs/langchain/langchain/vectorstores/elasticsearch.py b/libs/langchain/langchain/vectorstores/elasticsearch.py
index 27f0ef307..d2dd7f768 100644
--- a/libs/langchain/langchain/vectorstores/elasticsearch.py
+++ b/libs/langchain/langchain/vectorstores/elasticsearch.py
@@ -1,5 +1,3 @@
-"""Wrapper around Elasticsearch vector database."""
-
import logging
import uuid
from abc import ABC, abstractmethod
@@ -28,6 +26,8 @@ logger = logging.getLogger(__name__)
class BaseRetrievalStrategy(ABC):
+ """Base class for `Elasticsearch` retrieval strategies."""
+
@abstractmethod
def query(
self,
@@ -109,6 +109,8 @@ class BaseRetrievalStrategy(ABC):
class ApproxRetrievalStrategy(BaseRetrievalStrategy):
+ """Approximate retrieval strategy using the `HNSW` algorithm."""
+
def __init__(
self,
query_model_id: Optional[str] = None,
@@ -211,6 +213,8 @@ class ApproxRetrievalStrategy(BaseRetrievalStrategy):
class ExactRetrievalStrategy(BaseRetrievalStrategy):
+ """Exact retrieval strategy using the `script_score` query."""
+
def query(
self,
query_vector: Union[List[float], None],
@@ -276,6 +280,8 @@ class ExactRetrievalStrategy(BaseRetrievalStrategy):
class SparseRetrievalStrategy(BaseRetrievalStrategy):
+ """Sparse retrieval strategy using the `text_expansion` processor."""
+
def __init__(self, model_id: Optional[str] = None):
self.model_id = model_id or ".elser_model_1"
@@ -355,8 +361,7 @@ class SparseRetrievalStrategy(BaseRetrievalStrategy):
class ElasticsearchStore(VectorStore):
-
- """Wrapper around Elasticsearch search database.
+ """`Elasticsearch` vector store.
Example:
.. code-block:: python
diff --git a/libs/langchain/langchain/vectorstores/faiss.py b/libs/langchain/langchain/vectorstores/faiss.py
index 7e9cb109e..a8c835e5c 100644
--- a/libs/langchain/langchain/vectorstores/faiss.py
+++ b/libs/langchain/langchain/vectorstores/faiss.py
@@ -1,4 +1,3 @@
-"""Wrapper around FAISS vector database."""
from __future__ import annotations
import operator
@@ -65,7 +64,7 @@ def _len_check_if_sized(x: Any, y: Any, x_name: str, y_name: str) -> None:
class FAISS(VectorStore):
- """Wrapper around FAISS vector database.
+ """`Meta Faiss` vector store.
To use, you must have the ``faiss`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/hologres.py b/libs/langchain/langchain/vectorstores/hologres.py
index 092dc24c3..ce816e8f9 100644
--- a/libs/langchain/langchain/vectorstores/hologres.py
+++ b/libs/langchain/langchain/vectorstores/hologres.py
@@ -1,4 +1,3 @@
-"""VectorStore wrapper around a Hologres database."""
from __future__ import annotations
import json
@@ -16,7 +15,7 @@ _LANGCHAIN_DEFAULT_TABLE_NAME = "langchain_pg_embedding"
class HologresWrapper:
- """Wrapper around Hologres service."""
+ """`Hologres API` wrapper."""
def __init__(self, connection_string: str, ndims: int, table_name: str) -> None:
"""Initialize the wrapper.
@@ -114,7 +113,7 @@ document text);"""
class Hologres(VectorStore):
- """VectorStore implementation using Hologres.
+ """`Hologres API` vector store.
- `connection_string` is a hologres connection string.
- `embedding_function` any embedding function implementing
diff --git a/libs/langchain/langchain/vectorstores/lancedb.py b/libs/langchain/langchain/vectorstores/lancedb.py
index 2b29f92c7..1a166d4e1 100644
--- a/libs/langchain/langchain/vectorstores/lancedb.py
+++ b/libs/langchain/langchain/vectorstores/lancedb.py
@@ -1,4 +1,3 @@
-"""Wrapper around LanceDB vector database"""
from __future__ import annotations
import uuid
@@ -10,7 +9,7 @@ from langchain.vectorstores.base import VectorStore
class LanceDB(VectorStore):
- """Wrapper around LanceDB vector database.
+ """`LanceDB` vector store.
To use, you should have ``lancedb`` python package installed.
@@ -36,7 +35,7 @@ class LanceDB(VectorStore):
try:
import lancedb
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import lancedb python package. "
"Please install it with `pip install lancedb`."
)
diff --git a/libs/langchain/langchain/vectorstores/marqo.py b/libs/langchain/langchain/vectorstores/marqo.py
index 0d1dfe048..b18731e08 100644
--- a/libs/langchain/langchain/vectorstores/marqo.py
+++ b/libs/langchain/langchain/vectorstores/marqo.py
@@ -1,4 +1,3 @@
-"""Wrapper around weaviate vector database."""
from __future__ import annotations
import json
@@ -25,7 +24,7 @@ if TYPE_CHECKING:
class Marqo(VectorStore):
- """Wrapper around Marqo database.
+ """`Marqo` vector store.
Marqo indexes have their own models associated with them to generate your
embeddings. This means that you can selected from a range of different models
diff --git a/libs/langchain/langchain/vectorstores/matching_engine.py b/libs/langchain/langchain/vectorstores/matching_engine.py
index 9d8c55ed4..2bf27af64 100644
--- a/libs/langchain/langchain/vectorstores/matching_engine.py
+++ b/libs/langchain/langchain/vectorstores/matching_engine.py
@@ -1,4 +1,3 @@
-"""Vertex Matching Engine implementation of the vector store."""
from __future__ import annotations
import json
@@ -21,7 +20,7 @@ logger = logging.getLogger()
class MatchingEngine(VectorStore):
- """Vertex Matching Engine implementation of the vector store.
+ """`Google Vertex AI Matching Engine` vector store.
While the embeddings are stored in the Matching Engine, the embedded
documents will be stored in GCS.
diff --git a/libs/langchain/langchain/vectorstores/meilisearch.py b/libs/langchain/langchain/vectorstores/meilisearch.py
index 313ae6816..05ffb9700 100644
--- a/libs/langchain/langchain/vectorstores/meilisearch.py
+++ b/libs/langchain/langchain/vectorstores/meilisearch.py
@@ -1,4 +1,3 @@
-"""Wrapper around Meilisearch vector database."""
from __future__ import annotations
import uuid
@@ -45,7 +44,7 @@ def _create_client(
class Meilisearch(VectorStore):
- """Initialize wrapper around Meilisearch vector database.
+ """`Meilisearch` vector store.
To use this, you need to have `meilisearch` python package installed,
and a running Meilisearch instance.
diff --git a/libs/langchain/langchain/vectorstores/milvus.py b/libs/langchain/langchain/vectorstores/milvus.py
index c6a5301ff..e5ee2a015 100644
--- a/libs/langchain/langchain/vectorstores/milvus.py
+++ b/libs/langchain/langchain/vectorstores/milvus.py
@@ -1,4 +1,3 @@
-"""Wrapper around the Milvus vector database."""
from __future__ import annotations
import logging
@@ -24,19 +23,18 @@ DEFAULT_MILVUS_CONNECTION = {
class Milvus(VectorStore):
- """Initialize wrapper around the milvus vector database.
+ """`Milvus` vector store.
- In order to use this you need to have `pymilvus` installed and a
- running Milvus
+ You need to install `pymilvus` and run Milvus.
See the following documentation for how to run a Milvus instance:
https://milvus.io/docs/install_standalone-docker.md
If looking for a hosted Milvus, take a look at this documentation:
https://zilliz.com/cloud and make use of the Zilliz vectorstore found in
- this project,
+ this project.
- IF USING L2/IP metric IT IS HIGHLY SUGGESTED TO NORMALIZE YOUR DATA.
+ IF USING L2/IP metric, IT IS HIGHLY SUGGESTED TO NORMALIZE YOUR DATA.
Args:
embedding_function (Embeddings): Function used to embed the text.
diff --git a/libs/langchain/langchain/vectorstores/mongodb_atlas.py b/libs/langchain/langchain/vectorstores/mongodb_atlas.py
index b7cd5136d..3476c0eb5 100644
--- a/libs/langchain/langchain/vectorstores/mongodb_atlas.py
+++ b/libs/langchain/langchain/vectorstores/mongodb_atlas.py
@@ -32,7 +32,7 @@ DEFAULT_INSERT_BATCH_SIZE = 100
class MongoDBAtlasVectorSearch(VectorStore):
- """Wrapper around MongoDB Atlas Vector Search.
+ """`MongoDB Atlas Vector Search` vector store.
To use, you should have both:
- the ``pymongo`` python package installed
diff --git a/libs/langchain/langchain/vectorstores/myscale.py b/libs/langchain/langchain/vectorstores/myscale.py
index ef4db34ea..3c4361fcd 100644
--- a/libs/langchain/langchain/vectorstores/myscale.py
+++ b/libs/langchain/langchain/vectorstores/myscale.py
@@ -1,4 +1,3 @@
-"""Wrapper around MyScale vector database."""
from __future__ import annotations
import json
@@ -32,7 +31,7 @@ def has_mul_sub_str(s: str, *args: Any) -> bool:
class MyScaleSettings(BaseSettings):
- """MyScale Client Configuration
+ """MyScale client configuration.
Attribute:
myscale_host (str) : An URL to connect to MyScale backend.
@@ -93,13 +92,13 @@ class MyScaleSettings(BaseSettings):
class MyScale(VectorStore):
- """Wrapper around MyScale vector database
+ """`MyScale` vector store.
You need a `clickhouse-connect` python package, and a valid account
to connect to MyScale.
- MyScale can not only search with simple vector indexes,
- it also supports complex query with multiple conditions,
+ MyScale can search with simple vector indexes.
+ It also supports complex queries with multiple conditions,
constraints and even sub-queries.
For more information, please visit
@@ -122,7 +121,7 @@ class MyScale(VectorStore):
try:
from clickhouse_connect import get_client
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import clickhouse connect python package. "
"Please install it with `pip install clickhouse-connect`."
)
diff --git a/libs/langchain/langchain/vectorstores/opensearch_vector_search.py b/libs/langchain/langchain/vectorstores/opensearch_vector_search.py
index 8e84835df..ddc2e72ac 100644
--- a/libs/langchain/langchain/vectorstores/opensearch_vector_search.py
+++ b/libs/langchain/langchain/vectorstores/opensearch_vector_search.py
@@ -1,4 +1,3 @@
-"""Wrapper around OpenSearch vector database."""
from __future__ import annotations
import uuid
@@ -26,7 +25,7 @@ def _import_opensearch() -> Any:
try:
from opensearchpy import OpenSearch
except ImportError:
- raise ValueError(IMPORT_OPENSEARCH_PY_ERROR)
+ raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
return OpenSearch
@@ -35,7 +34,7 @@ def _import_bulk() -> Any:
try:
from opensearchpy.helpers import bulk
except ImportError:
- raise ValueError(IMPORT_OPENSEARCH_PY_ERROR)
+ raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
return bulk
@@ -44,7 +43,7 @@ def _import_not_found_error() -> Any:
try:
from opensearchpy.exceptions import NotFoundError
except ImportError:
- raise ValueError(IMPORT_OPENSEARCH_PY_ERROR)
+ raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
return NotFoundError
@@ -54,7 +53,7 @@ def _get_opensearch_client(opensearch_url: str, **kwargs: Any) -> Any:
opensearch = _import_opensearch()
client = opensearch(opensearch_url, **kwargs)
except ValueError as e:
- raise ValueError(
+ raise ImportError(
f"OpenSearch client string provided is not in proper format. "
f"Got error: {e} "
)
@@ -315,7 +314,7 @@ def _get_kwargs_value(kwargs: Any, key: str, default_value: Any) -> Any:
class OpenSearchVectorSearch(VectorStore):
- """Wrapper around OpenSearch as a vector database.
+ """`Amazon OpenSearch Vector Engine` vector store.
Example:
.. code-block:: python
diff --git a/libs/langchain/langchain/vectorstores/pgembedding.py b/libs/langchain/langchain/vectorstores/pgembedding.py
index 4c820636c..ecdb20a8f 100644
--- a/libs/langchain/langchain/vectorstores/pgembedding.py
+++ b/libs/langchain/langchain/vectorstores/pgembedding.py
@@ -1,4 +1,3 @@
-"""VectorStore wrapper around a Postgres database."""
from __future__ import annotations
import logging
@@ -23,11 +22,15 @@ _LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain"
class BaseModel(Base):
+ """Base model for all SQL stores."""
+
__abstract__ = True
uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
class CollectionStore(BaseModel):
+ """Collection store."""
+
__tablename__ = "langchain_pg_collection"
name = sqlalchemy.Column(sqlalchemy.String)
@@ -67,6 +70,8 @@ class CollectionStore(BaseModel):
class EmbeddingStore(BaseModel):
+ """Embedding store."""
+
__tablename__ = "langchain_pg_embedding"
collection_id = sqlalchemy.Column(
@@ -87,15 +92,14 @@ class EmbeddingStore(BaseModel):
class QueryResult:
- """QueryResult is a result from a query."""
+ """Result from a query."""
EmbeddingStore: EmbeddingStore
distance: float
class PGEmbedding(VectorStore):
- """
- VectorStore implementation using Postgres and the pg_embedding extension.
+ """`Postgres` with the `pg_embedding` extension as a vector store.
pg_embedding uses sequential scan by default. but you can create a HNSW index
using the create_hnsw_index method.
diff --git a/libs/langchain/langchain/vectorstores/pgvector.py b/libs/langchain/langchain/vectorstores/pgvector.py
index 9581bdc3b..a86a88cb1 100644
--- a/libs/langchain/langchain/vectorstores/pgvector.py
+++ b/libs/langchain/langchain/vectorstores/pgvector.py
@@ -1,4 +1,3 @@
-"""VectorStore wrapper around a Postgres/PGVector database."""
from __future__ import annotations
import enum
@@ -46,12 +45,14 @@ _LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain"
class BaseModel(Base):
+ """Base model for the SQL stores."""
+
__abstract__ = True
uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
class PGVector(VectorStore):
- """VectorStore implementation using Postgres and pgvector.
+ """`Postgres`/`PGVector` vector store.
To use, you should have the ``pgvector`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/pinecone.py b/libs/langchain/langchain/vectorstores/pinecone.py
index 6ecf0b291..c6793b4de 100644
--- a/libs/langchain/langchain/vectorstores/pinecone.py
+++ b/libs/langchain/langchain/vectorstores/pinecone.py
@@ -1,4 +1,3 @@
-"""Wrapper around Pinecone vector database."""
from __future__ import annotations
import logging
@@ -17,7 +16,7 @@ logger = logging.getLogger(__name__)
class Pinecone(VectorStore):
- """Wrapper around Pinecone vector database.
+ """`Pinecone` vector store.
To use, you should have the ``pinecone-client`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/qdrant.py b/libs/langchain/langchain/vectorstores/qdrant.py
index f18b2cc91..7b9d9869c 100644
--- a/libs/langchain/langchain/vectorstores/qdrant.py
+++ b/libs/langchain/langchain/vectorstores/qdrant.py
@@ -1,4 +1,3 @@
-"""Wrapper around Qdrant vector database."""
from __future__ import annotations
import asyncio
@@ -40,7 +39,7 @@ if TYPE_CHECKING:
class QdrantException(Exception):
- """Base class for all the Qdrant related exceptions"""
+ """`Qdrant` related exceptions."""
def sync_call_fallback(method: Callable) -> Callable:
@@ -68,7 +67,7 @@ def sync_call_fallback(method: Callable) -> Callable:
class Qdrant(VectorStore):
- """Wrapper around Qdrant vector database.
+ """`Qdrant` vector store.
To use you should have the ``qdrant-client`` package installed.
@@ -102,7 +101,7 @@ class Qdrant(VectorStore):
try:
import qdrant_client
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import qdrant-client python package. "
"Please install it with `pip install qdrant-client`."
)
diff --git a/libs/langchain/langchain/vectorstores/redis.py b/libs/langchain/langchain/vectorstores/redis.py
index 198e5d706..56429bea8 100644
--- a/libs/langchain/langchain/vectorstores/redis.py
+++ b/libs/langchain/langchain/vectorstores/redis.py
@@ -1,5 +1,3 @@
-"""Wrapper around Redis vector database."""
-
from __future__ import annotations
import json
@@ -96,7 +94,7 @@ def _default_relevance_score(val: float) -> float:
class Redis(VectorStore):
- """Wrapper around Redis vector database.
+ """`Redis` vector store.
To use, you should have the ``redis`` python package installed.
@@ -184,7 +182,7 @@ class Redis(VectorStore):
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import redis python package. "
"Please install it with `pip install redis`."
)
@@ -612,7 +610,7 @@ class Redis(VectorStore):
class RedisVectorStoreRetriever(VectorStoreRetriever):
- """Retriever for Redis VectorStore."""
+ """Retriever for `Redis` vector store."""
vectorstore: Redis
"""Redis VectorStore."""
diff --git a/libs/langchain/langchain/vectorstores/rocksetdb.py b/libs/langchain/langchain/vectorstores/rocksetdb.py
index 5b2fc234a..674473774 100644
--- a/libs/langchain/langchain/vectorstores/rocksetdb.py
+++ b/libs/langchain/langchain/vectorstores/rocksetdb.py
@@ -1,4 +1,3 @@
-"""Wrapper around Rockset vector database."""
from __future__ import annotations
import logging
@@ -13,7 +12,7 @@ logger = logging.getLogger(__name__)
class Rockset(VectorStore):
- """Wrapper arpund Rockset vector database.
+ """`Rockset` vector store.
To use, you should have the `rockset` python package installed. Note that to use
this, the collection being used must already exist in your Rockset instance.
diff --git a/libs/langchain/langchain/vectorstores/scann.py b/libs/langchain/langchain/vectorstores/scann.py
index a1ce4af48..434080e7e 100644
--- a/libs/langchain/langchain/vectorstores/scann.py
+++ b/libs/langchain/langchain/vectorstores/scann.py
@@ -1,4 +1,3 @@
-"""Wrapper around ScaNN vector database."""
from __future__ import annotations
import operator
@@ -25,7 +24,7 @@ def normalize(x: np.ndarray) -> np.ndarray:
def dependable_scann_import() -> Any:
"""
- Import scann if available, otherwise raise error.
+ Import `scann` if available, otherwise raise error.
"""
try:
import scann
@@ -38,7 +37,7 @@ def dependable_scann_import() -> Any:
class ScaNN(VectorStore):
- """Wrapper around ScaNN vector database.
+ """`ScaNN` vector store.
To use, you should have the ``scann`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/singlestoredb.py b/libs/langchain/langchain/vectorstores/singlestoredb.py
index 7c05778c8..983f3f7f0 100644
--- a/libs/langchain/langchain/vectorstores/singlestoredb.py
+++ b/libs/langchain/langchain/vectorstores/singlestoredb.py
@@ -1,5 +1,3 @@
-"""Wrapper around SingleStore DB."""
-
from __future__ import annotations
import json
@@ -35,8 +33,7 @@ ORDERING_DIRECTIVE: dict = {
class SingleStoreDB(VectorStore):
- """
- This class serves as a Pythonic interface to the SingleStore DB database.
+ """`SingleStore DB` vector store.
The prerequisite for using this class is the installation of the ``singlestoredb``
Python package.
diff --git a/libs/langchain/langchain/vectorstores/sklearn.py b/libs/langchain/langchain/vectorstores/sklearn.py
index dcc6237c2..d4f49c965 100644
--- a/libs/langchain/langchain/vectorstores/sklearn.py
+++ b/libs/langchain/langchain/vectorstores/sklearn.py
@@ -21,7 +21,7 @@ DEFAULT_FETCH_K = 20 # Number of Documents to initially fetch during MMR search
class BaseSerializer(ABC):
- """Abstract base class for saving and loading data."""
+ """Base class for serializing data."""
def __init__(self, persist_path: str) -> None:
self.persist_path = persist_path
@@ -57,7 +57,7 @@ class JsonSerializer(BaseSerializer):
class BsonSerializer(BaseSerializer):
- """Serializes data in binary json using the bson python package."""
+ """Serializes data in binary json using the `bson` python package."""
def __init__(self, persist_path: str) -> None:
super().__init__(persist_path)
@@ -77,7 +77,7 @@ class BsonSerializer(BaseSerializer):
class ParquetSerializer(BaseSerializer):
- """Serializes data in Apache Parquet format using the pyarrow package."""
+ """Serializes data in `Apache Parquet` format using the `pyarrow` package."""
def __init__(self, persist_path: str) -> None:
super().__init__(persist_path)
@@ -125,8 +125,8 @@ class SKLearnVectorStoreException(RuntimeError):
class SKLearnVectorStore(VectorStore):
- """A simple in-memory vector store based on the scikit-learn library
- NearestNeighbors implementation."""
+ """Simple in-memory vector store based on the `scikit-learn` library
+ `NearestNeighbors` implementation."""
def __init__(
self,
diff --git a/libs/langchain/langchain/vectorstores/starrocks.py b/libs/langchain/langchain/vectorstores/starrocks.py
index d6abc40af..a172d1bbb 100644
--- a/libs/langchain/langchain/vectorstores/starrocks.py
+++ b/libs/langchain/langchain/vectorstores/starrocks.py
@@ -1,5 +1,3 @@
-"""Wrapper around open source StarRocks VectorSearch capability."""
-
from __future__ import annotations
import json
@@ -69,7 +67,7 @@ def get_named_result(connection: Any, query: str) -> List[dict[str, Any]]:
class StarRocksSettings(BaseSettings):
- """StarRocks Client Configuration
+ """StarRocks client configuration.
Attribute:
StarRocks_host (str) : An URL to connect to MyScale backend.
@@ -121,7 +119,7 @@ class StarRocksSettings(BaseSettings):
class StarRocks(VectorStore):
- """Wrapper around StarRocks vector database
+ """`StarRocks` vector store.
You need a `pymysql` python package, and a valid account
to connect to StarRocks.
diff --git a/libs/langchain/langchain/vectorstores/supabase.py b/libs/langchain/langchain/vectorstores/supabase.py
index a0f918309..d91196534 100644
--- a/libs/langchain/langchain/vectorstores/supabase.py
+++ b/libs/langchain/langchain/vectorstores/supabase.py
@@ -26,7 +26,9 @@ if TYPE_CHECKING:
class SupabaseVectorStore(VectorStore):
- """VectorStore for a Supabase postgres database. Assumes you have the `pgvector`
+ """`Supabase Postgres` vector store.
+
+ It assumes you have the `pgvector`
extension installed and a `match_documents` (or similar) function. For more details:
https://integrations.langchain.com/vectorstores?integration_name=SupabaseVectorStore
@@ -92,7 +94,7 @@ class SupabaseVectorStore(VectorStore):
try:
import supabase # noqa: F401
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import supabase python package. "
"Please install it with `pip install supabase`."
)
diff --git a/libs/langchain/langchain/vectorstores/tair.py b/libs/langchain/langchain/vectorstores/tair.py
index 26108da59..e55ea4283 100644
--- a/libs/langchain/langchain/vectorstores/tair.py
+++ b/libs/langchain/langchain/vectorstores/tair.py
@@ -1,4 +1,3 @@
-"""Wrapper around Tair Vector."""
from __future__ import annotations
import json
@@ -19,7 +18,7 @@ def _uuid_key() -> str:
class Tair(VectorStore):
- """Wrapper around Tair Vector store."""
+ """`Tair` vector store."""
def __init__(
self,
diff --git a/libs/langchain/langchain/vectorstores/tigris.py b/libs/langchain/langchain/vectorstores/tigris.py
index 036103999..6abbfe676 100644
--- a/libs/langchain/langchain/vectorstores/tigris.py
+++ b/libs/langchain/langchain/vectorstores/tigris.py
@@ -15,12 +15,14 @@ if TYPE_CHECKING:
class Tigris(VectorStore):
+ """`Tigris` vector store."""
+
def __init__(self, client: TigrisClient, embeddings: Embeddings, index_name: str):
- """Initialize Tigris vector store"""
+ """Initialize Tigris vector store."""
try:
import tigrisdb # noqa: F401
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import tigrisdb python package. "
"Please install it with `pip install tigrisdb`"
)
diff --git a/libs/langchain/langchain/vectorstores/typesense.py b/libs/langchain/langchain/vectorstores/typesense.py
index a35b2cf0c..cdaaa5155 100644
--- a/libs/langchain/langchain/vectorstores/typesense.py
+++ b/libs/langchain/langchain/vectorstores/typesense.py
@@ -1,4 +1,3 @@
-"""Wrapper around Typesense vector search"""
from __future__ import annotations
import uuid
@@ -15,7 +14,7 @@ if TYPE_CHECKING:
class Typesense(VectorStore):
- """Wrapper around Typesense vector search.
+ """`Typesense` vector store.
To use, you should have the ``typesense`` python package installed.
@@ -61,7 +60,7 @@ class Typesense(VectorStore):
try:
from typesense import Client
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import typesense python package. "
"Please install it with `pip install typesense`."
)
diff --git a/libs/langchain/langchain/vectorstores/usearch.py b/libs/langchain/langchain/vectorstores/usearch.py
index 1d0e754e1..cb4e8c6eb 100644
--- a/libs/langchain/langchain/vectorstores/usearch.py
+++ b/libs/langchain/langchain/vectorstores/usearch.py
@@ -1,4 +1,3 @@
-"""Wrapper around USearch vector database."""
from __future__ import annotations
from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -27,7 +26,8 @@ def dependable_usearch_import() -> Any:
class USearch(VectorStore):
- """Wrapper around USearch vector database.
+ """`USearch` vector store.
+
To use, you should have the ``usearch`` python package installed.
"""
diff --git a/libs/langchain/langchain/vectorstores/vectara.py b/libs/langchain/langchain/vectorstores/vectara.py
index cd8ee9c9f..eee2f1abe 100644
--- a/libs/langchain/langchain/vectorstores/vectara.py
+++ b/libs/langchain/langchain/vectorstores/vectara.py
@@ -1,4 +1,3 @@
-"""Wrapper around Vectara vector database."""
from __future__ import annotations
import json
@@ -18,7 +17,7 @@ logger = logging.getLogger(__name__)
class Vectara(VectorStore):
- """Implementation of Vector Store using Vectara.
+ """`Vectara API` vector store.
See (https://vectara.com).
@@ -426,7 +425,7 @@ class Vectara(VectorStore):
class VectaraRetriever(VectorStoreRetriever):
- """Retriever class for Vectara."""
+ """Retriever class for `Vectara`."""
vectorstore: Vectara
"""Vectara vectorstore."""
diff --git a/libs/langchain/langchain/vectorstores/weaviate.py b/libs/langchain/langchain/vectorstores/weaviate.py
index 0f54801f9..5f09a785d 100644
--- a/libs/langchain/langchain/vectorstores/weaviate.py
+++ b/libs/langchain/langchain/vectorstores/weaviate.py
@@ -1,4 +1,3 @@
-"""Wrapper around weaviate vector database."""
from __future__ import annotations
import datetime
@@ -44,7 +43,7 @@ def _create_weaviate_client(**kwargs: Any) -> Any:
try:
import weaviate
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import weaviate python package. "
"Please install it with `pip install weaviate-client`"
)
@@ -70,7 +69,7 @@ def _json_serializable(value: Any) -> Any:
class Weaviate(VectorStore):
- """Wrapper around Weaviate vector database.
+ """`Weaviate` vector store.
To use, you should have the ``weaviate-client`` python package installed.
diff --git a/libs/langchain/langchain/vectorstores/xata.py b/libs/langchain/langchain/vectorstores/xata.py
index ef25bc1a6..441fee825 100644
--- a/libs/langchain/langchain/vectorstores/xata.py
+++ b/libs/langchain/langchain/vectorstores/xata.py
@@ -1,5 +1,3 @@
-"""Wrapper around Xata as a vector database."""
-
from __future__ import annotations
import time
@@ -12,7 +10,9 @@ from langchain.vectorstores.base import VectorStore
class XataVectorStore(VectorStore):
- """VectorStore for a Xata database. Assumes you have a Xata database
+ """`Xata` vector store.
+
+ It assumes you have a Xata database
created with the right schema. See the guide at:
https://integrations.langchain.com/vectorstores?integration_name=XataVectorStore
@@ -29,7 +29,7 @@ class XataVectorStore(VectorStore):
try:
from xata.client import XataClient # noqa: F401
except ImportError:
- raise ValueError(
+ raise ImportError(
"Could not import xata python package. "
"Please install it with `pip install xata`."
)
diff --git a/libs/langchain/langchain/vectorstores/zep.py b/libs/langchain/langchain/vectorstores/zep.py
index faf04a139..98593ec91 100644
--- a/libs/langchain/langchain/vectorstores/zep.py
+++ b/libs/langchain/langchain/vectorstores/zep.py
@@ -22,8 +22,7 @@ logger = logging.getLogger()
@dataclass
class CollectionConfig:
- """
- A configuration class for a Zep Collection.
+ """Configuration for a `Zep Collection`.
If the collection does not exist, it will be created.
@@ -46,9 +45,9 @@ class CollectionConfig:
class ZepVectorStore(VectorStore):
- """
- ZepVectorStore is a VectorStore implementation that uses the Zep long-term memory
- store as a backend. It provides methods for adding texts or documents to the store,
+ """`Zep` vector store.
+
+ It provides methods for adding texts or documents to the store,
searching for similar documents, and deleting documents.
Search scores are calculated using cosine similarity normalized to [0, 1].
diff --git a/libs/langchain/langchain/vectorstores/zilliz.py b/libs/langchain/langchain/vectorstores/zilliz.py
index 835a03a74..8a571aca3 100644
--- a/libs/langchain/langchain/vectorstores/zilliz.py
+++ b/libs/langchain/langchain/vectorstores/zilliz.py
@@ -10,9 +10,9 @@ logger = logging.getLogger(__name__)
class Zilliz(Milvus):
- """Initialize wrapper around the Zilliz vector database.
+ """`Zilliz` vector store.
- In order to use this you need to have `pymilvus` installed and a
+ You need to have `pymilvus` installed and a
running Zilliz database.
See the following documentation for how to run a Zilliz instance:
From b048236c1a01d2bbcdb5c5c689246a69f13e51a9 Mon Sep 17 00:00:00 2001
From: Leonid Ganeline
Date: Wed, 23 Aug 2023 23:17:47 -0700
Subject: [PATCH 104/143] =?UTF-8?q?=F0=9F=93=96=20docs:=20`integrations/ag?=
=?UTF-8?q?ent=5Ftoolkits`=20=20(#9333)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Note: There are no changes in the file names!
- The group name on the main navbar changed: `Agent toolkits` -> `Agents
& Toolkits`. The examples here are a mix of Agent and Toolkit examples
because Agents and Toolkits are always used together in the examples.
- Titles changed: removed "Agent" and "Toolkit" suffixes. The reason is
the same.
- Formatting: mostly cleaned up the header structure so that it renders
better in the right-side navbar.
The main navbar looks much cleaner now.
---
.../integrations/toolkits/amadeus.ipynb | 9 +-
.../toolkits/azure_cognitive_services.ipynb | 8 +-
docs/extras/integrations/toolkits/csv.ipynb | 28 ++--
.../document_comparison_toolkit.ipynb | 26 +---
.../extras/integrations/toolkits/github.ipynb | 26 ++--
docs/extras/integrations/toolkits/gmail.ipynb | 6 +-
docs/extras/integrations/toolkits/index.mdx | 5 +-
docs/extras/integrations/toolkits/jira.ipynb | 74 +++++----
docs/extras/integrations/toolkits/json.ipynb | 7 +-
.../integrations/toolkits/multion.ipynb | 10 +-
.../integrations/toolkits/office365.ipynb | 9 +-
.../integrations/toolkits/openapi.ipynb | 14 +-
.../integrations/toolkits/openapi_nla.ipynb | 8 +-
.../extras/integrations/toolkits/pandas.ipynb | 14 +-
.../integrations/toolkits/playwright.ipynb | 22 +--
.../integrations/toolkits/powerbi.ipynb | 146 +++++++++---------
.../extras/integrations/toolkits/python.ipynb | 12 +-
docs/extras/integrations/toolkits/spark.ipynb | 19 ++-
.../integrations/toolkits/spark_sql.ipynb | 31 ++--
.../integrations/toolkits/sql_database.ipynb | 21 +--
.../integrations/toolkits/vectorstore.ipynb | 12 +-
.../integrations/toolkits/xorbits.ipynb | 8 +-
22 files changed, 248 insertions(+), 267 deletions(-)
diff --git a/docs/extras/integrations/toolkits/amadeus.ipynb b/docs/extras/integrations/toolkits/amadeus.ipynb
index afcaaccfb..baa9288dc 100644
--- a/docs/extras/integrations/toolkits/amadeus.ipynb
+++ b/docs/extras/integrations/toolkits/amadeus.ipynb
@@ -1,13 +1,12 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Amadeus Toolkit\n",
+ "# Amadeus\n",
"\n",
- "This notebook walks you through connecting LangChain to the Amadeus travel information API\n",
+ "This notebook walks you through connecting LangChain to the `Amadeus` travel information API\n",
"\n",
"To use this toolkit, you will need to set up your credentials explained in the [Amadeus for developers getting started overview](https://developers.amadeus.com/get-started/get-started-with-self-service-apis-335). Once you've received a AMADEUS_CLIENT_ID and AMADEUS_CLIENT_SECRET, you can input them as environmental variables below."
]
@@ -22,7 +21,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -46,7 +44,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -234,7 +231,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.4"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb
index 669519ba2..609cc2e4e 100644
--- a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb
+++ b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb
@@ -4,9 +4,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Azure Cognitive Services Toolkit\n",
+ "# Azure Cognitive Services\n",
"\n",
- "This toolkit is used to interact with the Azure Cognitive Services API to achieve some multimodal capabilities.\n",
+ "This toolkit is used to interact with the `Azure Cognitive Services API` to achieve some multimodal capabilities.\n",
"\n",
"Currently There are four tools bundled in this toolkit:\n",
"- AzureCogsImageAnalysisTool: used to extract caption, objects, tags, and text from images. (Note: this tool is not available on Mac OS yet, due to the dependency on `azure-ai-vision` package, which is only supported on Windows and Linux currently.)\n",
@@ -264,9 +264,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.3"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/docs/extras/integrations/toolkits/csv.ipynb b/docs/extras/integrations/toolkits/csv.ipynb
index 5a0ff426a..d64484d8e 100644
--- a/docs/extras/integrations/toolkits/csv.ipynb
+++ b/docs/extras/integrations/toolkits/csv.ipynb
@@ -5,24 +5,14 @@
"id": "7094e328",
"metadata": {},
"source": [
- "# CSV Agent\n",
+ "# CSV\n",
"\n",
- "This notebook shows how to use agents to interact with a csv. It is mostly optimized for question answering.\n",
+ "This notebook shows how to use agents to interact with data in `CSV` format. It is mostly optimized for question answering.\n",
"\n",
"**NOTE: this agent calls the Pandas DataFrame agent under the hood, which in turn calls the Python agent, which executes LLM generated Python code - this can be bad if the LLM generated Python code is harmful. Use cautiously.**\n",
"\n"
]
},
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "827982c7",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.agents import create_csv_agent"
- ]
- },
{
"cell_type": "code",
"execution_count": 2,
@@ -32,7 +22,9 @@
"source": [
"from langchain.llms import OpenAI\n",
"from langchain.chat_models import ChatOpenAI\n",
- "from langchain.agents.agent_types import AgentType"
+ "from langchain.agents.agent_types import AgentType\n",
+ "\n",
+ "from langchain.agents import create_csv_agent"
]
},
{
@@ -40,9 +32,9 @@
"id": "bd806175",
"metadata": {},
"source": [
- "## Using ZERO_SHOT_REACT_DESCRIPTION\n",
+ "## Using `ZERO_SHOT_REACT_DESCRIPTION`\n",
"\n",
- "This shows how to initialize the agent using the ZERO_SHOT_REACT_DESCRIPTION agent type. Note that this is an alternative to the above."
+ "This shows how to initialize the agent using the `ZERO_SHOT_REACT_DESCRIPTION` agent type. Note that this is an alternative to the above."
]
},
{
@@ -130,9 +122,7 @@
"cell_type": "code",
"execution_count": 5,
"id": "a96309be",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -305,7 +295,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/document_comparison_toolkit.ipynb b/docs/extras/integrations/toolkits/document_comparison_toolkit.ipynb
index 5dbe07551..7e79d0c36 100644
--- a/docs/extras/integrations/toolkits/document_comparison_toolkit.ipynb
+++ b/docs/extras/integrations/toolkits/document_comparison_toolkit.ipynb
@@ -91,9 +91,7 @@
"cell_type": "code",
"execution_count": 4,
"id": "c4d56c25",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -169,9 +167,7 @@
"cell_type": "code",
"execution_count": 6,
"id": "6db4c853",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -235,13 +231,7 @@
" \"prompts\": [\n",
" \"System: Use the following pieces of context to answer the users question. \\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\\n----------------\\nAlphabet Inc.\\nCONSOLIDATED STATEMENTS OF INCOME\\n(In millions, except per share amounts, unaudited)\\nQuarter Ended March 31,\\n2022 2023\\nRevenues $ 68,011 $ 69,787 \\nCosts and expenses:\\nCost of revenues 29,599 30,612 \\nResearch and development 9,119 11,468 \\nSales and marketing 5,825 6,533 \\nGeneral and administrative 3,374 3,759 \\nTotal costs and expenses 47,917 52,372 \\nIncome from operations 20,094 17,415 \\nOther income (expense), net (1,160) 790 \\nIncome before income taxes 18,934 18,205 \\nProvision for income taxes 2,498 3,154 \\nNet income $ 16,436 $ 15,051 \\nBasic earnings per share of Class A, Class B, and Class C stock $ 1.24 $ 1.18 \\nDiluted earnings per share of Class A, Class B, and Class C stock $ 1.23 $ 1.17 \\nNumber of shares used in basic earnings per share calculation 13,203 12,781 \\nNumber of shares used in diluted earnings per share calculation 13,351 12,823 \\n6\\n\\nAlphabet Announces First Quarter 2023 Results\\nMOUNTAIN VIEW, Calif. – April 25, 2023 – Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced financial \\nresults for the quarter ended March 31, 2023 .\\nSundar Pichai, CEO of Alphabet and Google, said: “We are pleased with our business performance in the first \\nquarter, with Search performing well and momentum in Cloud. We introduced important product updates anchored \\nin deep computer science and AI. Our North Star is providing the most helpful answers for our users, and we see \\nhuge opportunities ahead, continuing our long track record of innovation.”\\nRuth Porat, CFO of Alphabet and Google, said: “Resilience in Search and momentum in Cloud resulted in Q1 \\nconsolidated revenues of $69.8 billion, up 3% year over year, or up 6% in constant currency. 
We remain committed \\nto delivering long-term growth and creating capacity to invest in our most compelling growth areas by re-engineering \\nour cost base.”\\nQ1 2023 financial highlights (unaudited)\\nOur first quarter 2023 results reflect:\\ni.$2.6 billion in charges related to reductions in our workforce and office space; \\nii.a $988 million reduction in depreciation expense from the change in estimated useful life of our servers and \\ncertain network equipment; and\\niii.a shift in the timing of our annual employee stock-based compensation awards resulting in relatively less \\nstock-based compensation expense recognized in the first quarter compared to the remaining quarters of \\nthe ye ar. The shift in timing itself will not affect the amount of stock-based compensation expense over the \\nfull fiscal year 2023.\\nFor further information, please refer to our blog post also filed with the SEC via Form 8-K on April 20, 2023.\\nThe following table summarizes our consolidated financial results for the quarters ended March 31, 2022 and 2023 \\n(in millions, except for per share information and percentages). \\nQuarter Ended March 31,\\n2022 2023\\nRevenues $ 68,011 $ 69,787 \\nChange in revenues year over year 23 % 3 %\\nChange in constant currency revenues year over year(1) 26 % 6 %\\nOperating income $ 20,094 $ 17,415 \\nOperating margin 30 % 25 %\\nOther income (expense), net $ (1,160) $ 790 \\nNet income $ 16,436 $ 15,051 \\nDiluted EPS $ 1.23 $ 1.17 \\n(1) Non-GAAP measure. 
See the table captioned “Reconciliation from GAAP revenues to non-GAAP constant currency \\nrevenues and GAAP percentage change in revenues to non-GAAP percentage change in constant currency revenues” for \\nmore details.\\n\\nQ1 2023 supplemental information (in millions, except for number of employees; unaudited)\\nRevenues, T raffic Acquisition Costs (TAC), and number of employees\\nQuarter Ended March 31,\\n2022 2023\\nGoogle Search & other $ 39,618 $ 40,359 \\nYouTube ads 6,869 6,693 \\nGoogle Network 8,174 7,496 \\nGoogle advertising 54,661 54,548 \\nGoogle other 6,811 7,413 \\nGoogle Services total 61,472 61,961 \\nGoogle Cloud 5,821 7,454 \\nOther Bets 440 288 \\nHedging gains (losses) 278 84 \\nTotal revenues $ 68,011 $ 69,787 \\nTotal TAC $ 11,990 $ 11,721 \\nNumber of employees(1) 163,906 190,711 \\n(1) As of March 31, 2023, the number of employees includes almost all of the employees affected by the reduction of our \\nworkforce. We expect most of those affected will no longer be reflected in our headcount by the end of the second quarter \\nof 2023, subject to local law and consultation requirements.\\nSegment Operating Results\\nReflecting DeepMind’s increasing collaboration with Google Services, Google Cloud, and Other Bets, beginning in \\nthe first quarter of 2023 DeepMind is reported as part of Alphabet’s unallocated corporate costs instead of within \\nOther Bets. Additionally, beginning in the first quarter of 2023, we updated and simplified our cost allocation \\nmethodologies to provide our business leaders with increased transparency for decision-making . Prior periods have \\nbeen recast to reflect the revised presentation and are shown in Recast Historical Segment Results below .\\nAs announced on April 20, 2023 , we are bringing together part of Google Research (the Brain Team) and DeepMind \\nto significantly accelerate our progress in AI. This change does not affect first quarter reporting. 
The group, called \\nGoogle DeepMind, will be reported within Alphabet's unallocated corporate costs beginning in the second quarter of \\n2023.\\nQuarter Ended March 31,\\n2022 2023\\n(recast)\\nOperating income (loss):\\nGoogle Services $ 21,973 $ 21,737 \\nGoogle Cloud (706) 191 \\nOther Bets (835) (1,225) \\nCorporate costs, unallocated(1) (338) (3,288) \\nTotal income from operations $ 20,094 $ 17,415 \\n(1)Hedging gains (losses) related to revenue included in unallocated corporate costs were $278 million and $84 million for the \\nthree months ended March 31, 2022 and 2023 , respectively. For the three months ended March 31, 2023, unallocated \\ncorporate costs include charges related to the reductions in our workforce and office space totaling $2.5 billion . \\n2\\n\\nSegment results\\nThe following table presents our segment revenues and operating income (loss) (in millions; unaudited):\\nQuarter Ended March 31,\\n2022 2023\\n(recast)\\nRevenues:\\nGoogle Services $ 61,472 $ 61,961 \\nGoogle Cloud 5,821 7,454 \\nOther Bets 440 288 \\nHedging gains (losses) 278 84 \\nTotal revenues $ 68,011 $ 69,787 \\nOperating income (loss):\\nGoogle Services $ 21,973 $ 21,737 \\nGoogle Cloud (706) 191 \\nOther Bets (835) (1,225) \\nCorporate costs, unallocated (338) (3,288) \\nTotal income from operations $ 20,094 $ 17,415 \\nWe report our segment results as Google Services, Google Cloud, and Other Bets:\\n•Google Services includes products and services such as ads, Android, Chrome, hardware, Google Maps, \\nGoogle Play, Search, and YouTube. Google Services generates revenues primarily from advertising; sales \\nof apps and in-app purchases, and hardware; and fees received for subscription-based products such as \\nYouTube Premium and YouTube TV.\\n•Google Cloud includes infrastructure and platform services, collaboration tools, and other services for \\nenterprise customers. 
Google Cloud generates revenues from fees received for Google Cloud Platform \\nservices, Google Workspace communication and collaboration tools, and other enterprise services.\\n•Other Bets is a combination of multiple operating segments that are not individually material. Revenues \\nfrom Other Bets are generated primarily from the sale of health technology and internet services.\\nAfter the segment reporting changes discussed above, unallocated corporate costs primarily include AI-focused \\nshared R&D activities; corporate initiatives such as our philanthropic activities; and corporate shared costs such as \\nfinance, certain human resource costs, and legal, including certain fines and settlements. In the first quarter of 2023, \\nunallocated corporate costs also include charges associated with reductions in our workforce and office space. \\nAdditionally, hedging gains (losses) related to revenue are included in unallocated corporate costs.\\nRecast Historical Segment Results\\nRecast historical segment results are as follows (in millions; unaudited):\\nQuarter Fiscal Year\\nRecast Historical Results\\nQ1 2022 Q2 2022 Q3 2022 Q4 2022 2021 2022\\nOperating income (loss):\\nGoogle Services $ 21,973 $ 21,621 $ 18,883 $ 20,222 $ 88,132 $ 82,699 \\nGoogle Cloud (706) (590) (440) (186) (2,282) (1,922) \\nOther Bets (835) (1,339) (1,225) (1,237) (4,051) (4,636) \\nCorporate costs, unallocated(1) (338) (239) (83) (639) (3,085) (1,299) \\nTotal income from operations $ 20,094 $ 19,453 $ 17,135 $ 18,160 $ 78,714 $ 74,842 \\n(1)Includes hedging gains (losses); in fiscal years 2021 and 2022 hedging gains of $149 million and $2.0 billion, respectively.\\n8\\nHuman: What was Alphabet's revenue?\"\n",
" ]\n",
- "}\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
+ "}\n",
"\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:alphabet-earnings > 4:chain:RetrievalQA > 5:chain:StuffDocumentsChain > 6:chain:LLMChain > 7:llm:ChatOpenAI] [1.61s] Exiting LLM run with output:\n",
"\u001b[0m{\n",
" \"generations\": [\n",
@@ -299,13 +289,7 @@
" \"prompts\": [\n",
" \"System: Use the following pieces of context to answer the users question. \\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\\n----------------\\nS U M M A R Y H I G H L I G H T S \\n(1) Excludes SBC (stock -based compensation).\\n(2) Free cash flow = operating cash flow less capex.\\n(3) Includes cash, cash equivalents and investments.Profitability 11.4% operating margin in Q1\\n$2.7B GAAP operating income in Q1\\n$2.5B GAAP net income in Q1\\n$2.9B non -GAAP net income1in Q1In the current macroeconomic environment, we see this year as a unique \\nopportunity for Tesla. As many carmakers are working through challenges with the \\nunit economics of their EV programs, we aim to leverage our position as a cost \\nleader. We are focused on rapidly growing production, investments in autonomy \\nand vehicle software, and remaining on track with our growth investments.\\nOur near -term pricing strategy considers a long -term view on per vehicle \\nprofitability given the potential lifetime value of a Tesla vehicle through autonomy, \\nsupercharging, connectivity and service. We expect that our product pricing will \\ncontinue to evolve, upwards or downwards, depending on a number of factors.\\nAlthough we implemented price reductions on many vehicle models across regions \\nin the first quarter, our operating margins reduced at a manageable rate. We \\nexpect ongoing cost reduction of our vehicles, including improved production \\nefficiency at our newest factories and lower logistics costs, and remain focused on \\noperating leverage as we scale.\\nWe are rapidly growing energy storage production capacity at our Megafactory in \\nLathrop and we recently announced a new Megafactory in Shanghai. We are also \\ncontinuing to execute on our product roadmap, including Cybertruck, our next \\ngeneration vehicle platform, autonomy and other AI enabled products. 
\\nOur balance sheet and net income enable us to continue to make these capital \\nexpenditures in line with our future growth. In this environment, we believe it \\nmakes sense to push forward to ensure we lay a proper foundation for the best \\npossible future.Cash Operating cash flow of $2.5B\\nFree cash flow2of $0.4B in Q1\\n$0.2B increase in our cash and investments3in Q1 to $22.4B\\nOperations Cybertruck factory tooling on track; producing Alpha versions\\nModel Y was the best -selling vehicle in Europe in Q1\\nModel Y was the best -selling vehicle in the US in Q1 (ex -pickups)\\n\\n01234O T H E R H I G H L I G H T S\\n9Services & Other gross margin\\nEnergy Storage deployments (GWh)Energy Storage\\nEnergy storage deployments increased by 360% YoY in Q1 to 3.9 GWh, the highest \\nlevel of deployments we have achieved due to ongoing Megafactory ramp. The ramp of our 40 GWh Megapack factory in Lathrop, California has been successful with still more room to reach full capacity. This Megapack factory will be the first of many. We recently announced our second 40 GWh Megafactory, this time in Shanghai, with construction starting later this year. \\nSolar\\nSolar deployments increased by 40% YoY in Q1 to 67 MW, but declined sequentially in \\nthe quarter, predominantly due to volatile weather and other factors. In addition, the solar industry has been impacted by supply chain challenges.\\nServices and Other\\nBoth revenue and gross profit from Services and Other reached an all -time high in Q1 \\n2023. Within this business division, growth of used vehicle sales remained strong YoY and had healthy margins. Supercharging, while still a relatively small part of the business, continued to grow as we gradually open up the network to non- Tesla \\nvehicles. 
\\n-4%-2%0%2%4%6%8%\\nQ3'21 Q4'21 Q1'22 Q2'22 Q3'22 Q4'22 Q1'23\\n\\nIn millions of USD or shares as applicable, except per share data Q1-2022 Q2-2022 Q3-2022 Q4-2022 Q1-2023\\nREVENUES\\nAutomotive sales 15,514 13,670 17,785 20,241 18,878 \\nAutomotive regulatory credits 679 344 286 467 521 \\nAutomotive leasing 668 588 621 599 564 \\nTotal automotive revenues 16,861 14,602 18,692 21,307 19,963 \\nEnergy generation and storage 616 866 1,117 1,310 1,529 \\nServices and other 1,279 1,466 1,645 1,701 1,837 \\nTotal revenues 18,756 16,934 21,454 24,318 23,329 \\nCOST OF REVENUES\\nAutomotive sales 10,914 10,153 13,099 15,433 15,422 \\nAutomotive leasing 408 368 381 352 333 \\nTotal automotive cost of revenues 11,322 10,521 13,480 15,785 15,755 \\nEnergy generation and storage 688 769 1,013 1,151 1,361 \\nServices and other 1,286 1,410 1,579 1,605 1,702 \\nTotal cost of revenues 13,296 12,700 16,072 18,541 18,818 \\nGross profit 5,460 4,234 5,382 5,777 4,511 \\nOPERATING EXPENSES\\nResearch and development 865 667 733 810 771 \\nSelling, general and administrative 992 961 961 1,032 1,076 \\nRestructuring and other — 142 — 34 —\\nTotal operating expenses 1,857 1,770 1,694 1,876 1,847 \\nINCOME FROM OPERATIONS 3,603 2,464 3,688 3,901 2,664 \\nInterest income 28 26 86 157 213 \\nInterest expense (61) (44) (53) (33) (29)\\nOther income (expense), net 56 28 (85) (42) (48)\\nINCOME BEFORE INCOME TAXES 3,626 2,474 3,636 3,983 2,800 \\nProvision for income taxes 346 205 305 276 261 \\nNET INCOME 3,280 2,269 3,331 3,707 2,539 \\nNet (loss) income attributable to noncontrolling interests and redeemable noncontrolling interests in \\nsubsidiaries(38) 10 39 20 26 \\nNET INCOME ATTRIBUTABLE TO COMMON STOCKHOLDERS 3,318 2,259 3,292 3,687 2,513 \\nNet income per share of common stock attributable to common stockholders(1)\\nBasic $ 1.07 $ 0.73 $ 1.05 $ 1.18 $ 0.80 \\nDiluted $ 0.95 $ 0.65 $ 0.95 $ 1.07 $ 0.73 \\nWeighted average shares used in computing net income per share of common 
stock(1)\\nBasic 3,103 3,111 3,146 3,160 3,166\\nDiluted 3,472 3,464 3,468 3,471 3,468\\nS T A T E M E N T O F O P E R A T I O N S\\n(Unaudited)\\n23 (1) Prior period results have been retroactively adjusted to reflect the three -for-one stock split effected in the form of a stock d ividend in August 2022.\\n\\nQ1-2022 Q2-2022 Q3-2022 Q4-2022 Q1-2023 YoY\\nModel S/X production 14,218 16,411 19,935 20,613 19,437 37%\\nModel 3/Y production 291,189 242,169 345,988 419,088 421,371 45%\\nTotal production 305,407 258,580 365,923 439,701 440,808 44%\\nModel S/X deliveries 14,724 16,162 18,672 17,147 10,695 -27%\\nModel 3/Y deliveries 295,324 238,533 325,158 388,131 412,180 40%\\nTotal deliveries 310,048 254,695 343,830 405,278 422,875 36%\\nof which subject to operating lease accounting 12,167 9,227 11,004 15,184 22,357 84%\\nTotal end of quarter operating lease vehicle count 128,402 131,756 135,054 140,667 153,988 20%\\nGlobal vehicle inventory (days of supply )(1)3 4 8 13 15 400%\\nSolar deployed (MW) 48 106 94 100 67 40%\\nStorage deployed (MWh) 846 1,133 2,100 2,462 3,889 360%\\nTesla locations(2)787 831 903 963 1,000 27%\\nMobile service fleet 1,372 1,453 1,532 1,584 1,692 23%\\nSupercharger stations 3,724 3,971 4,283 4,678 4,947 33%\\nSupercharger connectors 33,657 36,165 38,883 42,419 45,169 34%\\n(1)Days of supply is calculated by dividing new car ending inventory by the relevant quarter’s deliveries and using 75 trading days (aligned with Automotive News definition).\\n(2)Starting in Q1 -2023, we revised our methodology for reporting Tesla’s physical footprint. This count now includes all sales, del ivery, body shop and service locations globally. O P E R A T I O N A L S U M MA R Y\\n(Unaudited)\\n6\\nHuman: What was Tesla's revenue?\"\n",
" ]\n",
- "}\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
+ "}\n",
"\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 8:tool:tesla-earnings > 9:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain > 12:llm:ChatOpenAI] [1.17s] Exiting LLM run with output:\n",
"\u001b[0m{\n",
" \"generations\": [\n",
@@ -427,7 +411,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/github.ipynb b/docs/extras/integrations/toolkits/github.ipynb
index bcaa5abd4..36d13cb7f 100644
--- a/docs/extras/integrations/toolkits/github.ipynb
+++ b/docs/extras/integrations/toolkits/github.ipynb
@@ -4,9 +4,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Github Toolkit\n",
+ "# Github\n",
"\n",
- "The Github toolkit contains tools that enable an LLM agent to interact with a github repository. The tools are a wrapper for the [PyGitHub](https://github.com/PyGithub/PyGithub) library. \n",
+ "The `Github` toolkit contains tools that enable an LLM agent to interact with a github repository. \n",
+ "The tool is a wrapper for the [PyGitHub](https://github.com/PyGithub/PyGithub) library. \n",
"\n",
"## Quickstart\n",
"1. Install the pygithub library\n",
@@ -38,7 +39,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 1. Install the pygithub library"
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Install the `pygithub` library "
]
},
{
@@ -58,7 +66,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 2. Create a Github App\n",
+ "### 2. Create a Github App\n",
"\n",
"[Follow the instructions here](https://docs.github.com/en/apps/creating-github-apps/registering-a-github-app/registering-a-github-app) to create and register a Github app. Make sure your app has the following [repository permissions:](https://docs.github.com/en/rest/overview/permissions-required-for-github-apps?apiVersion=2022-11-28)\n",
"* Commit statuses (read only)\n",
@@ -71,7 +79,7 @@
"\n",
"Once the app has been registered, add it to the repository you wish the bot to act upon.\n",
"\n",
- "## 3. Set Environmental Variables\n",
+ "### 3. Set Environmental Variables\n",
"\n",
"Before initializing your agent, the following environmental variables need to be set:\n",
"\n",
@@ -86,7 +94,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Example Usage- Simple Agent"
+ "## Example: Simple Agent"
]
},
{
@@ -212,7 +220,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Example Usage- Advanced Agent\n",
+ "## Example: Advanced Agent\n",
"\n",
"If your agent does not need to use all 8 tools, you can build tools individually to use. For this example, we'll make an agent that does not use the create_file, delete_file or create_pull_request tools, but can also use duckduckgo-search."
]
@@ -375,9 +383,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.16"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/docs/extras/integrations/toolkits/gmail.ipynb b/docs/extras/integrations/toolkits/gmail.ipynb
index e2d6fee59..d24ded1f3 100644
--- a/docs/extras/integrations/toolkits/gmail.ipynb
+++ b/docs/extras/integrations/toolkits/gmail.ipynb
@@ -4,9 +4,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Gmail Toolkit\n",
+ "# Gmail\n",
"\n",
- "This notebook walks through connecting a LangChain email to the Gmail API.\n",
+ "This notebook walks through connecting a LangChain email to the `Gmail API`.\n",
"\n",
"To use this toolkit, you will need to set up your credentials explained in the [Gmail API docs](https://developers.google.com/gmail/api/quickstart/python#authorize_credentials_for_a_desktop_application). Once you've downloaded the `credentials.json` file, you can start using the Gmail API. Once this is done, we'll install the required libraries."
]
@@ -226,7 +226,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.2"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/index.mdx b/docs/extras/integrations/toolkits/index.mdx
index 164addc70..65f3854d3 100644
--- a/docs/extras/integrations/toolkits/index.mdx
+++ b/docs/extras/integrations/toolkits/index.mdx
@@ -2,7 +2,10 @@
sidebar_position: 0
---
-# Agent toolkits
+# Agents & Toolkits
+
+Agents and Toolkits are placed in the same directory because they are always used together.
+
import DocCardList from "@theme/DocCardList";
diff --git a/docs/extras/integrations/toolkits/jira.ipynb b/docs/extras/integrations/toolkits/jira.ipynb
index 9d32bab37..39480eeb5 100644
--- a/docs/extras/integrations/toolkits/jira.ipynb
+++ b/docs/extras/integrations/toolkits/jira.ipynb
@@ -1,15 +1,15 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"id": "245a954a",
"metadata": {},
"source": [
"# Jira\n",
"\n",
- "This notebook goes over how to use the Jira tool.\n",
- "The Jira tool allows agents to interact with a given Jira instance, performing actions such as searching for issues and creating issues, the tool wraps the atlassian-python-api library, for more see: https://atlassian-python-api.readthedocs.io/jira.html\n",
+ "This notebook goes over how to use the `Jira` toolkit.\n",
+ "\n",
+ "The `Jira` toolkit allows agents to interact with a given Jira instance, performing actions such as searching for issues and creating issues. The toolkit wraps the `atlassian-python-api` library; for more, see: https://atlassian-python-api.readthedocs.io/jira.html\n",
"\n",
"To use this tool, you must first set as environment variables:\n",
" JIRA_API_TOKEN\n",
@@ -22,12 +22,12 @@
"execution_count": null,
"id": "961b3689",
"metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-17T10:21:20.168639Z",
+ "start_time": "2023-04-17T10:21:18.698672Z"
+ },
"vscode": {
"languageId": "shellscript"
- },
- "ExecuteTime": {
- "start_time": "2023-04-17T10:21:18.698672Z",
- "end_time": "2023-04-17T10:21:20.168639Z"
}
},
"outputs": [],
@@ -41,8 +41,8 @@
"id": "34bb5968",
"metadata": {
"ExecuteTime": {
- "start_time": "2023-04-17T10:21:22.911233Z",
- "end_time": "2023-04-17T10:21:23.730922Z"
+ "end_time": "2023-04-17T10:21:23.730922Z",
+ "start_time": "2023-04-17T10:21:22.911233Z"
}
},
"outputs": [],
@@ -58,21 +58,24 @@
{
"cell_type": "code",
"execution_count": 4,
+ "id": "b3050b55",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-17T10:22:42.505412Z",
+ "start_time": "2023-04-17T10:22:42.499447Z"
+ },
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
"outputs": [],
"source": [
"os.environ[\"JIRA_API_TOKEN\"] = \"abc\"\n",
"os.environ[\"JIRA_USERNAME\"] = \"123\"\n",
"os.environ[\"JIRA_INSTANCE_URL\"] = \"https://jira.atlassian.com\"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"xyz\""
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "start_time": "2023-04-17T10:22:42.499447Z",
- "end_time": "2023-04-17T10:22:42.505412Z"
- }
- },
- "id": "b3050b55"
+ ]
},
{
"cell_type": "code",
@@ -80,8 +83,8 @@
"id": "ac4910f8",
"metadata": {
"ExecuteTime": {
- "start_time": "2023-04-17T10:22:44.664481Z",
- "end_time": "2023-04-17T10:22:44.720538Z"
+ "end_time": "2023-04-17T10:22:44.720538Z",
+ "start_time": "2023-04-17T10:22:44.664481Z"
}
},
"outputs": [],
@@ -97,6 +100,17 @@
{
"cell_type": "code",
"execution_count": 9,
+ "id": "d5461370",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-17T10:23:38.121883Z",
+ "start_time": "2023-04-17T10:23:33.662454Z"
+ },
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -117,7 +131,9 @@
},
{
"data": {
- "text/plain": "'A new issue has been created in project PW with the summary \"Make more fried rice\" and description \"Reminder to make more fried rice\".'"
+ "text/plain": [
+ "'A new issue has been created in project PW with the summary \"Make more fried rice\" and description \"Reminder to make more fried rice\".'"
+ ]
},
"execution_count": 9,
"metadata": {},
@@ -126,20 +142,12 @@
],
"source": [
"agent.run(\"make a new issue in project PW to remind me to make more fried rice\")"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "start_time": "2023-04-17T10:23:33.662454Z",
- "end_time": "2023-04-17T10:23:38.121883Z"
- }
- },
- "id": "d5461370"
+ ]
}
],
"metadata": {
"kernelspec": {
- "display_name": ".venv",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -153,7 +161,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.7"
+ "version": "3.10.12"
},
"vscode": {
"interpreter": {
@@ -163,4 +171,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/docs/extras/integrations/toolkits/json.ipynb b/docs/extras/integrations/toolkits/json.ipynb
index ec34583dd..896141013 100644
--- a/docs/extras/integrations/toolkits/json.ipynb
+++ b/docs/extras/integrations/toolkits/json.ipynb
@@ -5,9 +5,10 @@
"id": "85fb2c03-ab88-4c8c-97e3-a7f2954555ab",
"metadata": {},
"source": [
- "# JSON Agent\n",
+ "# JSON\n",
"\n",
- "This notebook showcases an agent designed to interact with large JSON/dict objects. This is useful when you want to answer questions about a JSON blob that's too large to fit in the context window of an LLM. The agent is able to iteratively explore the blob to find what it needs to answer the user's question.\n",
+ "This notebook showcases an agent interacting with large `JSON/dict` objects. \n",
+ "This is useful when you want to answer questions about a JSON blob that's too large to fit in the context window of an LLM. The agent is able to iteratively explore the blob to find what it needs to answer the user's question.\n",
"\n",
"In the below example, we are using the OpenAPI spec for the OpenAI API, which you can find [here](https://github.com/openai/openai-openapi/blob/master/openapi.yaml).\n",
"\n",
@@ -179,7 +180,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/multion.ipynb b/docs/extras/integrations/toolkits/multion.ipynb
index 3382af621..5502d3e70 100644
--- a/docs/extras/integrations/toolkits/multion.ipynb
+++ b/docs/extras/integrations/toolkits/multion.ipynb
@@ -1,15 +1,14 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "# MultiOn Toolkit\n",
+ "# MultiOn\n",
"\n",
- "This notebook walks you through connecting LangChain to the MultiOn Client in your browser\n",
+ "This notebook walks you through connecting LangChain to the `MultiOn` Client in your browser.\n",
"\n",
- "To use this toolkit, you will need to add MultiOn Extension to your browser as explained in the [MultiOn for Chrome](https://multion.notion.site/Download-MultiOn-ddddcfe719f94ab182107ca2612c07a5)."
+ "To use this toolkit, you will need to add `MultiOn Extension` to your browser as explained in the [MultiOn for Chrome](https://multion.notion.site/Download-MultiOn-ddddcfe719f94ab182107ca2612c07a5)."
]
},
{
@@ -47,7 +46,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -127,7 +125,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.4"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/office365.ipynb b/docs/extras/integrations/toolkits/office365.ipynb
index 704ceec4e..350bcc049 100644
--- a/docs/extras/integrations/toolkits/office365.ipynb
+++ b/docs/extras/integrations/toolkits/office365.ipynb
@@ -1,13 +1,12 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Office365 Toolkit\n",
+ "# Office365\n",
"\n",
- "This notebook walks through connecting LangChain to Office365 email and calendar.\n",
+ "This notebook walks through connecting LangChain to `Office365` email and calendar.\n",
"\n",
"To use this toolkit, you will need to set up your credentials explained in the [Microsoft Graph authentication and authorization overview](https://learn.microsoft.com/en-us/graph/auth/). Once you've received a CLIENT_ID and CLIENT_SECRET, you can input them as environmental variables below."
]
@@ -23,7 +22,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -42,7 +40,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -238,7 +235,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.3"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/openapi.ipynb b/docs/extras/integrations/toolkits/openapi.ipynb
index 3e5e4d136..f97532e36 100644
--- a/docs/extras/integrations/toolkits/openapi.ipynb
+++ b/docs/extras/integrations/toolkits/openapi.ipynb
@@ -5,9 +5,9 @@
"id": "85fb2c03-ab88-4c8c-97e3-a7f2954555ab",
"metadata": {},
"source": [
- "# OpenAPI agents\n",
+ "# OpenAPI\n",
"\n",
- "We can construct agents to consume arbitrary APIs, here APIs conformant to the OpenAPI/Swagger specification."
+ "We can construct agents to consume arbitrary APIs, here APIs conformant to the `OpenAPI`/`Swagger` specification."
]
},
{
@@ -271,9 +271,7 @@
"cell_type": "code",
"execution_count": 9,
"id": "38762cc0",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -449,9 +447,7 @@
"cell_type": "code",
"execution_count": 28,
"id": "3a9cc939",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -773,7 +769,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/openapi_nla.ipynb b/docs/extras/integrations/toolkits/openapi_nla.ipynb
index c2f3b90e4..a731e282d 100644
--- a/docs/extras/integrations/toolkits/openapi_nla.ipynb
+++ b/docs/extras/integrations/toolkits/openapi_nla.ipynb
@@ -7,7 +7,9 @@
"source": [
"# Natural Language APIs\n",
"\n",
- "Natural Language API Toolkits (NLAToolkits) permit LangChain Agents to efficiently plan and combine calls across endpoints. This notebook demonstrates a sample composition of the Speak, Klarna, and Spoonacluar APIs.\n",
+ "`Natural Language API` Toolkits (`NLAToolkits`) permit LangChain Agents to efficiently plan and combine calls across endpoints. \n",
+ "\n",
+ "This notebook demonstrates a sample composition of the `Speak`, `Klarna`, and `Spoonacular` APIs.\n",
"\n",
"For a detailed walkthrough of the OpenAPI chains wrapped within the NLAToolkit, see the [OpenAPI Operation Chain](/docs/use_cases/apis/openapi.html) notebook.\n",
"\n",
@@ -182,7 +184,7 @@
"id": "c61d92a8",
"metadata": {},
"source": [
- "### Using Auth + Adding more Endpoints\n",
+ "### Use Auth and add more Endpoints\n",
"\n",
"Some endpoints may require user authentication via things like access tokens. Here we show how to pass in the authentication information via the `Requests` wrapper object.\n",
"\n",
@@ -420,7 +422,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.3"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/pandas.ipynb b/docs/extras/integrations/toolkits/pandas.ipynb
index b54b0076c..000eaa0dc 100644
--- a/docs/extras/integrations/toolkits/pandas.ipynb
+++ b/docs/extras/integrations/toolkits/pandas.ipynb
@@ -5,11 +5,11 @@
"id": "c81da886",
"metadata": {},
"source": [
- "# Pandas Dataframe Agent\n",
+ "# Pandas Dataframe\n",
"\n",
- "This notebook shows how to use agents to interact with a pandas dataframe. It is mostly optimized for question answering.\n",
+ "This notebook shows how to use agents to interact with a `Pandas DataFrame`. It is mostly optimized for question answering.\n",
"\n",
- "**NOTE: this agent calls the Python agent under the hood, which executes LLM generated Python code - this can be bad if the LLM generated Python code is harmful. Use cautiously.**"
+ "**NOTE: this agent calls the `Python` agent under the hood, which executes LLM generated Python code - this can be bad if the LLM generated Python code is harmful. Use cautiously.**"
]
},
{
@@ -42,9 +42,9 @@
"id": "a62858e2",
"metadata": {},
"source": [
- "## Using ZERO_SHOT_REACT_DESCRIPTION\n",
+ "## Using `ZERO_SHOT_REACT_DESCRIPTION`\n",
"\n",
- "This shows how to initialize the agent using the ZERO_SHOT_REACT_DESCRIPTION agent type. Note that this is an alternative to the above."
+ "This shows how to initialize the agent using the `ZERO_SHOT_REACT_DESCRIPTION` agent type. Note that this is an alternative to the above."
]
},
{
@@ -212,7 +212,7 @@
"id": "c4bc0584",
"metadata": {},
"source": [
- "### Multi DataFrame Example\n",
+ "## Multi DataFrame Example\n",
"\n",
"This next part shows how the agent can interact with multiple dataframes passed in as a list."
]
@@ -292,7 +292,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/playwright.ipynb b/docs/extras/integrations/toolkits/playwright.ipynb
index 50d2825da..ccf569506 100644
--- a/docs/extras/integrations/toolkits/playwright.ipynb
+++ b/docs/extras/integrations/toolkits/playwright.ipynb
@@ -4,17 +4,19 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# PlayWright Browser Toolkit\n",
+ "# PlayWright Browser\n",
"\n",
- "This toolkit is used to interact with the browser. While other tools (like the Requests tools) are fine for static sites, Browser toolkits let your agent navigate the web and interact with dynamically rendered sites. Some tools bundled within the Browser toolkit include:\n",
+ "This toolkit is used to interact with the browser. While other tools (like the `Requests` tools) are fine for static sites, `PlayWright Browser` toolkits let your agent navigate the web and interact with dynamically rendered sites. \n",
"\n",
- "- NavigateTool (navigate_browser) - navigate to a URL\n",
- "- NavigateBackTool (previous_page) - wait for an element to appear\n",
- "- ClickTool (click_element) - click on an element (specified by selector)\n",
- "- ExtractTextTool (extract_text) - use beautiful soup to extract text from the current web page\n",
- "- ExtractHyperlinksTool (extract_hyperlinks) - use beautiful soup to extract hyperlinks from the current web page\n",
- "- GetElementsTool (get_elements) - select elements by CSS selector\n",
- "- CurrentPageTool (current_page) - get the current page URL\n"
+ "Some tools bundled within the `PlayWright Browser` toolkit include:\n",
+ "\n",
+ "- `NavigateTool` (navigate_browser) - navigate to a URL\n",
+ "- `NavigateBackTool` (previous_page) - wait for an element to appear\n",
+ "- `NavigateBackTool` (previous_page) - navigate back to the previous page in the browser history\n",
+ "- `ExtractTextTool` (extract_text) - use beautiful soup to extract text from the current web page\n",
+ "- `ExtractHyperlinksTool` (extract_hyperlinks) - use beautiful soup to extract hyperlinks from the current web page\n",
+ "- `GetElementsTool` (get_elements) - select elements by CSS selector\n",
+ "- `CurrentPageTool` (current_page) - get the current page URL\n"
]
},
{
@@ -327,7 +329,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.2"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/powerbi.ipynb b/docs/extras/integrations/toolkits/powerbi.ipynb
index 8ca60a965..475e66e61 100644
--- a/docs/extras/integrations/toolkits/powerbi.ipynb
+++ b/docs/extras/integrations/toolkits/powerbi.ipynb
@@ -2,36 +2,40 @@
"cells": [
{
"cell_type": "markdown",
+ "id": "9363398d",
+ "metadata": {},
"source": [
- "# PowerBI Dataset Agent\n",
+ "# PowerBI Dataset\n",
"\n",
- "This notebook showcases an agent designed to interact with a Power BI Dataset. The agent is designed to answer more general questions about a dataset, as well as recover from errors.\n",
+ "This notebook showcases an agent interacting with a `Power BI Dataset`. The agent can answer more general questions about a dataset, as well as recover from errors.\n",
"\n",
"Note that, as this agent is in active development, all answers might not be correct. It runs against the [executequery endpoint](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/execute-queries), which does not allow deletes.\n",
"\n",
- "### Some notes\n",
+ "### Notes:\n",
"- It relies on authentication with the azure.identity package, which can be installed with `pip install azure-identity`. Alternatively you can create the powerbi dataset with a token as a string without supplying the credentials.\n",
"- You can also supply a username to impersonate for use with datasets that have RLS enabled. \n",
"- The toolkit uses a LLM to create the query from the question, the agent uses the LLM for the overall execution.\n",
"- Testing was done mostly with a `text-davinci-003` model, codex models did not seem to perform ver well."
- ],
- "metadata": {},
- "attachments": {},
- "id": "9363398d"
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "## Initialization"
- ],
+ "id": "0725445e",
"metadata": {
"tags": []
},
- "id": "0725445e"
+ "source": [
+ "## Initialization"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "c82f33e9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
"source": [
"from langchain.agents.agent_toolkits import create_pbi_agent\n",
"from langchain.agents.agent_toolkits import PowerBIToolkit\n",
@@ -39,16 +43,16 @@
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import AgentExecutor\n",
"from azure.identity import DefaultAzureCredential"
- ],
- "outputs": [],
- "metadata": {
- "tags": []
- },
- "id": "c82f33e9"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "0b2c5853",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
"source": [
"fast_llm = ChatOpenAI(\n",
" temperature=0.5, max_tokens=1000, model_name=\"gpt-3.5-turbo\", verbose=True\n",
@@ -69,99 +73,95 @@
" toolkit=toolkit,\n",
" verbose=True,\n",
")"
- ],
- "outputs": [],
- "metadata": {
- "tags": []
- },
- "id": "0b2c5853"
+ ]
},
{
"cell_type": "markdown",
+ "id": "80c92be3",
+ "metadata": {},
"source": [
"## Example: describing a table"
- ],
- "metadata": {},
- "id": "80c92be3"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
- "source": [
- "agent_executor.run(\"Describe table1\")"
- ],
- "outputs": [],
+ "id": "90f236cb",
"metadata": {
"tags": []
},
- "id": "90f236cb"
+ "outputs": [],
+ "source": [
+ "agent_executor.run(\"Describe table1\")"
+ ]
},
{
"cell_type": "markdown",
+ "id": "b464930f",
+ "metadata": {},
"source": [
"## Example: simple query on a table\n",
"In this example, the agent actually figures out the correct query to get a row count of the table."
- ],
- "metadata": {},
- "attachments": {},
- "id": "b464930f"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "b668c907",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
"source": [
"agent_executor.run(\"How many records are in table1?\")"
- ],
- "outputs": [],
- "metadata": {
- "tags": []
- },
- "id": "b668c907"
+ ]
},
{
"cell_type": "markdown",
+ "id": "f2229a2f",
+ "metadata": {},
"source": [
"## Example: running queries"
- ],
- "metadata": {},
- "id": "f2229a2f"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "865a420f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
"source": [
"agent_executor.run(\"How many records are there by dimension1 in table2?\")"
- ],
- "outputs": [],
- "metadata": {
- "tags": []
- },
- "id": "865a420f"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
- "source": [
- "agent_executor.run(\"What unique values are there for dimensions2 in table2\")"
- ],
- "outputs": [],
+ "id": "120cd49a",
"metadata": {
"tags": []
},
- "id": "120cd49a"
+ "outputs": [],
+ "source": [
+ "agent_executor.run(\"What unique values are there for dimensions2 in table2\")"
+ ]
},
{
"cell_type": "markdown",
+ "id": "ac584fb2",
+ "metadata": {},
"source": [
"## Example: add your own few-shot prompts"
- ],
- "metadata": {},
- "attachments": {},
- "id": "ac584fb2"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "ffa66827",
+ "metadata": {},
+ "outputs": [],
"source": [
"# fictional example\n",
"few_shots = \"\"\"\n",
@@ -189,26 +189,27 @@
" toolkit=toolkit,\n",
" verbose=True,\n",
")"
- ],
- "outputs": [],
- "metadata": {},
- "id": "ffa66827"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "3be44685",
+ "metadata": {},
+ "outputs": [],
"source": [
"agent_executor.run(\"What was the maximum of value in revenue in dollars in 2022?\")"
- ],
- "outputs": [],
- "metadata": {},
- "id": "3be44685"
+ ]
}
],
"metadata": {
+ "interpreter": {
+ "hash": "397704579725e15f5c7cb49fe5f0341eb7531c82d19f2c29d197e8b64ab5776b"
+ },
"kernelspec": {
- "name": "python3",
- "display_name": "Python 3.9.16 64-bit"
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -220,12 +221,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.16"
- },
- "interpreter": {
- "hash": "397704579725e15f5c7cb49fe5f0341eb7531c82d19f2c29d197e8b64ab5776b"
+ "version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/docs/extras/integrations/toolkits/python.ipynb b/docs/extras/integrations/toolkits/python.ipynb
index 41faeff3f..3c1f6b50c 100644
--- a/docs/extras/integrations/toolkits/python.ipynb
+++ b/docs/extras/integrations/toolkits/python.ipynb
@@ -5,9 +5,9 @@
"id": "82a4c2cc-20ea-4b20-a565-63e905dee8ff",
"metadata": {},
"source": [
- "# Python Agent\n",
+ "# Python\n",
"\n",
- "This notebook showcases an agent designed to write and execute python code to answer a question."
+ "This notebook showcases an agent designed to write and execute `Python` code to answer a question."
]
},
{
@@ -32,7 +32,7 @@
"id": "ca30d64c",
"metadata": {},
"source": [
- "## Using ZERO_SHOT_REACT_DESCRIPTION\n",
+ "## Using `ZERO_SHOT_REACT_DESCRIPTION`\n",
"\n",
"This shows how to initialize the agent using the ZERO_SHOT_REACT_DESCRIPTION agent type."
]
@@ -149,9 +149,7 @@
"cell_type": "code",
"execution_count": 5,
"id": "4b9f60e7-eb6a-4f14-8604-498d863d4482",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -271,7 +269,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.3"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/spark.ipynb b/docs/extras/integrations/toolkits/spark.ipynb
index 7cab26251..d55075c2b 100644
--- a/docs/extras/integrations/toolkits/spark.ipynb
+++ b/docs/extras/integrations/toolkits/spark.ipynb
@@ -1,13 +1,12 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Spark Dataframe Agent\n",
+ "# Spark Dataframe\n",
"\n",
- "This notebook shows how to use agents to interact with a Spark dataframe and Spark Connect. It is mostly optimized for question answering.\n",
+ "This notebook shows how to use agents to interact with a `Spark DataFrame` and `Spark Connect`. It is mostly optimized for question answering.\n",
"\n",
"**NOTE: this agent calls the Python agent under the hood, which executes LLM generated Python code - this can be bad if the LLM generated Python code is harmful. Use cautiously.**"
]
@@ -23,6 +22,13 @@
"os.environ[\"OPENAI_API_KEY\"] = \"...input your openai api key here...\""
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `Spark DataFrame` example"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 2,
@@ -225,11 +231,10 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Spark Connect Example"
+ "## `Spark Connect` example"
]
},
{
@@ -405,9 +410,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/docs/extras/integrations/toolkits/spark_sql.ipynb b/docs/extras/integrations/toolkits/spark_sql.ipynb
index c29f6841c..7ed93552c 100644
--- a/docs/extras/integrations/toolkits/spark_sql.ipynb
+++ b/docs/extras/integrations/toolkits/spark_sql.ipynb
@@ -4,9 +4,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Spark SQL Agent\n",
+ "# Spark SQL\n",
"\n",
- "This notebook shows how to use agents to interact with a Spark SQL. Similar to [SQL Database Agent](https://python.langchain.com/docs/integrations/toolkits/sql_database), it is designed to address general inquiries about Spark SQL and facilitate error recovery.\n",
+ "This notebook shows how to use agents to interact with `Spark SQL`. Similar to [SQL Database Agent](https://python.langchain.com/docs/integrations/toolkits/sql_database), it is designed to address general inquiries about `Spark SQL` and facilitate error recovery.\n",
"\n",
"**NOTE: Note that, as this agent is in active development, all answers might not be correct. Additionally, it is not guaranteed that the agent won't perform DML statements on your Spark cluster given certain questions. Be careful running it on sensitive data!**"
]
@@ -163,7 +163,9 @@
},
{
"data": {
- "text/plain": "'The titanic table has the following columns: PassengerId (INT), Survived (INT), Pclass (INT), Name (STRING), Sex (STRING), Age (DOUBLE), SibSp (INT), Parch (INT), Ticket (STRING), Fare (DOUBLE), Cabin (STRING), and Embarked (STRING). Here are some sample rows from the table: \\n\\n1. PassengerId: 1, Survived: 0, Pclass: 3, Name: Braund, Mr. Owen Harris, Sex: male, Age: 22.0, SibSp: 1, Parch: 0, Ticket: A/5 21171, Fare: 7.25, Cabin: None, Embarked: S\\n2. PassengerId: 2, Survived: 1, Pclass: 1, Name: Cumings, Mrs. John Bradley (Florence Briggs Thayer), Sex: female, Age: 38.0, SibSp: 1, Parch: 0, Ticket: PC 17599, Fare: 71.2833, Cabin: C85, Embarked: C\\n3. PassengerId: 3, Survived: 1, Pclass: 3, Name: Heikkinen, Miss. Laina, Sex: female, Age: 26.0, SibSp: 0, Parch: 0, Ticket: STON/O2. 3101282, Fare: 7.925, Cabin: None, Embarked: S'"
+ "text/plain": [
+ "'The titanic table has the following columns: PassengerId (INT), Survived (INT), Pclass (INT), Name (STRING), Sex (STRING), Age (DOUBLE), SibSp (INT), Parch (INT), Ticket (STRING), Fare (DOUBLE), Cabin (STRING), and Embarked (STRING). Here are some sample rows from the table: \\n\\n1. PassengerId: 1, Survived: 0, Pclass: 3, Name: Braund, Mr. Owen Harris, Sex: male, Age: 22.0, SibSp: 1, Parch: 0, Ticket: A/5 21171, Fare: 7.25, Cabin: None, Embarked: S\\n2. PassengerId: 2, Survived: 1, Pclass: 1, Name: Cumings, Mrs. John Bradley (Florence Briggs Thayer), Sex: female, Age: 38.0, SibSp: 1, Parch: 0, Ticket: PC 17599, Fare: 71.2833, Cabin: C85, Embarked: C\\n3. PassengerId: 3, Survived: 1, Pclass: 3, Name: Heikkinen, Miss. Laina, Sex: female, Age: 26.0, SibSp: 0, Parch: 0, Ticket: STON/O2. 3101282, Fare: 7.925, Cabin: None, Embarked: S'"
+ ]
},
"execution_count": 4,
"metadata": {},
@@ -239,7 +241,9 @@
},
{
"data": {
- "text/plain": "'The square root of the average age is approximately 5.45.'"
+ "text/plain": [
+ "'The square root of the average age is approximately 5.45.'"
+ ]
},
"execution_count": 5,
"metadata": {},
@@ -253,6 +257,12 @@
{
"cell_type": "code",
"execution_count": 6,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -305,7 +315,9 @@
},
{
"data": {
- "text/plain": "'The oldest survived passenger is Barkworth, Mr. Algernon Henry Wilson, who was 80 years old.'"
+ "text/plain": [
+ "'The oldest survived passenger is Barkworth, Mr. Algernon Henry Wilson, who was 80 years old.'"
+ ]
},
"execution_count": 6,
"metadata": {},
@@ -314,10 +326,7 @@
],
"source": [
"agent_executor.run(\"What's the name of the oldest survived passenger?\")"
- ],
- "metadata": {
- "collapsed": false
- }
+ ]
}
],
"metadata": {
@@ -336,9 +345,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.2"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/docs/extras/integrations/toolkits/sql_database.ipynb b/docs/extras/integrations/toolkits/sql_database.ipynb
index 9fbc31da2..eae793da1 100644
--- a/docs/extras/integrations/toolkits/sql_database.ipynb
+++ b/docs/extras/integrations/toolkits/sql_database.ipynb
@@ -1,22 +1,21 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"id": "0e499e90-7a6d-4fab-8aab-31a4df417601",
"metadata": {},
"source": [
- "# SQL Database Agent\n",
+ "# SQL Database\n",
"\n",
- "This notebook showcases an agent designed to interact with a sql databases. The agent builds off of [SQLDatabaseChain](https://python.langchain.com/docs/use_cases/tabular/sqlite) and is designed to answer more general questions about a database, as well as recover from errors.\n",
+ "This notebook showcases an agent designed to interact with `SQL` databases. \n",
+ "The agent builds off of [SQLDatabaseChain](https://python.langchain.com/docs/use_cases/tabular/sqlite) and is designed to answer more general questions about a database, as well as recover from errors.\n",
"\n",
"Note that, as this agent is in active development, all answers might not be correct. Additionally, it is not guaranteed that the agent won't perform DML statements on your database given certain questions. Be careful running it on sensitive data!\n",
"\n",
- "This uses the example Chinook database. To set it up follow the instructions on https://database.guide/2-sample-databases-sqlite/, placing the .db file in a notebooks folder at the root of this repository."
+ "This uses the example `Chinook` database. To set it up follow the instructions on https://database.guide/2-sample-databases-sqlite/, placing the .db file in a notebooks folder at the root of this repository."
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "ec927ac6-9b2a-4e8a-9a6e-3e429191875c",
"metadata": {
@@ -56,12 +55,11 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "f74d1792",
"metadata": {},
"source": [
- "## Using ZERO_SHOT_REACT_DESCRIPTION\n",
+ "## Using `ZERO_SHOT_REACT_DESCRIPTION`\n",
"\n",
"This shows how to initialize the agent using the ZERO_SHOT_REACT_DESCRIPTION agent type."
]
@@ -84,7 +82,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "971cc455",
"metadata": {},
@@ -110,7 +107,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "54c01168",
"metadata": {},
@@ -136,7 +132,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "5a4a9455",
"metadata": {},
@@ -147,7 +142,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "36ae48c7-cb08-4fef-977e-c7d4b96a464b",
"metadata": {},
@@ -237,7 +231,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "9abcfe8e-1868-42a4-8345-ad2d9b44c681",
"metadata": {},
@@ -312,7 +305,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "6fbc26af-97e4-4a21-82aa-48bdc992da26",
"metadata": {},
@@ -495,7 +487,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "7c7503b5-d9d9-4faa-b064-29fcdb5ff213",
"metadata": {},
@@ -639,7 +630,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/vectorstore.ipynb b/docs/extras/integrations/toolkits/vectorstore.ipynb
index 69ac05bd5..db388fdb0 100644
--- a/docs/extras/integrations/toolkits/vectorstore.ipynb
+++ b/docs/extras/integrations/toolkits/vectorstore.ipynb
@@ -1,23 +1,21 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"id": "18ada398-dce6-4049-9b56-fc0ede63da9c",
"metadata": {},
"source": [
- "# Vectorstore Agent\n",
+ "# Vectorstore\n",
"\n",
"This notebook showcases an agent designed to retrieve information from one or more vectorstores, either with or without sources."
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "eecb683b-3a46-4b9d-81a3-7caefbfec1a1",
"metadata": {},
"source": [
- "## Create the Vectorstores"
+ "## Create Vectorstores"
]
},
{
@@ -95,7 +93,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "f4814175-964d-42f1-aa9d-22801ce1e912",
"metadata": {},
@@ -128,7 +125,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "8a38ad10",
"metadata": {},
@@ -217,7 +213,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "7ca07707",
"metadata": {},
@@ -263,7 +258,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "71680984-edaf-4a63-90f5-94edbd263550",
"metadata": {},
@@ -422,7 +416,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/docs/extras/integrations/toolkits/xorbits.ipynb b/docs/extras/integrations/toolkits/xorbits.ipynb
index dd3e6a108..c97ca83b6 100644
--- a/docs/extras/integrations/toolkits/xorbits.ipynb
+++ b/docs/extras/integrations/toolkits/xorbits.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Xorbits Agent"
+ "# Xorbits"
]
},
{
@@ -13,7 +13,7 @@
"source": [
"This notebook shows how to use agents to interact with [Xorbits Pandas](https://doc.xorbits.io/en/latest/reference/pandas/index.html) dataframe and [Xorbits Numpy](https://doc.xorbits.io/en/latest/reference/numpy/index.html) ndarray. It is mostly optimized for question answering.\n",
"\n",
- "**NOTE: this agent calls the Python agent under the hood, which executes LLM generated Python code - this can be bad if the LLM generated Python code is harmful. Use cautiously.**"
+ "**NOTE: this agent calls the `Python` agent under the hood, which executes LLM generated Python code - this can be bad if the LLM generated Python code is harmful. Use cautiously.**"
]
},
{
@@ -734,9 +734,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
From 9963b32e5965e908d2deef798ba4ab4e336474a5 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Thu, 24 Aug 2023 06:42:42 -0700
Subject: [PATCH 105/143] Harrison/multi vector (#9700)
---
.../retrievers/multi_vector.ipynb | 366 ++++++++++++++++++
.../parent_document_retriever.ipynb | 3 +-
.../langchain/retrievers/__init__.py | 2 +
.../langchain/retrievers/multi_vector.py | 39 ++
.../retrievers/parent_document_retriever.py | 36 +-
5 files changed, 411 insertions(+), 35 deletions(-)
create mode 100644 docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
create mode 100644 libs/langchain/langchain/retrievers/multi_vector.py
diff --git a/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb b/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
new file mode 100644
index 000000000..4a7587562
--- /dev/null
+++ b/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
@@ -0,0 +1,366 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "d9172545",
+ "metadata": {},
+ "source": [
+ "# MultiVector Retriever\n",
+ "\n",
+ "It can often be beneficial to store multiple vectors per document. There are multiple use cases where this is beneficial. LangChain has a base `MultiVectorRetriever` which makes querying this type of setup easy. A lot of the complexity lies in how to create the multiple vectors per document. This notebook covers some of the common ways to create those vectors and use the `MultiVectorRetriever`.\n",
+ "\n",
+ "The methods to create multiple vectors per document include:\n",
+ "\n",
+ "- smaller chunks: split a document into smaller chunks, and embed those (this is ParentDocumentRetriever)\n",
+ "- summary: create a summary for each document, embed that along with (or instead of) the document\n",
+ "- hypothetical questions: create hypothetical questions that each document would be appropriate to answer, embed those along with (or instead of) the document"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "eed469be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.retrievers.multi_vector import MultiVectorRetriever"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "18c1421a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.vectorstores import Chroma\n",
+ "from langchain.embeddings import OpenAIEmbeddings\n",
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+ "from langchain.storage import InMemoryStore\n",
+ "from langchain.document_loaders import TextLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "6d869496",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loaders = [\n",
+ " TextLoader('../../paul_graham_essay.txt'),\n",
+ " TextLoader('../../state_of_the_union.txt'),\n",
+ "]\n",
+ "docs = []\n",
+ "for l in loaders:\n",
+ " docs.extend(l.load())\n",
+ "text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)\n",
+ "docs = text_splitter.split_documents(docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa17beda",
+ "metadata": {},
+ "source": [
+ "## Smaller chunks\n",
+ "\n",
+ "Oftentimes it can be useful to retrieve larger chunks of information, but embed smaller chunks. This allows for embeddings to capture the semantic meaning as closely as possible, but for as much context as possible to be passed downstream. NOTE: this is what the ParentDocumentRetriever does. Here we show what is going on under the hood."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0e7b6b45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The vectorstore to use to index the child chunks\n",
+ "vectorstore = Chroma(\n",
+ " collection_name=\"full_documents\",\n",
+ " embedding_function=OpenAIEmbeddings()\n",
+ ")\n",
+ "# The storage layer for the parent documents\n",
+ "store = InMemoryStore()\n",
+ "id_key = \"doc_id\"\n",
+ "# The retriever (empty to start)\n",
+ "retriever = MultiVectorRetriever(\n",
+ " vectorstore=vectorstore, \n",
+ " docstore=store, \n",
+ " id_key=id_key,\n",
+ ")\n",
+ "import uuid\n",
+ "doc_ids = [str(uuid.uuid4()) for _ in docs]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "72a36491",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The splitter to use to create smaller chunks\n",
+ "child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5d23247d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sub_docs = []\n",
+ "for i, doc in enumerate(docs):\n",
+ " _id = doc_ids[i]\n",
+ " _sub_docs = child_text_splitter.split_documents([doc])\n",
+ " for _doc in _sub_docs:\n",
+ " _doc.metadata[id_key] = _id\n",
+ " sub_docs.extend(_sub_docs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "92ed5861",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "retriever.vectorstore.add_documents(sub_docs)\n",
+ "retriever.docstore.mset(list(zip(doc_ids, docs)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "8afed60c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': 'b4ca7817-e3fe-4103-ac81-574fb41439ef', 'source': '../../state_of_the_union.txt'})"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Vectorstore alone retrieves the small chunks\n",
+ "retriever.vectorstore.similarity_search(\"justice breyer\")[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "3c9017f1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9874"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Retriever returns larger chunks\n",
+ "len(retriever.get_relevant_documents(\"justice breyer\")[0].page_content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d6a7ae0d",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "Oftentimes a summary may be able to distill more accurately what a chunk is about, leading to better retrieval. Here we show how to create summaries, and then embed those."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "1433dff4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_models import ChatOpenAI\n",
+ "from langchain.prompts import ChatPromptTemplate\n",
+ "from langchain.schema.output_parser import StrOutputParser\n",
+ "import uuid\n",
+ "from langchain.schema.document import Document"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "35b30390",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chain = (\n",
+ " {\"doc\": lambda x: x.page_content}\n",
+ " | ChatPromptTemplate.from_template(\"Summarize the following document:\\n\\n{doc}\")\n",
+ " | ChatOpenAI(max_retries=0)\n",
+ " | StrOutputParser()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "41a2a738",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "summaries = [chain.invoke(d) for d in docs]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "7ac5e4b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The vectorstore to use to index the child chunks\n",
+ "vectorstore = Chroma(\n",
+ " collection_name=\"summaries\",\n",
+ " embedding_function=OpenAIEmbeddings()\n",
+ ")\n",
+ "# The storage layer for the parent documents\n",
+ "store = InMemoryStore()\n",
+ "id_key = \"doc_id\"\n",
+ "# The retriever (empty to start)\n",
+ "retriever = MultiVectorRetriever(\n",
+ " vectorstore=vectorstore, \n",
+ " docstore=store, \n",
+ " id_key=id_key,\n",
+ ")\n",
+ "doc_ids = [str(uuid.uuid4()) for _ in docs]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "0d93309f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "summary_docs = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(summaries)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "6d5edf0d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "retriever.vectorstore.add_documents(summary_docs)\n",
+ "retriever.docstore.mset(list(zip(doc_ids, docs)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "299232d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sub_docs = vectorstore.similarity_search(\"justice breyer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "10e404c0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Document(page_content='The document discusses various topics and proposals put forth by the President in a State of the Union address. These include the nomination of a judge for the Supreme Court, securing the border and fixing the immigration system, advancing liberty and justice for women and LGBTQ+ individuals, passing bipartisan legislation, addressing the opioid epidemic and mental health issues, supporting veterans, and ending cancer. The President expresses optimism about the future of the country and emphasizes the strength of the American people.', metadata={'doc_id': '8c7a707d-615d-42d5-919d-bc5178dd1ae4'})"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sub_docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "e4cce5c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "retrieved_docs = retriever.get_relevant_documents(\"justice breyer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "c8570dbb",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9194"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(retrieved_docs[0].page_content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "097a5396",
+ "metadata": {},
+ "source": [
+ "## Hypothetical Queries\n",
+ "\n",
+ "An LLM can also be used to generate a list of hypothetical questions that could be asked of a particular document. These questions can then be embedded."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb b/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb
index 4b166bc14..206ecbfba 100644
--- a/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb
+++ b/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb
@@ -83,7 +83,6 @@
"outputs": [],
"source": [
"# This text splitter is used to create the child documents\n",
- "\n",
"child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n",
"# The vectorstore to use to index the child chunks\n",
"vectorstore = Chroma(\n",
@@ -432,7 +431,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.5"
+ "version": "3.10.1"
}
},
"nbformat": 4,
diff --git a/libs/langchain/langchain/retrievers/__init__.py b/libs/langchain/langchain/retrievers/__init__.py
index 1e5497c93..c666d9103 100644
--- a/libs/langchain/langchain/retrievers/__init__.py
+++ b/libs/langchain/langchain/retrievers/__init__.py
@@ -40,6 +40,7 @@ from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.retrievers.metal import MetalRetriever
from langchain.retrievers.milvus import MilvusRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
from langchain.retrievers.pinecone_hybrid_search import PineconeHybridSearchRetriever
from langchain.retrievers.pubmed import PubMedRetriever
@@ -92,4 +93,5 @@ __all__ = [
"WebResearchRetriever",
"EnsembleRetriever",
"ParentDocumentRetriever",
+ "MultiVectorRetriever",
]
diff --git a/libs/langchain/langchain/retrievers/multi_vector.py b/libs/langchain/langchain/retrievers/multi_vector.py
new file mode 100644
index 000000000..92d537189
--- /dev/null
+++ b/libs/langchain/langchain/retrievers/multi_vector.py
@@ -0,0 +1,39 @@
+from typing import List
+
+from pydantic import Field
+
+from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain.schema import BaseRetriever, BaseStore, Document
+from langchain.vectorstores import VectorStore
+
+
+class MultiVectorRetriever(BaseRetriever):
+ """Retrieve from a set of multiple embeddings for the same document."""
+
+ vectorstore: VectorStore
+ """The underlying vectorstore to use to store small chunks
+ and their embedding vectors"""
+ docstore: BaseStore[str, Document]
+ """The storage layer for the parent documents"""
+ id_key: str = "doc_id"
+ search_kwargs: dict = Field(default_factory=dict)
+ """Keyword arguments to pass to the search function."""
+
+ def _get_relevant_documents(
+ self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+ ) -> List[Document]:
+ """Get documents relevant to a query.
+ Args:
+ query: String to find relevant documents for
+ run_manager: The callbacks handler to use
+ Returns:
+ List of relevant documents
+ """
+ sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
+ # We do this to maintain the order of the ids that are returned
+ ids = []
+ for d in sub_docs:
+ if d.metadata[self.id_key] not in ids:
+ ids.append(d.metadata[self.id_key])
+ docs = self.docstore.mget(ids)
+ return [d for d in docs if d is not None]
diff --git a/libs/langchain/langchain/retrievers/parent_document_retriever.py b/libs/langchain/langchain/retrievers/parent_document_retriever.py
index 6757ba3dd..dd5aa7206 100644
--- a/libs/langchain/langchain/retrievers/parent_document_retriever.py
+++ b/libs/langchain/langchain/retrievers/parent_document_retriever.py
@@ -1,16 +1,12 @@
import uuid
from typing import List, Optional
-from langchain.callbacks.manager import CallbackManagerForRetrieverRun
-from langchain.pydantic_v1 import Field
+from langchain.retrievers import MultiVectorRetriever
from langchain.schema.document import Document
-from langchain.schema.retriever import BaseRetriever
-from langchain.schema.storage import BaseStore
from langchain.text_splitter import TextSplitter
-from langchain.vectorstores.base import VectorStore
-class ParentDocumentRetriever(BaseRetriever):
+class ParentDocumentRetriever(MultiVectorRetriever):
"""Retrieve small chunks then retrieve their parent documents.
When splitting documents for retrieval, there are often conflicting desires:
@@ -59,40 +55,14 @@ class ParentDocumentRetriever(BaseRetriever):
)
"""
- vectorstore: VectorStore
- """The underlying vectorstore to use to store small chunks
- and their embedding vectors"""
- docstore: BaseStore[str, Document]
- """The storage layer for the parent documents"""
child_splitter: TextSplitter
"""The text splitter to use to create child documents."""
- id_key: str = "doc_id"
+
"""The key to use to track the parent id. This will be stored in the
metadata of child documents."""
parent_splitter: Optional[TextSplitter] = None
"""The text splitter to use to create parent documents.
If none, then the parent documents will be the raw documents passed in."""
- search_kwargs: dict = Field(default_factory=dict)
- """Keyword arguments to pass to the search function."""
-
- def _get_relevant_documents(
- self, query: str, *, run_manager: CallbackManagerForRetrieverRun
- ) -> List[Document]:
- """Get documents relevant to a query.
- Args:
- query: String to find relevant documents for
- run_manager: The callbacks handler to use
- Returns:
- List of relevant documents
- """
- sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
- # We do this to maintain the order of the ids that are returned
- ids = []
- for d in sub_docs:
- if d.metadata[self.id_key] not in ids:
- ids.append(d.metadata[self.id_key])
- docs = self.docstore.mget(ids)
- return [d for d in docs if d is not None]
def add_documents(
self,
From 20d2c0571ccffd305794a69e533d296a91cb2938 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Thu, 24 Aug 2023 16:05:10 +0200
Subject: [PATCH 106/143] Do not share executors between parent and child tasks
---
.../langchain/schema/runnable/base.py | 34 ++++++-------------
.../langchain/schema/runnable/config.py | 16 ++-------
.../runnable/__snapshots__/test_runnable.ambr | 3 +-
3 files changed, 15 insertions(+), 38 deletions(-)
diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py
index a130dc62b..bdbd7fc69 100644
--- a/libs/langchain/langchain/schema/runnable/base.py
+++ b/libs/langchain/langchain/schema/runnable/base.py
@@ -117,13 +117,7 @@ class Runnable(Generic[Input, Output], ABC):
return [self.invoke(inputs[0], configs[0], **kwargs)]
with get_executor_for_config(configs[0]) as executor:
- return list(
- executor.map(
- partial(self.invoke, **kwargs),
- inputs,
- (patch_config(c, executor=executor) for c in configs),
- )
- )
+ return list(executor.map(partial(self.invoke, **kwargs), inputs, configs))
async def abatch(
self,
@@ -852,18 +846,15 @@ class RunnableSequence(Serializable, Runnable[Input, Output]):
# invoke
try:
- with get_executor_for_config(configs[0]) as executor:
- for step in self.steps:
- inputs = step.batch(
- inputs,
- [
- # each step a child run of the corresponding root run
- patch_config(
- config, callbacks=rm.get_child(), executor=executor
- )
- for rm, config in zip(run_managers, configs)
- ],
- )
+ for step in self.steps:
+ inputs = step.batch(
+ inputs,
+ [
+ # each step a child run of the corresponding root run
+ patch_config(config, callbacks=rm.get_child())
+ for rm, config in zip(run_managers, configs)
+ ],
+ )
# finish the root runs
except (KeyboardInterrupt, Exception) as e:
for rm in run_managers:
@@ -1152,7 +1143,6 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
config,
deep_copy_locals=True,
callbacks=run_manager.get_child(),
- executor=executor,
),
)
for step in steps.values()
@@ -1219,9 +1209,7 @@ class RunnableMap(Serializable, Runnable[Input, Dict[str, Any]]):
name,
step.transform(
input_copies.pop(),
- patch_config(
- config, callbacks=run_manager.get_child(), executor=executor
- ),
+ patch_config(config, callbacks=run_manager.get_child()),
),
)
for name, step in steps.items()
diff --git a/libs/langchain/langchain/schema/runnable/config.py b/libs/langchain/langchain/schema/runnable/config.py
index a431fb635..b97d90441 100644
--- a/libs/langchain/langchain/schema/runnable/config.py
+++ b/libs/langchain/langchain/schema/runnable/config.py
@@ -42,12 +42,6 @@ class RunnableConfig(TypedDict, total=False):
ThreadPoolExecutor's default. This is ignored if an executor is provided.
"""
- executor: Executor
- """
- Externally-managed executor to use for parallel calls. If not provided, a new
- ThreadPoolExecutor will be created.
- """
-
recursion_limit: int
"""
Maximum number of times a call can recurse. If not provided, defaults to 10.
@@ -72,7 +66,6 @@ def patch_config(
*,
deep_copy_locals: bool = False,
callbacks: Optional[BaseCallbackManager] = None,
- executor: Optional[Executor] = None,
recursion_limit: Optional[int] = None,
) -> RunnableConfig:
config = ensure_config(config)
@@ -80,8 +73,6 @@ def patch_config(
config["_locals"] = deepcopy(config["_locals"])
if callbacks is not None:
config["callbacks"] = callbacks
- if executor is not None:
- config["executor"] = executor
if recursion_limit is not None:
config["recursion_limit"] = recursion_limit
return config
@@ -111,8 +102,5 @@ def get_async_callback_manager_for_config(
@contextmanager
def get_executor_for_config(config: RunnableConfig) -> Generator[Executor, None, None]:
- if config.get("executor"):
- yield config["executor"]
- else:
- with ThreadPoolExecutor(max_workers=config.get("max_concurrency")) as executor:
- yield executor
+ with ThreadPoolExecutor(max_workers=config.get("max_concurrency")) as executor:
+ yield executor
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
index c48d4edbd..fcb621fe8 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
+++ b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
@@ -2081,7 +2081,8 @@
"stop": [
"Thought:"
]
- }
+ },
+ "config": {}
}
},
"llm": {
From 78ffcdd9a9a85782e1677e324bc28a477d30e245 Mon Sep 17 00:00:00 2001
From: Nuno Campos
Date: Thu, 24 Aug 2023 16:09:38 +0200
Subject: [PATCH 107/143] Lint
---
.../schema/runnable/__snapshots__/test_runnable.ambr | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
index fcb621fe8..c48d4edbd 100644
--- a/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
+++ b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr
@@ -2081,8 +2081,7 @@
"stop": [
"Thought:"
]
- },
- "config": {}
+ }
}
},
"llm": {
From 7cf5c582d27d35d60efb2aefe964cce21ccf11b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BA=86=E7=A9=BA?= <568250549@qq.com>
Date: Thu, 24 Aug 2023 22:23:48 +0800
Subject: [PATCH 108/143] Added a link to the dependencies document (#9703)
---
docs/extras/ecosystem/dependents.mdx | 1 +
1 file changed, 1 insertion(+)
diff --git a/docs/extras/ecosystem/dependents.mdx b/docs/extras/ecosystem/dependents.mdx
index b21895910..2de471678 100644
--- a/docs/extras/ecosystem/dependents.mdx
+++ b/docs/extras/ecosystem/dependents.mdx
@@ -51,6 +51,7 @@ Dependents stats for `langchain-ai/langchain`
|[e2b-dev/e2b](https://github.com/e2b-dev/e2b) | 5365 |
|[mage-ai/mage-ai](https://github.com/mage-ai/mage-ai) | 5352 |
|[wenda-LLM/wenda](https://github.com/wenda-LLM/wenda) | 5192 |
+|[LangChain-Chinese-Getting-Started-Guide](https://github.com/liaokongVFX/LangChain-Chinese-Getting-Started-Guide) | 5129 |
|[zilliztech/GPTCache](https://github.com/zilliztech/GPTCache) | 4993 |
|[GreyDGL/PentestGPT](https://github.com/GreyDGL/PentestGPT) | 4831 |
|[zauberzeug/nicegui](https://github.com/zauberzeug/nicegui) | 4824 |
From 6bedfdf25aed14c87e18daebcc4dee4ccb7f777c Mon Sep 17 00:00:00 2001
From: Patrick Loeber <50772274+patrickloeber@users.noreply.github.com>
Date: Thu, 24 Aug 2023 16:24:53 +0200
Subject: [PATCH 109/143] Fix docs for AssemblyAIAudioTranscriptLoader (shorter
import path) (#9687)
Uses the shorter import path
`from langchain.document_loaders import` instead of the full path
`from langchain.document_loaders.assemblyai`
Applies those changes to the docs and the unit test.
See #9667 that adds this new loader.
---
docs/extras/integrations/document_loaders/assemblyai.ipynb | 7 ++-----
.../tests/unit_tests/document_loaders/test_assemblyai.py | 6 ++----
2 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/docs/extras/integrations/document_loaders/assemblyai.ipynb b/docs/extras/integrations/document_loaders/assemblyai.ipynb
index 33fdef929..80d60df48 100644
--- a/docs/extras/integrations/document_loaders/assemblyai.ipynb
+++ b/docs/extras/integrations/document_loaders/assemblyai.ipynb
@@ -53,7 +53,7 @@
"metadata": {},
"outputs": [],
"source": [
- "from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader\n",
+ "from langchain.document_loaders import AssemblyAIAudioTranscriptLoader\n",
"\n",
"audio_file = \"https://storage.googleapis.com/aai-docs-samples/nbc.mp3\"\n",
"# or a local file path: audio_file = \"./nbc.mp3\"\n",
@@ -148,10 +148,7 @@
"metadata": {},
"outputs": [],
"source": [
- "from langchain.document_loaders.assemblyai import (\n",
- " AssemblyAIAudioTranscriptLoader,\n",
- " TranscriptFormat,\n",
- ")\n",
+ "from langchain.document_loaders.assemblyai import TranscriptFormat\n",
"\n",
"loader = AssemblyAIAudioTranscriptLoader(\n",
" file_path=\"./your_file.mp3\",\n",
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py b/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py
index a9b6112e7..550a2c0d1 100644
--- a/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py
@@ -1,10 +1,8 @@
import pytest
from pytest_mock import MockerFixture
-from langchain.document_loaders.assemblyai import (
- AssemblyAIAudioTranscriptLoader,
- TranscriptFormat,
-)
+from langchain.document_loaders import AssemblyAIAudioTranscriptLoader
+from langchain.document_loaders.assemblyai import TranscriptFormat
@pytest.mark.requires("assemblyai")
From f5ea72579674f0b1e5cbfecacf67dc7d78b87c81 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 07:46:15 -0700
Subject: [PATCH 110/143] bump 272 (#9704)
---
libs/langchain/pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index 60d8afb5a..a35570213 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain"
-version = "0.0.271"
+version = "0.0.272"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
From cf792891f1fc4643dc47001e1002bb6c68e9b7a0 Mon Sep 17 00:00:00 2001
From: Leonid Ganeline
Date: Thu, 24 Aug 2023 09:01:52 -0700
Subject: [PATCH 111/143] =?UTF-8?q?=F0=9F=93=96=20docs:=20compact=20api=20?=
=?UTF-8?q?reference=20(#8651)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Updated design of the "API Reference" text
Here is an example of the current format:

It is changed to the
`langchain.retrievers.ElasticSearchBM25Retriever` format, the same
format that is used in the API Reference TOC.
It also resembles code:
`from langchain.retrievers import ElasticSearchBM25Retriever` (namespace
THEN class_name)
Current format is
`ElasticSearchBM25Retriever from langchain.retrievers` (class_name THEN
namespace)
This change is in line with other formats and improves readability.
@baskaryan
---
docs/docs_skeleton/src/theme/CodeBlock/index.js | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/docs/docs_skeleton/src/theme/CodeBlock/index.js b/docs/docs_skeleton/src/theme/CodeBlock/index.js
index e97503cd8..84da0b86e 100644
--- a/docs/docs_skeleton/src/theme/CodeBlock/index.js
+++ b/docs/docs_skeleton/src/theme/CodeBlock/index.js
@@ -24,8 +24,7 @@ function Imports({ imports }) {
))}
From c37be7f5fb13b84bcb05091649a7e21cb8f16977 Mon Sep 17 00:00:00 2001
From: Lance Martin <122662504+rlancemartin@users.noreply.github.com>
Date: Thu, 24 Aug 2023 11:03:35 -0700
Subject: [PATCH 112/143] Add Code LLaMA to code QA use case (#9713)
Use [Ollama integration](https://ollama.ai/blog/run-code-llama-locally).
---
.../extras/use_cases/code_understanding.ipynb | 94 ++++++++++++++++---
1 file changed, 81 insertions(+), 13 deletions(-)
diff --git a/docs/extras/use_cases/code_understanding.ipynb b/docs/extras/use_cases/code_understanding.ipynb
index a649d7409..ffd14b502 100644
--- a/docs/extras/use_cases/code_understanding.ipynb
+++ b/docs/extras/use_cases/code_understanding.ipynb
@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@@ -100,7 +100,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 39,
"metadata": {},
"outputs": [
{
@@ -109,7 +109,7 @@
"1293"
]
},
- "execution_count": 14,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
@@ -139,7 +139,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 40,
"metadata": {},
"outputs": [
{
@@ -148,7 +148,7 @@
"3748"
]
},
- "execution_count": 17,
+ "execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@@ -187,7 +187,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
@@ -195,7 +195,7 @@
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"db = Chroma.from_documents(texts, OpenAIEmbeddings(disallowed_special=()))\n",
"retriever = db.as_retriever(\n",
- " search_type=\"mmr\", # Also test \"similarity\"\n",
+ " search_type=\"mmr\", # Also test \"similarity\"\n",
" search_kwargs={\"k\": 8},\n",
")"
]
@@ -217,7 +217,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@@ -231,22 +231,22 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'To load a source code as documents for a QA over code, you can use the `CodeLoader` class. This class allows you to load source code files and split them into classes and functions.\\n\\nHere is an example of how to use the `CodeLoader` class:\\n\\n```python\\nfrom langchain.document_loaders.code import CodeLoader\\n\\n# Specify the path to the source code file\\ncode_file_path = \"path/to/code/file.py\"\\n\\n# Create an instance of the CodeLoader class\\ncode_loader = CodeLoader(code_file_path)\\n\\n# Load the code as documents\\ndocuments = code_loader.load()\\n\\n# Iterate over the documents\\nfor document in documents:\\n # Access the class or function name\\n name = document.metadata[\"name\"]\\n \\n # Access the code content\\n code = document.page_content\\n \\n # Process the code as needed\\n # ...\\n```\\n\\nIn the example above, `code_file_path` should be replaced with the actual path to your source code file. The `load()` method of the `CodeLoader` class will return a list of `Document` objects, where each document represents a class or function in the source code. You can access the class or function name using the `metadata[\"name\"]` attribute, and the code content using the `page_content` attribute of each `Document` object.\\n\\nYou can then process the code as needed for your QA task.'"
+ "'To initialize a ReAct agent, you need to follow these steps:\\n\\n1. Initialize a language model `llm` of type `BaseLanguageModel`.\\n\\n2. Initialize a document store `docstore` of type `Docstore`.\\n\\n3. Create a `DocstoreExplorer` with the initialized `docstore`. The `DocstoreExplorer` is used to search for and look up terms in the document store.\\n\\n4. Create an array of `Tool` objects. The `Tool` objects represent the actions that the agent can perform. In the case of `ReActDocstoreAgent`, the tools must be \"Search\" and \"Lookup\" with their corresponding functions from the `DocstoreExplorer`.\\n\\n5. Initialize the `ReActDocstoreAgent` using the `from_llm_and_tools` method with the `llm` (language model) and `tools` as parameters.\\n\\n6. Initialize the `ReActChain` (which is the `AgentExecutor`) using the `ReActDocstoreAgent` and `tools` as parameters.\\n\\nHere is an example of how to do this:\\n\\n```python\\nfrom langchain import ReActChain, OpenAI\\nfrom langchain.docstore.base import Docstore\\nfrom langchain.docstore.document import Document\\nfrom langchain.tools.base import BaseTool\\n\\n# Initialize the LLM and a docstore\\nllm = OpenAI()\\ndocstore = Docstore()\\n\\ndocstore_explorer = DocstoreExplorer(docstore)\\ntools = [\\n Tool(\\n name=\"Search\",\\n func=docstore_explorer.search,\\n description=\"Search for a term in the docstore.\",\\n ),\\n Tool(\\n name=\"Lookup\",\\n func=docstore_explorer.lookup,\\n description=\"Lookup a term in the docstore.\",\\n ),\\n]\\nagent = ReActDocstoreAgent.from_llm_and_tools(llm, tools)\\nreact = ReActChain(agent=agent, tools=tools)\\n```\\n\\nKeep in mind that this is a simplified example and you might need to adapt it to your specific needs.'"
]
},
- "execution_count": 30,
+ "execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "question = \"How can I load a source code as documents, for a QA over code, spliting the code in classes and functions?\"\n",
+ "question = \"How can I initialize a ReAct agent?\"\n",
"result = qa(question)\n",
"result['answer']"
]
@@ -328,6 +328,74 @@
"\n",
""
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Private chat\n",
+ "\n",
+ "We can use [Code LLaMA](https://about.fb.com/news/2023/08/code-llama-ai-for-coding/) via the Ollama integration.\n",
+ "\n",
+ "`ollama pull codellama:7b-instruct`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.llms import Ollama\n",
+ "from langchain.callbacks.manager import CallbackManager\n",
+ "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler \n",
+ "llm = Ollama(model=\"codellama:7b-instruct\", \n",
+ " callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))\n",
+ "memory = ConversationSummaryMemory(llm=llm,memory_key=\"chat_history\",return_messages=True)\n",
+ "qa_llama=ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " \"How can I initialize a ReAct agent?\" To initialize a ReAct agent, you can use the `ReActAgent.from_llm_and_tools()` class method. This method takes two arguments: the LLM and a list of tools.\n",
+ "Here is an example of how to initialize a ReAct agent with the OpenAI language model and the \"Search\" tool:\n",
+ "from langchain.agents.mrkl.base import ZeroShotAgent\n",
+ "\n",
+ "agent = ReActDocstoreAgent.from_llm_and_tools(OpenAIFunctionsAgent(), [Tool(\"Search\")]])\n",
+ "\n",
+ " The human asks what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good because it will help humans reach their full potential."
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "' To initialize a ReAct agent, you can use the `ReActAgent.from_llm_and_tools()` class method. This method takes two arguments: the LLM and a list of tools.\\nHere is an example of how to initialize a ReAct agent with the OpenAI language model and the \"Search\" tool:\\nfrom langchain.agents.mrkl.base import ZeroShotAgent\\n\\nagent = ReActDocstoreAgent.from_llm_and_tools(OpenAIFunctionsAgent(), [Tool(\"Search\")]])\\n\\n'"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "question = \"How can I initialize a ReAct agent?\"\n",
+ "result = qa_llama(question)\n",
+ "result['answer']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can view the [LangSmith trace](https://smith.langchain.com/public/fd24c734-e365-4a09-b883-cdbc7dcfa582/r) to sanity check the result relative to what was retrieved."
+ ]
}
],
"metadata": {
From dacf96895a7504db1bcacd2c1f4c823f529c52bc Mon Sep 17 00:00:00 2001
From: Tomaz Bratanic
Date: Thu, 24 Aug 2023 20:50:38 +0200
Subject: [PATCH 113/143] Add the option to use separate LLMs for GraphCypherQA
chain (#9689)
The graph chains differ from the RetrievalQA chains in that they use two
LLMChains instead of one. Therefore, you sometimes want to use one LLM
to generate the database query and a different LLM to generate
the final answer.
This feature would make it more convenient to use different LLMs in the
same chain.
I have also renamed the Graph DB QA Chain to Neo4j DB QA Chain, in the
documentation only, as it is used only for Neo4j. The naming was
ambiguous because it was the first graph QA chain added, and I wasn't
sure how you want to spin it.
---
docs/api_reference/guide_imports.json | 6 +-
.../more/graph/graph_cypher_qa.ipynb | 167 ++++++++++++++++--
.../langchain/chains/graph_qa/cypher.py | 19 +-
3 files changed, 174 insertions(+), 18 deletions(-)
diff --git a/docs/api_reference/guide_imports.json b/docs/api_reference/guide_imports.json
index 832c03f99..176c686e5 100644
--- a/docs/api_reference/guide_imports.json
+++ b/docs/api_reference/guide_imports.json
@@ -341,7 +341,7 @@
"HugeGraph QA Chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_hugegraph_qa",
"GraphSparqlQAChain": "https://python.langchain.com/docs/use_cases/more/graph/graph_sparql_qa",
"ArangoDB QA chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_arangodb_qa",
- "Graph DB QA chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_cypher_qa",
+ "Neo4j DB QA chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_cypher_qa",
"How to use a SmartLLMChain": "https://python.langchain.com/docs/use_cases/more/self_check/smart_llm",
"Multi-Agent Simulated Environment: Petting Zoo": "https://python.langchain.com/docs/use_cases/agent_simulations/petting_zoo",
"Multi-agent decentralized speaker selection": "https://python.langchain.com/docs/use_cases/agent_simulations/multiagent_bidding",
@@ -3202,10 +3202,10 @@
"Graph QA": "https://python.langchain.com/docs/use_cases/more/graph/graph_qa"
},
"GraphCypherQAChain": {
- "Graph DB QA chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_cypher_qa"
+ "Neo4j DB QA chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_cypher_qa"
},
"Neo4jGraph": {
- "Graph DB QA chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_cypher_qa"
+ "Neo4j DB QA chain": "https://python.langchain.com/docs/use_cases/more/graph/graph_cypher_qa"
},
"LLMBashChain": {
"Bash chain": "https://python.langchain.com/docs/use_cases/more/code_writing/llm_bash"
diff --git a/docs/extras/use_cases/more/graph/graph_cypher_qa.ipynb b/docs/extras/use_cases/more/graph/graph_cypher_qa.ipynb
index f6f9ca818..84adde72b 100644
--- a/docs/extras/use_cases/more/graph/graph_cypher_qa.ipynb
+++ b/docs/extras/use_cases/more/graph/graph_cypher_qa.ipynb
@@ -5,7 +5,7 @@
"id": "c94240f5",
"metadata": {},
"source": [
- "# Graph DB QA chain\n",
+ "# Neo4j DB QA chain\n",
"\n",
"This notebook shows how to use LLMs to provide a natural language interface to a graph database you can query with the Cypher query language."
]
@@ -177,7 +177,7 @@
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
- "\u001b[32;1m\u001b[1;3m[{'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
+ "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
@@ -185,7 +185,7 @@
{
"data": {
"text/plain": [
- "'Val Kilmer, Anthony Edwards, Meg Ryan, and Tom Cruise played in Top Gun.'"
+ "'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'"
]
},
"execution_count": 7,
@@ -236,7 +236,7 @@
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
- "\u001b[32;1m\u001b[1;3m[{'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}]\u001b[0m\n",
+ "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
@@ -244,7 +244,7 @@
{
"data": {
"text/plain": [
- "'Val Kilmer and Anthony Edwards played in Top Gun.'"
+ "'Tom Cruise and Val Kilmer played in Top Gun.'"
]
},
"execution_count": 9,
@@ -294,11 +294,11 @@
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
- "\u001b[32;1m\u001b[1;3m[{'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
+ "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n",
- "Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\\nRETURN a.name\"}, {'context': [{'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Tom Cruise'}]}]\n",
- "Final answer: Val Kilmer, Anthony Edwards, Meg Ryan, and Tom Cruise played in Top Gun.\n"
+ "Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\\nRETURN a.name\"}, {'context': [{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]}]\n",
+ "Final answer: Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.\n"
]
}
],
@@ -352,10 +352,10 @@
{
"data": {
"text/plain": [
- "[{'a.name': 'Val Kilmer'},\n",
+ "[{'a.name': 'Tom Cruise'},\n",
+ " {'a.name': 'Val Kilmer'},\n",
" {'a.name': 'Anthony Edwards'},\n",
- " {'a.name': 'Meg Ryan'},\n",
- " {'a.name': 'Tom Cruise'}]"
+ " {'a.name': 'Meg Ryan'}]"
]
},
"execution_count": 13,
@@ -367,10 +367,153 @@
"chain.run(\"Who played in Top Gun?\")"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "f01dfb72-24ec-4ae7-883a-ee6646889b59",
+ "metadata": {},
+ "source": [
+ "## Add examples in the Cypher generation prompt\n",
+ "You can define the Cypher statement you want the LLM to generate for particular questions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "59baeb88-adfa-4c26-8334-fcbff3a98efb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.prompts.prompt import PromptTemplate\n",
+ "\n",
+ "\n",
+ "CYPHER_GENERATION_TEMPLATE = \"\"\"Task:Generate Cypher statement to query a graph database.\n",
+ "Instructions:\n",
+ "Use only the provided relationship types and properties in the schema.\n",
+ "Do not use any other relationship types or properties that are not provided.\n",
+ "Schema:\n",
+ "{schema}\n",
+ "Note: Do not include any explanations or apologies in your responses.\n",
+ "Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.\n",
+ "Do not include any text except the generated Cypher statement.\n",
+ "Examples: Here are a few examples of generated Cypher statements for particular questions:\n",
+ "# How many people played in Top Gun?\n",
+ "MATCH (m:Movie {{title:\"Top Gun\"}})<-[:ACTED_IN]-()\n",
+ "RETURN count(*) AS numberOfActors\n",
+ "\n",
+ "The question is:\n",
+ "{question}\"\"\"\n",
+ "\n",
+ "CYPHER_GENERATION_PROMPT = PromptTemplate(\n",
+ " input_variables=[\"schema\", \"question\"], template=CYPHER_GENERATION_TEMPLATE\n",
+ ")\n",
+ "\n",
+ "chain = GraphCypherQAChain.from_llm(\n",
+ " ChatOpenAI(temperature=0), graph=graph, verbose=True, cypher_prompt=CYPHER_GENERATION_PROMPT\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "47c64027-cf42-493a-9c76-2d10ba753728",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
+ "Generated Cypher:\n",
+ "\u001b[32;1m\u001b[1;3mMATCH (m:Movie {name:\"Top Gun\"})<-[:ACTED_IN]-(:Actor)\n",
+ "RETURN count(*) AS numberOfActors\u001b[0m\n",
+ "Full Context:\n",
+ "\u001b[32;1m\u001b[1;3m[{'numberOfActors': 4}]\u001b[0m\n",
+ "\n",
+ "\u001b[1m> Finished chain.\u001b[0m\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'Four people played in Top Gun.'"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chain.run(\"How many people played in Top Gun?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e721cad-aa87-4526-9231-2dfc0e365939",
+ "metadata": {},
+ "source": [
+ "## Use separate LLMs for Cypher and answer generation\n",
+ "You can use the `cypher_llm` and `qa_llm` parameters to define different llms"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "6f9becc2-f579-45bf-9b50-2ce02bde92da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chain = GraphCypherQAChain.from_llm(\n",
+ " graph=graph,\n",
+ " cypher_llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo\"),\n",
+ " qa_llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-16k\"),\n",
+ " verbose=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "ff18e3e3-3402-4683-aec4-a19898f23ca1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
+ "Generated Cypher:\n",
+ "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
+ "RETURN a.name\u001b[0m\n",
+ "Full Context:\n",
+ "\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
+ "\n",
+ "\u001b[1m> Finished chain.\u001b[0m\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chain.run(\"Who played in Top Gun?\")"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
- "id": "74d0a36f",
+ "id": "48ff7cf8-18a3-43d7-8cb1-c1b91744608d",
"metadata": {},
"outputs": [],
"source": []
diff --git a/libs/langchain/langchain/chains/graph_qa/cypher.py b/libs/langchain/langchain/chains/graph_qa/cypher.py
index 015ff5f47..82c85ef1c 100644
--- a/libs/langchain/langchain/chains/graph_qa/cypher.py
+++ b/libs/langchain/langchain/chains/graph_qa/cypher.py
@@ -73,15 +73,28 @@ class GraphCypherQAChain(Chain):
@classmethod
def from_llm(
cls,
- llm: BaseLanguageModel,
+ llm: Optional[BaseLanguageModel] = None,
*,
qa_prompt: BasePromptTemplate = CYPHER_QA_PROMPT,
cypher_prompt: BasePromptTemplate = CYPHER_GENERATION_PROMPT,
+ cypher_llm: Optional[BaseLanguageModel] = None,
+ qa_llm: Optional[BaseLanguageModel] = None,
**kwargs: Any,
) -> GraphCypherQAChain:
"""Initialize from LLM."""
- qa_chain = LLMChain(llm=llm, prompt=qa_prompt)
- cypher_generation_chain = LLMChain(llm=llm, prompt=cypher_prompt)
+
+ if not cypher_llm and not llm:
+ raise ValueError("Either `llm` or `cypher_llm` parameters must be provided")
+ if not qa_llm and not llm:
+ raise ValueError("Either `llm` or `qa_llm` parameters must be provided")
+ if cypher_llm and qa_llm and llm:
+ raise ValueError(
+ "You can specify up to two of 'cypher_llm', 'qa_llm'"
+ ", and 'llm', but not all three simultaneously."
+ )
+
+ qa_chain = LLMChain(llm=qa_llm or llm, prompt=qa_prompt)
+ cypher_generation_chain = LLMChain(llm=cypher_llm or llm, prompt=cypher_prompt)
return cls(
qa_chain=qa_chain,
From 22b6549a34b6b0b4b55afa2df6510e49428d0f55 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 13:53:50 -0700
Subject: [PATCH 114/143] sort api classes (#9710)
---
docs/api_reference/create_api_rst.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/api_reference/create_api_rst.py b/docs/api_reference/create_api_rst.py
index 23fa327af..ee32f0d4c 100644
--- a/docs/api_reference/create_api_rst.py
+++ b/docs/api_reference/create_api_rst.py
@@ -228,7 +228,7 @@ Classes
:toctree: {module}
"""
- for class_ in classes:
+ for class_ in sorted(classes, key=lambda c: c["qualified_name"]):
if not class_["is_public"]:
continue
From 2bcf581a2302693e5df9d185ab64e62bf0bd9bc5 Mon Sep 17 00:00:00 2001
From: Andrew White
Date: Thu, 24 Aug 2023 17:11:30 -0400
Subject: [PATCH 115/143] Added search parameters to qdrant
max_marginal_relevance_search (#7745)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Adds the Qdrant search filter/params to the
`max_marginal_relevance_search` method, as already supported by the other
search methods. I did not add `offset` for pagination, because its behavior
would be ambiguous in this setting (since we fetch extra and down-select).
---------
Co-authored-by: Bagatur
Co-authored-by: Kacper Łukawski
---
.../langchain/vectorstores/qdrant.py | 306 +++++++++++++++---
.../qdrant/test_max_marginal_relevance.py | 19 ++
2 files changed, 277 insertions(+), 48 deletions(-)
diff --git a/libs/langchain/langchain/vectorstores/qdrant.py b/libs/langchain/langchain/vectorstores/qdrant.py
index 7b9d9869c..cdc5bea8e 100644
--- a/libs/langchain/langchain/vectorstores/qdrant.py
+++ b/libs/langchain/langchain/vectorstores/qdrant.py
@@ -265,6 +265,8 @@ class Qdrant(VectorStore):
- 'quorum' - query the majority of replicas, return values present in
all of them
- 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to QdrantClient.search()
Returns:
List of Documents most similar to the query.
@@ -339,6 +341,8 @@ class Qdrant(VectorStore):
- 'quorum' - query the majority of replicas, return values present in
all of them
- 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to QdrantClient.search()
Returns:
List of documents most similar to the query text and distance for each.
@@ -394,6 +398,9 @@ class Qdrant(VectorStore):
- 'quorum' - query the majority of replicas, return values present in
all of them
- 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to
+ QdrantClient.async_grpc_points.Search().
Returns:
List of documents most similar to the query text and distance for each.
@@ -448,6 +455,8 @@ class Qdrant(VectorStore):
- 'quorum' - query the majority of replicas, return values present in
all of them
- 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to QdrantClient.search()
Returns:
List of Documents most similar to the query.
@@ -504,6 +513,9 @@ class Qdrant(VectorStore):
- 'quorum' - query the majority of replicas, return values present in
all of them
- 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to
+ QdrantClient.async_grpc_points.Search().
Returns:
List of Documents most similar to the query.
@@ -559,6 +571,8 @@ class Qdrant(VectorStore):
- 'quorum' - query the majority of replicas, return values present in
all of them
- 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to QdrantClient.search()
Returns:
List of documents most similar to the query text and distance for each.
@@ -601,6 +615,56 @@ class Qdrant(VectorStore):
for result in results
]
+ async def _asearch_with_score_by_vector(
+ self,
+ embedding: List[float],
+ *,
+ k: int = 4,
+ filter: Optional[MetadataFilter] = None,
+ search_params: Optional[common_types.SearchParams] = None,
+ offset: int = 0,
+ score_threshold: Optional[float] = None,
+ consistency: Optional[common_types.ReadConsistency] = None,
+ with_vectors: bool = False,
+ **kwargs: Any,
+ ) -> Any:
+ """Return results most similar to embedding vector."""
+ from qdrant_client import grpc # noqa
+ from qdrant_client.conversions.conversion import RestToGrpc
+ from qdrant_client.http import models as rest
+
+ if filter is not None and isinstance(filter, dict):
+ warnings.warn(
+ "Using dict as a `filter` is deprecated. Please use qdrant-client "
+ "filters directly: "
+ "https://qdrant.tech/documentation/concepts/filtering/",
+ DeprecationWarning,
+ )
+ qdrant_filter = self._qdrant_filter_from_dict(filter)
+ else:
+ qdrant_filter = filter
+
+ if qdrant_filter is not None and isinstance(qdrant_filter, rest.Filter):
+ qdrant_filter = RestToGrpc.convert_filter(qdrant_filter)
+
+ response = await self.client.async_grpc_points.Search(
+ grpc.SearchPoints(
+ collection_name=self.collection_name,
+ vector_name=self.vector_name,
+ vector=embedding,
+ filter=qdrant_filter,
+ params=search_params,
+ limit=k,
+ offset=offset,
+ with_payload=grpc.WithPayloadSelector(enable=True),
+ with_vectors=grpc.WithVectorsSelector(enable=with_vectors),
+ score_threshold=score_threshold,
+ read_consistency=consistency,
+ **kwargs,
+ )
+ )
+ return response
+
@sync_call_fallback
async def asimilarity_search_with_score_by_vector(
self,
@@ -641,43 +705,22 @@ class Qdrant(VectorStore):
- 'quorum' - query the majority of replicas, return values present in
all of them
- 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to
+ QdrantClient.async_grpc_points.Search().
Returns:
List of documents most similar to the query text and distance for each.
"""
- from qdrant_client import grpc # noqa
- from qdrant_client.conversions.conversion import RestToGrpc
- from qdrant_client.http import models as rest
-
- if filter is not None and isinstance(filter, dict):
- warnings.warn(
- "Using dict as a `filter` is deprecated. Please use qdrant-client "
- "filters directly: "
- "https://qdrant.tech/documentation/concepts/filtering/",
- DeprecationWarning,
- )
- qdrant_filter = self._qdrant_filter_from_dict(filter)
- else:
- qdrant_filter = filter
-
- if qdrant_filter is not None and isinstance(qdrant_filter, rest.Filter):
- qdrant_filter = RestToGrpc.convert_filter(qdrant_filter)
-
- response = await self.client.async_grpc_points.Search(
- grpc.SearchPoints(
- collection_name=self.collection_name,
- vector_name=self.vector_name,
- vector=embedding,
- filter=qdrant_filter,
- params=search_params,
- limit=k,
- offset=offset,
- with_payload=grpc.WithPayloadSelector(enable=True),
- with_vectors=grpc.WithVectorsSelector(enable=False),
- score_threshold=score_threshold,
- read_consistency=consistency,
- **kwargs,
- )
+ response = await self._asearch_with_score_by_vector(
+ embedding,
+ k=k,
+ filter=filter,
+ search_params=search_params,
+ offset=offset,
+ score_threshold=score_threshold,
+ consistency=consistency,
+ **kwargs,
)
return [
@@ -696,6 +739,10 @@ class Qdrant(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
+ filter: Optional[MetadataFilter] = None,
+ search_params: Optional[common_types.SearchParams] = None,
+ score_threshold: Optional[float] = None,
+ consistency: Optional[common_types.ReadConsistency] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@@ -712,12 +759,41 @@ class Qdrant(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
+ filter: Filter by metadata. Defaults to None.
+ search_params: Additional search params
+ score_threshold:
+ Define a minimal score threshold for the result.
+ If defined, less similar results will not be returned.
+ Score of the returned result might be higher or smaller than the
+ threshold depending on the Distance function used.
+ E.g. for cosine similarity only higher scores will be returned.
+ consistency:
+ Read consistency of the search. Defines how many replicas should be
+ queried before returning the result.
+ Values:
+ - int - number of replicas to query, values should present in all
+ queried replicas
+ - 'majority' - query all replicas, but return values present in the
+ majority of replicas
+ - 'quorum' - query the majority of replicas, return values present in
+ all of them
+ - 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to QdrantClient.search()
Returns:
List of Documents selected by maximal marginal relevance.
"""
query_embedding = self._embed_query(query)
return self.max_marginal_relevance_search_by_vector(
- query_embedding, k, fetch_k, lambda_mult, **kwargs
+ query_embedding,
+ k=k,
+ fetch_k=fetch_k,
+ lambda_mult=lambda_mult,
+ filter=filter,
+ search_params=search_params,
+ score_threshold=score_threshold,
+ consistency=consistency,
+ **kwargs,
)
@sync_call_fallback
@@ -727,6 +803,10 @@ class Qdrant(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
+ filter: Optional[MetadataFilter] = None,
+ search_params: Optional[common_types.SearchParams] = None,
+ score_threshold: Optional[float] = None,
+ consistency: Optional[common_types.ReadConsistency] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@@ -743,12 +823,42 @@ class Qdrant(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
+ filter: Filter by metadata. Defaults to None.
+ search_params: Additional search params
+ score_threshold:
+ Define a minimal score threshold for the result.
+ If defined, less similar results will not be returned.
+ Score of the returned result might be higher or smaller than the
+ threshold depending on the Distance function used.
+ E.g. for cosine similarity only higher scores will be returned.
+ consistency:
+ Read consistency of the search. Defines how many replicas should be
+ queried before returning the result.
+ Values:
+ - int - number of replicas to query, values should present in all
+ queried replicas
+ - 'majority' - query all replicas, but return values present in the
+ majority of replicas
+ - 'quorum' - query the majority of replicas, return values present in
+ all of them
+ - 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to
+ QdrantClient.async_grpc_points.Search().
Returns:
List of Documents selected by maximal marginal relevance.
"""
query_embedding = self._embed_query(query)
return await self.amax_marginal_relevance_search_by_vector(
- query_embedding, k, fetch_k, lambda_mult, **kwargs
+ query_embedding,
+ k=k,
+ fetch_k=fetch_k,
+ lambda_mult=lambda_mult,
+ filter=filter,
+ search_params=search_params,
+ score_threshold=score_threshold,
+ consistency=consistency,
+ **kwargs,
)
def max_marginal_relevance_search_by_vector(
@@ -757,6 +867,10 @@ class Qdrant(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
+ filter: Optional[MetadataFilter] = None,
+ search_params: Optional[common_types.SearchParams] = None,
+ score_threshold: Optional[float] = None,
+ consistency: Optional[common_types.ReadConsistency] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@@ -772,11 +886,40 @@ class Qdrant(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
+ filter: Filter by metadata. Defaults to None.
+ search_params: Additional search params
+ score_threshold:
+ Define a minimal score threshold for the result.
+ If defined, less similar results will not be returned.
+ Score of the returned result might be higher or smaller than the
+ threshold depending on the Distance function used.
+ E.g. for cosine similarity only higher scores will be returned.
+ consistency:
+ Read consistency of the search. Defines how many replicas should be
+ queried before returning the result.
+ Values:
+ - int - number of replicas to query, values should present in all
+ queried replicas
+ - 'majority' - query all replicas, but return values present in the
+ majority of replicas
+ - 'quorum' - query the majority of replicas, return values present in
+ all of them
+ - 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to QdrantClient.search()
Returns:
List of Documents selected by maximal marginal relevance.
"""
results = self.max_marginal_relevance_search_with_score_by_vector(
- embedding=embedding, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, **kwargs
+ embedding,
+ k=k,
+ fetch_k=fetch_k,
+ lambda_mult=lambda_mult,
+ filter=filter,
+ search_params=search_params,
+ score_threshold=score_threshold,
+ consistency=consistency,
+ **kwargs,
)
return list(map(itemgetter(0), results))
@@ -787,6 +930,10 @@ class Qdrant(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
+ filter: Optional[MetadataFilter] = None,
+ search_params: Optional[common_types.SearchParams] = None,
+ score_threshold: Optional[float] = None,
+ consistency: Optional[common_types.ReadConsistency] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@@ -801,12 +948,42 @@ class Qdrant(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
+ filter: Filter by metadata. Defaults to None.
+ search_params: Additional search params
+ score_threshold:
+ Define a minimal score threshold for the result.
+ If defined, less similar results will not be returned.
+ Score of the returned result might be higher or smaller than the
+ threshold depending on the Distance function used.
+ E.g. for cosine similarity only higher scores will be returned.
+ consistency:
+ Read consistency of the search. Defines how many replicas should be
+ queried before returning the result.
+ Values:
+ - int - number of replicas to query, values should present in all
+ queried replicas
+ - 'majority' - query all replicas, but return values present in the
+ majority of replicas
+ - 'quorum' - query the majority of replicas, return values present in
+ all of them
+ - 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to
+ QdrantClient.async_grpc_points.Search().
Returns:
List of Documents selected by maximal marginal relevance and distance for
each.
"""
results = await self.amax_marginal_relevance_search_with_score_by_vector(
- embedding, k, fetch_k, lambda_mult, **kwargs
+ embedding,
+ k=k,
+ fetch_k=fetch_k,
+ lambda_mult=lambda_mult,
+ filter=filter,
+ search_params=search_params,
+ score_threshold=score_threshold,
+ consistency=consistency,
+ **kwargs,
)
return list(map(itemgetter(0), results))
@@ -816,6 +993,10 @@ class Qdrant(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
+ filter: Optional[MetadataFilter] = None,
+ search_params: Optional[common_types.SearchParams] = None,
+ score_threshold: Optional[float] = None,
+ consistency: Optional[common_types.ReadConsistency] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs selected using the maximal marginal relevance.
@@ -830,6 +1011,27 @@ class Qdrant(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
+ filter: Filter by metadata. Defaults to None.
+ search_params: Additional search params
+ score_threshold:
+ Define a minimal score threshold for the result.
+ If defined, less similar results will not be returned.
+ Score of the returned result might be higher or smaller than the
+ threshold depending on the Distance function used.
+ E.g. for cosine similarity only higher scores will be returned.
+ consistency:
+ Read consistency of the search. Defines how many replicas should be
+ queried before returning the result.
+ Values:
+ - int - number of replicas to query, values should present in all
+ queried replicas
+ - 'majority' - query all replicas, but return values present in the
+ majority of replicas
+ - 'quorum' - query the majority of replicas, return values present in
+ all of them
+ - 'all' - query all replicas, and return values present in all replicas
+ **kwargs:
+ Any other named arguments to pass through to QdrantClient.search()
Returns:
List of Documents selected by maximal marginal relevance and distance for
each.
@@ -841,9 +1043,14 @@ class Qdrant(VectorStore):
results = self.client.search(
collection_name=self.collection_name,
query_vector=query_vector,
+ query_filter=filter,
+ search_params=search_params,
+ limit=fetch_k,
with_payload=True,
with_vectors=True,
- limit=fetch_k,
+ score_threshold=score_threshold,
+ consistency=consistency,
+ **kwargs,
)
embeddings = [
result.vector.get(self.vector_name) # type: ignore[index, union-attr]
@@ -871,6 +1078,10 @@ class Qdrant(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
+ filter: Optional[MetadataFilter] = None,
+ search_params: Optional[common_types.SearchParams] = None,
+ score_threshold: Optional[float] = None,
+ consistency: Optional[common_types.ReadConsistency] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs selected using the maximal marginal relevance.
@@ -889,18 +1100,17 @@ class Qdrant(VectorStore):
List of Documents selected by maximal marginal relevance and distance for
each.
"""
- from qdrant_client import grpc # noqa
from qdrant_client.conversions.conversion import GrpcToRest
- response = await self.client.async_grpc_points.Search(
- grpc.SearchPoints(
- collection_name=self.collection_name,
- vector_name=self.vector_name,
- vector=embedding,
- with_payload=grpc.WithPayloadSelector(enable=True),
- with_vectors=grpc.WithVectorsSelector(enable=True),
- limit=fetch_k,
- )
+ response = await self._asearch_with_score_by_vector(
+ embedding,
+ k=fetch_k,
+ filter=filter,
+ search_params=search_params,
+ score_threshold=score_threshold,
+ consistency=consistency,
+ with_vectors=True,
+ **kwargs,
)
results = [
GrpcToRest.convert_vectors(result.vectors) for result in response.result
diff --git a/libs/langchain/tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py b/libs/langchain/tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py
index 5a383b36c..71d1643b7 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py
@@ -1,6 +1,7 @@
from typing import Optional
import pytest
+from qdrant_client import models
from langchain.schema import Document
from langchain.vectorstores import Qdrant
@@ -20,6 +21,17 @@ def test_qdrant_max_marginal_relevance_search(
vector_name: Optional[str],
) -> None:
"""Test end to end construction and MRR search."""
+ filter = models.Filter(
+ must=[
+ models.FieldCondition(
+ key=f"{metadata_payload_key}.page",
+ match=models.MatchValue(
+ value=2,
+ ),
+ ),
+ ],
+ )
+
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Qdrant.from_texts(
@@ -40,3 +52,10 @@ def test_qdrant_max_marginal_relevance_search(
Document(page_content="foo", metadata={"page": 0}),
Document(page_content="baz", metadata={"page": 2}),
]
+
+ output = docsearch.max_marginal_relevance_search(
+ "foo", k=2, fetch_k=3, lambda_mult=0.0, filter=filter
+ )
+ assert output == [
+ Document(page_content="baz", metadata={"page": 2}),
+ ]
From a0800c9f159bdb746ef34a4868f1a5efbbd1c3c1 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 14:20:58 -0700
Subject: [PATCH 116/143] rm google api core and add more dependency testing
(#9721)
---
libs/langchain/poetry.lock | 3 +--
libs/langchain/pyproject.toml | 7 +++----
.../tests/unit_tests/test_dependencies.py | 15 ++++++++++++---
3 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock
index 5b10733d9..badf9ec89 100644
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
@@ -3542,7 +3542,6 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
- {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@@ -10448,4 +10447,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
-content-hash = "fd56d0cf338f6efea449244f3e9e719ca6872dd4b3e136ccd67fd82912912cc2"
+content-hash = "88e479307b19d991105360780f67ed3258ef1a0151f70b9e91c86c8153751e83"
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index a35570213..bc626b415 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -125,7 +125,6 @@ newspaper3k = {version = "^0.2.8", optional = true}
amazon-textract-caller = {version = "<2", optional = true}
xata = {version = "^1.0.0a7", optional = true}
xmltodict = {version = "^0.13.0", optional = true}
-google-api-core = {version = "^2.11.1", optional = true}
markdownify = {version = "^0.11.6", optional = true}
assemblyai = {version = "^0.17.0", optional = true}
@@ -307,7 +306,7 @@ extended_testing = [
"chardet",
"esprima",
"jq",
- "pdfminer.six",
+ "pdfminer-six",
"pgvector",
"pypdf",
"pymupdf",
@@ -321,7 +320,7 @@ extended_testing = [
"telethon",
"psychicapi",
"gql",
- "requests_toolbelt",
+ "requests-toolbelt",
"html2text",
"py-trello",
"scikit-learn",
@@ -331,7 +330,7 @@ extended_testing = [
"sympy",
"rapidfuzz",
"openai",
- "rank_bm25",
+ "rank-bm25",
"geopandas",
"jinja2",
"gitpython",
diff --git a/libs/langchain/tests/unit_tests/test_dependencies.py b/libs/langchain/tests/unit_tests/test_dependencies.py
index e446485ca..2e40a7ab4 100644
--- a/libs/langchain/tests/unit_tests/test_dependencies.py
+++ b/libs/langchain/tests/unit_tests/test_dependencies.py
@@ -26,10 +26,13 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
# Get the dependencies from the [tool.poetry.dependencies] section
dependencies = poetry_conf["dependencies"]
- required_dependencies = [
- package_name
+ is_required = {
+ package_name: isinstance(requirements, str)
+ or not requirements.get("optional", False)
for package_name, requirements in dependencies.items()
- if isinstance(requirements, str) or not requirements.get("optional", False)
+ }
+ required_dependencies = [
+ package_name for package_name, required in is_required.items() if required
]
assert sorted(required_dependencies) == [
@@ -47,6 +50,12 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
"tenacity",
]
+ unrequired_dependencies = [
+ package_name for package_name, required in is_required.items() if not required
+ ]
+ in_extras = [dep for group in poetry_conf["extras"].values() for dep in group]
+ assert set(unrequired_dependencies) == set(in_extras)
+
def test_test_group_dependencies(poetry_conf: Mapping[str, Any]) -> None:
"""Check if someone is attempting to add additional test dependencies.
From 0f48e6c36eb23a5c5fdcd3c15ec31fa0c4dfd5f5 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 15:06:53 -0700
Subject: [PATCH 117/143] fix integration deps (#9722)
---
.../integration_tests/document_loaders/test_polars_dataframe.py | 2 ++
.../tests/integration_tests/vectorstores/test_elasticsearch.py | 2 +-
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py
index b9e4727d2..03f507012 100644
--- a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py
+++ b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
from typing import TYPE_CHECKING
import pytest
diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py b/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py
index f2e451ee7..dafba4b1f 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py
@@ -5,7 +5,6 @@ import uuid
from typing import Generator, List, Union
import pytest
-from elasticsearch.helpers import BulkIndexError
from langchain.docstore.document import Document
from langchain.vectorstores.elasticsearch import ElasticsearchStore
@@ -585,6 +584,7 @@ class TestElasticsearch:
caplog: pytest.LogCaptureFixture,
) -> None:
"""Test bulk exception logging is giving better hints."""
+ from elasticsearch.helpers import BulkIndexError
docsearch = ElasticsearchStore(
embedding=ConsistentFakeEmbeddings(),
From dff00ea91ec2101c2886da8d06eeb14b37eca65b Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Thu, 24 Aug 2023 17:23:27 -0700
Subject: [PATCH 118/143] Chat Loaders (#9708)
Still working out interface/notebooks + need discord data dump to test
out things other than copy+paste
Update:
- Going to remove the 'user_id' arg in the loaders themselves and just
standardize on putting the "sender" arg in the extra kwargs. Then can
provide a utility function to map these to ai and human messages
- Going to move the discord one into just a notebook since I don't have
a good dump to test on and copy+paste maybe isn't the greatest thing to
support in v0
- Need to do more testing on slack since it seems the dump only includes
channels and NOT 1 on 1 convos
-
---------
Co-authored-by: Harrison Chase
---
.../integrations/chat_loaders/discord.ipynb | 325 ++++++++++
.../integrations/chat_loaders/facebook.ipynb | 579 ++++++++++++++++++
.../integrations/chat_loaders/index.mdx | 188 ++++++
.../integrations/chat_loaders/slack.ipynb | 163 +++++
.../integrations/chat_loaders/telegram.ipynb | 206 +++++++
.../integrations/chat_loaders/whatsapp.ipynb | 204 ++++++
libs/langchain/langchain/adapters/openai.py | 17 +
.../langchain/chat_loaders/__init__.py | 6 +
libs/langchain/langchain/chat_loaders/base.py | 31 +
.../chat_loaders/facebook_messenger.py | 77 +++
.../langchain/langchain/chat_loaders/slack.py | 84 +++
.../langchain/chat_loaders/telegram.py | 152 +++++
.../langchain/langchain/chat_loaders/utils.py | 86 +++
.../langchain/chat_loaders/whatsapp.py | 116 ++++
.../tests/unit_tests/chat_loaders/__init__.py | 0
.../data/telegram_chat_html/messages.html | 166 +++++
.../data/telegram_chat_json/result.json | 67 ++
.../chat_loaders/data/whatsapp_chat.txt | 12 +
.../unit_tests/chat_loaders/test_slack.py | 24 +
.../unit_tests/chat_loaders/test_telegram.py | 97 +++
.../unit_tests/chat_loaders/test_whatsapp.py | 20 +
21 files changed, 2620 insertions(+)
create mode 100644 docs/extras/integrations/chat_loaders/discord.ipynb
create mode 100644 docs/extras/integrations/chat_loaders/facebook.ipynb
create mode 100644 docs/extras/integrations/chat_loaders/index.mdx
create mode 100644 docs/extras/integrations/chat_loaders/slack.ipynb
create mode 100644 docs/extras/integrations/chat_loaders/telegram.ipynb
create mode 100644 docs/extras/integrations/chat_loaders/whatsapp.ipynb
create mode 100644 libs/langchain/langchain/chat_loaders/__init__.py
create mode 100644 libs/langchain/langchain/chat_loaders/base.py
create mode 100644 libs/langchain/langchain/chat_loaders/facebook_messenger.py
create mode 100644 libs/langchain/langchain/chat_loaders/slack.py
create mode 100644 libs/langchain/langchain/chat_loaders/telegram.py
create mode 100644 libs/langchain/langchain/chat_loaders/utils.py
create mode 100644 libs/langchain/langchain/chat_loaders/whatsapp.py
create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/__init__.py
create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_html/messages.html
create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_json/result.json
create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/data/whatsapp_chat.txt
create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/test_slack.py
create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/test_telegram.py
create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/test_whatsapp.py
diff --git a/docs/extras/integrations/chat_loaders/discord.ipynb b/docs/extras/integrations/chat_loaders/discord.ipynb
new file mode 100644
index 000000000..973b59640
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/discord.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "c4ff9336-1cf3-459e-bd70-d1314c1da6a0",
+ "metadata": {},
+ "source": [
+ "# Discord\n",
+ "\n",
+ "This notebook shows how to create your own chat loader that converts copy-pasted messages (from DMs) into a list of LangChain messages.\n",
+ "\n",
+ "The process has four steps:\n",
+ "1. Create the chat .txt file by copying chats from the Discord app and pasting them in a file on your local computer\n",
+ "2. Copy the chat loader definition from below to a local file.\n",
+ "3. Initialize the `DiscordChatLoader` with the file path pointing to the text file.\n",
+ "4. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n",
+ "\n",
+ "## 1. Create message dump\n",
+ "\n",
+ "Currently (2023/08/23) this loader only supports .txt files in the format generated by copying messages in the app to your clipboard and pasting in a file. Below is an example."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e4ccfdfa-6869-4d67-90a0-ab99f01b7553",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overwriting discord_chats.txt\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%writefile discord_chats.txt\n",
+ "talkingtower — 08/15/2023 11:10 AM\n",
+ "Love music! Do you like jazz?\n",
+ "reporterbob — 08/15/2023 9:27 PM\n",
+ "Yes! Jazz is fantastic. Ever heard this one?\n",
+ "Website\n",
+ "Listen to classic jazz track...\n",
+ "\n",
+ "talkingtower — Yesterday at 5:03 AM\n",
+ "Indeed! Great choice. 🎷\n",
+ "reporterbob — Yesterday at 5:23 AM\n",
+ "Thanks! How about some virtual sightseeing?\n",
+ "Website\n",
+ "Virtual tour of famous landmarks...\n",
+ "\n",
+ "talkingtower — Today at 2:38 PM\n",
+ "Sounds fun! Let's explore.\n",
+ "reporterbob — Today at 2:56 PM\n",
+ "Enjoy the tour! See you around.\n",
+ "talkingtower — Today at 3:00 PM\n",
+ "Thank you! Goodbye! 👋\n",
+ "reporterbob — Today at 3:02 PM\n",
+ "Farewell! Happy exploring."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "359565a7-dad3-403c-a73c-6414b1295127",
+ "metadata": {},
+ "source": [
+ "## 2. Define chat loader\n",
+ "\n",
+ "LangChain currently does not include a built-in Discord chat loader, so we define one here."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a429e0c4-4d7d-45f8-bbbb-c7fc5229f6af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import logging\n",
+ "import re\n",
+ "from typing import Iterator, List\n",
+ "\n",
+ "from langchain import schema\n",
+ "from langchain.chat_loaders import base as chat_loaders\n",
+ "\n",
+ "logger = logging.getLogger()\n",
+ "\n",
+ "\n",
+ "class DiscordChatLoader(chat_loaders.BaseChatLoader):\n",
+ " \n",
+ " def __init__(self, path: str):\n",
+ " \"\"\"\n",
+ " Initialize the Discord chat loader.\n",
+ "\n",
+ " Args:\n",
+ " path: Path to the exported Discord chat text file.\n",
+ " \"\"\"\n",
+ " self.path = path\n",
+ " self._message_line_regex = re.compile(\n",
+ " r\"(.+?) — (\\w{3,9} \\d{1,2}(?:st|nd|rd|th)?(?:, \\d{4})? \\d{1,2}:\\d{2} (?:AM|PM)|Today at \\d{1,2}:\\d{2} (?:AM|PM)|Yesterday at \\d{1,2}:\\d{2} (?:AM|PM))\", # noqa\n",
+ " flags=re.DOTALL,\n",
+ " )\n",
+ "\n",
+ " def _load_single_chat_session_from_txt(\n",
+ " self, file_path: str\n",
+ " ) -> chat_loaders.ChatSession:\n",
+ " \"\"\"\n",
+ " Load a single chat session from a text file.\n",
+ "\n",
+ " Args:\n",
+ " file_path: Path to the text file containing the chat messages.\n",
+ "\n",
+ " Returns:\n",
+ " A `ChatSession` object containing the loaded chat messages.\n",
+ " \"\"\"\n",
+ " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+ " lines = file.readlines()\n",
+ "\n",
+ " results: List[schema.BaseMessage] = []\n",
+ " current_sender = None\n",
+ " current_timestamp = None\n",
+ " current_content = []\n",
+ " for line in lines:\n",
+ " if re.match(\n",
+ " r\".+? — (\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2} (?:AM|PM)|Today at \\d{1,2}:\\d{2} (?:AM|PM)|Yesterday at \\d{1,2}:\\d{2} (?:AM|PM))\", # noqa\n",
+ " line,\n",
+ " ):\n",
+ " if current_sender and current_content:\n",
+ " results.append(\n",
+ " schema.HumanMessage(\n",
+ " content=\"\".join(current_content).strip(),\n",
+ " additional_kwargs={\n",
+ " \"sender\": current_sender,\n",
+ " \"events\": [{\"message_time\": current_timestamp}],\n",
+ " },\n",
+ " )\n",
+ " )\n",
+ " current_sender, current_timestamp = line.split(\" — \")[:2]\n",
+ " current_content = [\n",
+ " line[len(current_sender) + len(current_timestamp) + 4 :].strip()\n",
+ " ]\n",
+ " elif re.match(r\"\\[\\d{1,2}:\\d{2} (?:AM|PM)\\]\", line.strip()):\n",
+ " results.append(\n",
+ " schema.HumanMessage(\n",
+ " content=\"\".join(current_content).strip(),\n",
+ " additional_kwargs={\n",
+ " \"sender\": current_sender,\n",
+ " \"events\": [{\"message_time\": current_timestamp}],\n",
+ " },\n",
+ " )\n",
+ " )\n",
+ " current_timestamp = line.strip()[1:-1]\n",
+ " current_content = []\n",
+ " else:\n",
+ " current_content.append(\"\\n\" + line.strip())\n",
+ "\n",
+ " if current_sender and current_content:\n",
+ " results.append(\n",
+ " schema.HumanMessage(\n",
+ " content=\"\".join(current_content).strip(),\n",
+ " additional_kwargs={\n",
+ " \"sender\": current_sender,\n",
+ " \"events\": [{\"message_time\": current_timestamp}],\n",
+ " },\n",
+ " )\n",
+ " )\n",
+ "\n",
+ " return chat_loaders.ChatSession(messages=results)\n",
+ "\n",
+ " def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:\n",
+ " \"\"\"\n",
+ " Lazy load the messages from the chat file and yield them in the required format.\n",
+ "\n",
+ " Yields:\n",
+ " A `ChatSession` object containing the loaded chat messages.\n",
+ " \"\"\"\n",
+ " yield self._load_single_chat_session_from_txt(self.path)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c8240393-48be-44d2-b0d6-52c215cd8ac2",
+ "metadata": {},
+ "source": [
+ "## 2. Create loader\n",
+ "\n",
+ "We will point to the file we just wrote to disk."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "1268de40-b0e5-445d-9cd8-54856cd0293a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = DiscordChatLoader(\n",
+ " path=\"./discord_chats.txt\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4928df4b-ae31-48a7-bd76-be3ecee1f3e0",
+ "metadata": {},
+ "source": [
+ "## 3. Load Messages\n",
+ "\n",
+ "Assuming the format is correct, the loader will convert the chats to langchain messages."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "c8a0836d-4a22-4790-bfe9-97f2145bb0d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import List\n",
+ "from langchain.chat_loaders.base import ChatSession\n",
+ "from langchain.chat_loaders.utils import (\n",
+ " map_ai_messages,\n",
+ " merge_chat_runs,\n",
+ ")\n",
+ "\n",
+ "raw_messages = loader.lazy_load()\n",
+ "# Merge consecutive messages from the same sender into a single message\n",
+ "merged_messages = merge_chat_runs(raw_messages)\n",
+ "# Convert messages from \"talkingtower\" to AI messages\n",
+ "messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"talkingtower\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "1913963b-c44e-4f7a-aba7-0423c9b8bd59",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'messages': [AIMessage(content='Love music! Do you like jazz?', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': '08/15/2023 11:10 AM\\n'}]}, example=False),\n",
+ " HumanMessage(content='Yes! Jazz is fantastic. Ever heard this one?\\nWebsite\\nListen to classic jazz track...', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': '08/15/2023 9:27 PM\\n'}]}, example=False),\n",
+ " AIMessage(content='Indeed! Great choice. 🎷', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Yesterday at 5:03 AM\\n'}]}, example=False),\n",
+ " HumanMessage(content='Thanks! How about some virtual sightseeing?\\nWebsite\\nVirtual tour of famous landmarks...', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Yesterday at 5:23 AM\\n'}]}, example=False),\n",
+ " AIMessage(content=\"Sounds fun! Let's explore.\", additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Today at 2:38 PM\\n'}]}, example=False),\n",
+ " HumanMessage(content='Enjoy the tour! See you around.', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Today at 2:56 PM\\n'}]}, example=False),\n",
+ " AIMessage(content='Thank you! Goodbye! 👋', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Today at 3:00 PM\\n'}]}, example=False),\n",
+ " HumanMessage(content='Farewell! Happy exploring.', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Today at 3:02 PM\\n'}]}, example=False)]}]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "messages"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8595a518-5c89-44aa-94a7-ca51e7e2a5fa",
+ "metadata": {},
+ "source": [
+ "### Next Steps\n",
+ "\n",
+ "You can then use these messages how you see fit, such as finetuning a model, few-shot example selection, or directly make predictions for the next message "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "08ff0a1e-fca0-4da3-aacd-d7401f99d946",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Thank you! Have a wonderful day! 🌟"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "llm = ChatOpenAI()\n",
+ "\n",
+ "for chunk in llm.stream(messages[0]['messages']):\n",
+ " print(chunk.content, end=\"\", flush=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "50a5251f-074a-4a3c-a2b0-b1de85e0ac6a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/integrations/chat_loaders/facebook.ipynb b/docs/extras/integrations/chat_loaders/facebook.ipynb
new file mode 100644
index 000000000..ca4ddce0e
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/facebook.ipynb
@@ -0,0 +1,579 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "e4bd269b",
+ "metadata": {},
+ "source": [
+ "# Facebook Messenger\n",
+ "\n",
+ "This notebook shows how to load data from Facebook in a format you can finetune on. The overall steps are:\n",
+ "\n",
+ "1. Download your messenger data to disk.\n",
+ "2. Create the Chat Loader and call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n",
+ "3. Optionally use `merge_chat_runs` to combine messages from the same sender in sequence, and/or `map_ai_messages` to convert messages from the specified sender to the \"AIMessage\" class. Once you've done this, call `convert_messages_for_finetuning` to prepare your data for fine-tuning.\n",
+ "\n",
+ "\n",
+ "Once this has been done, you can fine-tune your model. To do so you would complete the following steps:\n",
+ "\n",
+ "4. Upload your messages to OpenAI and run a fine-tuning job.\n",
+ "5. Use the resulting model in your LangChain app!\n",
+ "\n",
+ "\n",
+ "Let's begin.\n",
+ "\n",
+ "\n",
+ "## 1. Download Data\n",
+ "\n",
+ "To download your own messenger data, follow the instructions [here](https://www.zapptales.com/en/download-facebook-messenger-chat-history-how-to/). IMPORTANT - make sure to download them in JSON format (not HTML).\n",
+ "\n",
+ "We are hosting an example dump at [this google drive link](https://drive.google.com/file/d/1rh1s1o2i7B-Sk1v9o8KNgivLVGwJ-osV/view?usp=sharing) that we will use in this walkthrough."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "647f2158-a42e-4634-b283-b8492caf542a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File file.zip downloaded.\n",
+ "File file.zip has been unzipped.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# This uses some example data\n",
+ "import requests\n",
+ "import zipfile\n",
+ "\n",
+ "def download_and_unzip(url: str, output_path: str = 'file.zip') -> None:\n",
+ " file_id = url.split('/')[-2]\n",
+ " download_url = f'https://drive.google.com/uc?export=download&id={file_id}'\n",
+ "\n",
+ " response = requests.get(download_url)\n",
+ " if response.status_code != 200:\n",
+ " print('Failed to download the file.')\n",
+ " return\n",
+ "\n",
+ " with open(output_path, 'wb') as file:\n",
+ " file.write(response.content)\n",
+ " print(f'File {output_path} downloaded.')\n",
+ "\n",
+ " with zipfile.ZipFile(output_path, 'r') as zip_ref:\n",
+ " zip_ref.extractall()\n",
+ " print(f'File {output_path} has been unzipped.')\n",
+ "\n",
+ "# URL of the file to download\n",
+ "url = 'https://drive.google.com/file/d/1rh1s1o2i7B-Sk1v9o8KNgivLVGwJ-osV/view?usp=sharing'\n",
+ "\n",
+ "# Download and unzip\n",
+ "download_and_unzip(url)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "48ef8bb1-fc28-453c-835a-94a552f05a91",
+ "metadata": {},
+ "source": [
+ "## 2. Create Chat Loader\n",
+ "\n",
+ "We have 2 different `FacebookMessengerChatLoader` classes, one for an entire directory of chats, and one to load individual files. We will demonstrate both below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "a0869bc6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "directory_path = \"./hogwarts\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "0460bf25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_loaders.facebook_messenger import (\n",
+ " SingleFileFacebookMessengerChatLoader,\n",
+ " FolderFacebookMessengerChatLoader,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "f61ee277",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = SingleFileFacebookMessengerChatLoader(\n",
+ " path=\"./hogwarts/inbox/HermioneGranger/messages_Hermione_Granger.json\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ec466ad7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[HumanMessage(content=\"Hi Hermione! How's your summer going so far?\", additional_kwargs={'sender': 'Harry Potter'}, example=False),\n",
+ " HumanMessage(content=\"Harry! Lovely to hear from you. My summer is going well, though I do miss everyone. I'm spending most of my time going through my books and researching fascinating new topics. How about you?\", additional_kwargs={'sender': 'Hermione Granger'}, example=False),\n",
+ " HumanMessage(content=\"I miss you all too. The Dursleys are being their usual unpleasant selves but I'm getting by. At least I can practice some spells in my room without them knowing. Let me know if you find anything good in your researching!\", additional_kwargs={'sender': 'Harry Potter'}, example=False)]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chat_session = loader.load()[0]\n",
+ "chat_session[\"messages\"][:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "8a3ee473",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = FolderFacebookMessengerChatLoader(\n",
+ " path=\"./hogwarts\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "9f41e122",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chat_sessions = loader.load()\n",
+ "len(chat_sessions)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4aa3580-adc1-4b48-9bba-0e8e8d9f44ce",
+ "metadata": {},
+ "source": [
+ "## 3. Prepare for fine-tuning\n",
+ "\n",
+ "Calling `load()` returns all the chat messages we could extract as human messages. When conversing with chat bots, conversations typically follow a more strict alternating dialogue pattern relative to real conversations. \n",
+ "\n",
+ "You can choose to merge message \"runs\" (consecutive messages from the same sender) and select a sender to represent the \"AI\". The fine-tuned LLM will learn to generate these AI messages."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "5a78030d-b757-4bbe-8a6c-841056f46df7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_loaders.utils import (\n",
+ " merge_chat_runs,\n",
+ " map_ai_messages,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "ff35b028-78bf-4c5b-9ec6-939fe67de7f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_sessions = merge_chat_runs(chat_sessions)\n",
+ "alternating_sessions = list(map_ai_messages(merged_sessions, \"Harry Potter\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "4b11906e-a496-4d01-9f0d-1938c14147bf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[AIMessage(content=\"Professor Snape, I was hoping I could speak with you for a moment about something that's been concerning me lately.\", additional_kwargs={'sender': 'Harry Potter'}, example=False),\n",
+ " HumanMessage(content=\"What is it, Potter? I'm quite busy at the moment.\", additional_kwargs={'sender': 'Severus Snape'}, example=False),\n",
+ " AIMessage(content=\"I apologize for the interruption, sir. I'll be brief. I've noticed some strange activity around the school grounds at night. I saw a cloaked figure lurking near the Forbidden Forest last night. I'm worried someone may be plotting something sinister.\", additional_kwargs={'sender': 'Harry Potter'}, example=False)]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Now all of Harry Potter's messages will take the AI message class\n",
+ "# which maps to the 'assistant' role in OpenAI's training format\n",
+ "alternating_sessions[0]['messages'][:3]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d985478d-062e-47b9-ae9a-102f59be07c0",
+ "metadata": {},
+ "source": [
+ "#### Now we can convert to OpenAI format dictionaries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "21372331",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.adapters.openai import convert_messages_for_finetuning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "92c5ae7a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Prepared 9 dialogues for training\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_data = convert_messages_for_finetuning(alternating_sessions)\n",
+ "print(f\"Prepared {len(training_data)} dialogues for training\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "dfcbd181",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'role': 'assistant',\n",
+ " 'content': \"Professor Snape, I was hoping I could speak with you for a moment about something that's been concerning me lately.\"},\n",
+ " {'role': 'user',\n",
+ " 'content': \"What is it, Potter? I'm quite busy at the moment.\"},\n",
+ " {'role': 'assistant',\n",
+ " 'content': \"I apologize for the interruption, sir. I'll be brief. I've noticed some strange activity around the school grounds at night. I saw a cloaked figure lurking near the Forbidden Forest last night. I'm worried someone may be plotting something sinister.\"}]"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "training_data[0][:3]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1a9fd64-4f9f-42d3-b5dc-2a340e51e9e7",
+ "metadata": {},
+ "source": [
+ "OpenAI currently requires at least 10 training examples for a fine-tuning job, though they recommend between 50-100 for most tasks. Since we only have 9 chat sessions, we can subdivide them (optionally with some overlap) so that each training example is comprised of a portion of a whole conversation.\n",
+ "\n",
+ "Facebook chat sessions (1 per person) often span multiple days and conversations,\n",
+ "so the long-range dependencies may not be that important to model anyhow."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "13cd290a-b1e9-4686-bb5e-d99de8b8612b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "100"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Our chat is alternating, we will make each datapoint a group of 8 messages,\n",
+ "# with 2 messages overlapping\n",
+ "chunk_size = 8\n",
+ "overlap = 2\n",
+ "\n",
+ "training_examples = [\n",
+ " conversation_messages[i: i + chunk_size] \n",
+ " for conversation_messages in training_data\n",
+ " for i in range(\n",
+ " 0, len(conversation_messages) - chunk_size + 1, \n",
+ " chunk_size - overlap)\n",
+ "]\n",
+ "\n",
+ "len(training_examples)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cc8baf41-ff07-4492-96bd-b2472ee7cef9",
+ "metadata": {},
+ "source": [
+ "## 4. Fine-tune the model\n",
+ "\n",
+ "It's time to fine-tune the model. Make sure you have `openai` installed\n",
+ "and have set your `OPENAI_API_KEY` appropriately"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "95ce3f63-3c80-44b2-9060-534ad74e16fa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# %pip install -U openai --quiet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "ab9e28eb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File file-zCyNBeg4snpbBL7VkvsuhCz8 ready after 30.55 seconds.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "from io import BytesIO\n",
+ "import time\n",
+ "\n",
+ "import openai\n",
+ "\n",
+ "# We will write the jsonl file in memory\n",
+ "my_file = BytesIO()\n",
+ "for m in training_examples:\n",
+ " my_file.write((json.dumps({\"messages\": m}) + \"\\n\").encode('utf-8'))\n",
+ "\n",
+ "my_file.seek(0)\n",
+ "training_file = openai.File.create(\n",
+ " file=my_file,\n",
+ " purpose='fine-tune'\n",
+ ")\n",
+ "\n",
+ "# OpenAI audits each training file for compliance reasons.\n",
+ "# This may take a few minutes\n",
+ "status = openai.File.retrieve(training_file.id).status\n",
+ "start_time = time.time()\n",
+ "while status != \"processed\":\n",
+ " print(f\"Status=[{status}]... {time.time() - start_time:.2f}s\", end=\"\\r\", flush=True)\n",
+ " time.sleep(5)\n",
+ " status = openai.File.retrieve(training_file.id).status\n",
+ "print(f\"File {training_file.id} ready after {time.time() - start_time:.2f} seconds.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "759a7f51-fde9-4b75-aaa9-e600e6537bd1",
+ "metadata": {},
+ "source": [
+ "With the file ready, it's time to kick off a training job."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "3f451425",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job = openai.FineTuningJob.create(\n",
+ " training_file=training_file.id,\n",
+ " model=\"gpt-3.5-turbo\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "489b23ef-5e14-42a9-bafb-44220ec6960b",
+ "metadata": {},
+ "source": [
+ "Grab a cup of tea while your model is being prepared. This may take some time!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "bac1637a-c087-4523-ade1-c47f9bf4c6f4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Status=[running]... 908.87s\r"
+ ]
+ }
+ ],
+ "source": [
+ "status = openai.FineTuningJob.retrieve(job.id).status\n",
+ "start_time = time.time()\n",
+ "while status != \"succeeded\":\n",
+ " print(f\"Status=[{status}]... {time.time() - start_time:.2f}s\", end=\"\\r\", flush=True)\n",
+ " time.sleep(5)\n",
+ " job = openai.FineTuningJob.retrieve(job.id)\n",
+ " status = job.status"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "535895e1-bc69-40e5-82ed-e24ed2baeeee",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ft:gpt-3.5-turbo-0613:personal::7rDwkaOq\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(job.fine_tuned_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "502ff73b-f9e9-49ce-ba45-401811e57946",
+ "metadata": {},
+ "source": [
+ "## 5. Use in LangChain\n",
+ "\n",
+ "You can use the resulting model ID directly in the `ChatOpenAI` model class."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "3925d60d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "model = ChatOpenAI(\n",
+ " model=job.fine_tuned_model,\n",
+ " temperature=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "7190cf2e-ab34-4ceb-bdad-45f24f069c29",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.prompts import ChatPromptTemplate\n",
+ "from langchain.schema.output_parser import StrOutputParser\n",
+ "\n",
+ "prompt = ChatPromptTemplate.from_messages(\n",
+ " [\n",
+ " (\"human\", \"{input}\"),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "chain = prompt | model | StrOutputParser()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "f02057e9-f914-40b1-9c9d-9432ff594b98",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The usual - Potions, Transfiguration, Defense Against the Dark Arts. What about you?"
+ ]
+ }
+ ],
+ "source": [
+ "for tok in chain.stream({\"input\": \"What classes are you taking?\"}):\n",
+ " print(tok, end=\"\", flush=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "35331503-3cc6-4d64-955e-64afe6b5fef3",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/integrations/chat_loaders/index.mdx b/docs/extras/integrations/chat_loaders/index.mdx
new file mode 100644
index 000000000..be3f38ff5
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/index.mdx
@@ -0,0 +1,188 @@
+---
+sidebar_position: 0
+---
+
+# Chat loaders
+
+Like document loaders, chat loaders are utilities designed to help load conversations from popular communication platforms such as Facebook, Slack, Discord, etc. These are loaded into memory as LangChain chat message objects. Such utilities facilitate tasks such as fine-tuning a language model to match your personal style or voice.
+
+This brief guide illustrates the process using [OpenAI's fine-tuning API](https://platform.openai.com/docs/guides/fine-tuning). It consists of six steps:
+
+1. Export your Facebook Messenger chat data in a compatible format for your intended chat loader.
+2. Load the chat data into memory as LangChain chat message objects. (_this is what is covered in each integration notebook in this section of the documentation_).
+ - Assign a person to the "AI" role and optionally filter, group, and merge messages.
+3. Export these acquired messages in a format expected by the fine-tuning API.
+4. Upload this data to OpenAI.
+5. Fine-tune your model.
+6. Implement the fine-tuned model in LangChain.
+
+This guide is not wholly comprehensive but is designed to take you through the fundamentals of going from raw data to fine-tuned model.
+
+We will demonstrate the procedure through an example of fine-tuning a `gpt-3.5-turbo` model on Facebook Messenger data.
+
+### 1. Export your chat data
+
+To export your Facebook messenger data, you can follow the [instructions here](https://www.zapptales.com/en/download-facebook-messenger-chat-history-how-to/).
+
+:::important JSON format
+You must select "JSON format" (instead of HTML) when exporting your data to be compatible with the current loader.
+:::
+
+OpenAI requires at least 10 examples to fine-tune your model, but they recommend between 50-100 for more optimal results.
+You can use the example data stored at [this google drive link](https://drive.google.com/file/d/1rh1s1o2i7B-Sk1v9o8KNgivLVGwJ-osV/view?usp=sharing) to test the process.
+
+### 2. Load the chat
+
+Once you've obtained your chat data, you can load it into memory as LangChain chat message objects. Here’s an example of loading data using the Python code:
+
+```python
+from langchain.chat_loaders.facebook_messenger import FolderFacebookMessengerChatLoader
+
+loader = FolderFacebookMessengerChatLoader(
+ path="./facebook_messenger_chats",
+)
+
+chat_sessions = loader.load()
+```
+
+In this snippet, we point the loader to a directory of Facebook chat dumps which are then loaded as multiple "sessions" of messages, one session per conversation file.
+
+Once you've loaded the messages, you should decide which person you want to fine-tune the model to (usually yourself). You can also decide to merge consecutive messages from the same sender into a single chat message.
+For both of these tasks, you can use the `chat_loaders` utilities:
+
+```python
+from langchain.chat_loaders.utils import (
+ merge_chat_runs,
+ map_ai_messages,
+)
+
+merged_sessions = merge_chat_runs(chat_sessions)
+alternating_sessions = list(map_ai_messages(merged_sessions, "My Name"))
+```
+
+### 3. Export messages to OpenAI format
+
+Convert the chat messages to dictionaries using the `convert_messages_for_finetuning` function. Then, group the data into chunks for better context modeling and overlap management.
+
+```python
+from langchain.adapters.openai import convert_messages_for_finetuning
+
+openai_messages = convert_messages_for_finetuning(alternating_sessions)
+```
+
+At this point, the data is ready for upload to OpenAI. You can choose to split up conversations into smaller chunks for training if you
+do not have enough conversations to train on. Feel free to play around with different chunk sizes or with adding system messages to the fine-tuning data.
+
+```python
+chunk_size = 8
+overlap = 2
+
+message_groups = [
+ conversation_messages[i: i + chunk_size]
+ for conversation_messages in openai_messages
+ for i in range(
+ 0, len(conversation_messages) - chunk_size + 1,
+ chunk_size - overlap)
+]
+
+len(message_groups)
+# 9
+```
+
+### 4. Upload the data to OpenAI
+
+Ensure you have set your OpenAI API key by following these [instructions](https://platform.openai.com/account/api-keys), then upload the training file.
+An audit is performed to ensure data compliance, so you may have to wait a few minutes for the dataset to become ready for use.
+
+```python
+import time
+import json
+import io
+
+import openai
+
+my_file = io.BytesIO()
+for group in message_groups:
+ my_file.write((json.dumps({"messages": group}) + "\n").encode('utf-8'))
+
+my_file.seek(0)
+training_file = openai.File.create(
+ file=my_file,
+ purpose='fine-tune'
+)
+
+# Wait while the file is processed
+status = openai.File.retrieve(training_file.id).status
+start_time = time.time()
+while status != "processed":
+ print(f"Status=[{status}]... {time.time() - start_time:.2f}s", end="\r", flush=True)
+ time.sleep(5)
+ status = openai.File.retrieve(training_file.id).status
+print(f"File {training_file.id} ready after {time.time() - start_time:.2f} seconds.")
+```
+
+Once this is done, you can proceed to the model training!
+
+### 5. Fine-tune the model
+
+Start the fine-tuning job with your chosen base model.
+
+```python
+job = openai.FineTuningJob.create(
+ training_file=training_file.id,
+ model="gpt-3.5-turbo",
+)
+```
+
+This might take a while. Check the status with `openai.FineTuningJob.retrieve(job.id).status` and wait for it to report `succeeded`.
+
+```python
+# It may take 10-20+ minutes to complete training.
+status = openai.FineTuningJob.retrieve(job.id).status
+start_time = time.time()
+while status != "succeeded":
+ print(f"Status=[{status}]... {time.time() - start_time:.2f}s", end="\r", flush=True)
+ time.sleep(5)
+ job = openai.FineTuningJob.retrieve(job.id)
+ status = job.status
+```
+
+### 6. Use the model in LangChain
+
+You're almost there! Use the fine-tuned model in LangChain.
+
+```python
+from langchain import chat_models
+
+model_name = job.fine_tuned_model
+# Example: ft:gpt-3.5-turbo-0613:personal::5mty86jblapsed
+model = chat_models.ChatOpenAI(model=model_name)
+```
+
+```python
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema.output_parser import StrOutputParser
+
+prompt = ChatPromptTemplate.from_messages(
+ [
+ ("human", "{input}"),
+ ]
+)
+
+chain = prompt | model | StrOutputParser()
+
+for tok in chain.stream({"input": "What classes are you taking?"}):
+ print(tok, end="", flush=True)
+
+# The usual - Potions, Transfiguration, Defense Against the Dark Arts. What about you?
+```
+
+And that's it! You've successfully fine-tuned a model and used it in LangChain.
+
+## Supported Chat Loaders
+
+LangChain currently supports the following chat loaders. Feel free to contribute more!
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
\ No newline at end of file
diff --git a/docs/extras/integrations/chat_loaders/slack.ipynb b/docs/extras/integrations/chat_loaders/slack.ipynb
new file mode 100644
index 000000000..f63cd7cdf
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/slack.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "01fcfa2f-33a9-48f3-835a-b1956c394d6b",
+ "metadata": {},
+ "source": [
+ "# Slack\n",
+ "\n",
+ "This notebook shows how to use the Slack chat loader. This class helps map exported Slack conversations to LangChain chat messages.\n",
+ "\n",
+ "The process has three steps:\n",
+ "1. Export the desired conversation thread by following the [instructions here](https://slack.com/help/articles/1500001548241-Request-to-export-all-conversations).\n",
+ "2. Create the `SlackChatLoader` with the file path pointed to the json file or directory of JSON files\n",
+ "3. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion. Optionally use `merge_chat_runs` to combine messages from the same sender in sequence, and/or `map_ai_messages` to convert messages from the specified sender to the \"AIMessage\" class.\n",
+ "\n",
+ "## 1. Create message dump\n",
+ "\n",
+ "Currently (2023/08/23) this loader best supports a zip directory of files in the format generated by exporting a direct message conversation from Slack. Follow the up-to-date instructions from Slack on how to do so.\n",
+ "\n",
+ "We have an example in the LangChain repo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "a79d35bf-5f21-4063-84bf-a60845c1c51f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "permalink = \"https://raw.githubusercontent.com/langchain-ai/langchain/342087bdfa3ac31d622385d0f2d09cf5e06c8db3/libs/langchain/tests/integration_tests/examples/slack_export.zip\"\n",
+ "response = requests.get(permalink)\n",
+ "with open(\"slack_dump.zip\", \"wb\") as f:\n",
+ " f.write(response.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cf60f703-76f1-4602-a723-02c59535c1af",
+ "metadata": {},
+ "source": [
+ "## 2. Create the Chat Loader\n",
+ "\n",
+ "Provide the loader with the file path to the zip directory. You can optionally specify the user id that maps to an AI message, as well as configure whether to merge message runs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "4b8b432a-d2bc-49e1-b35f-761730a8fd6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_loaders.slack import SlackChatLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "8ec6661b-0aca-48ae-9e2b-6412856c287b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = SlackChatLoader(\n",
+ " path=\"slack_dump.zip\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8805a7c5-84b4-49f5-8989-0022f2054ace",
+ "metadata": {},
+ "source": [
+ "## 3. Load messages\n",
+ "\n",
+ "The `load()` (or `lazy_load`) methods return a list of \"ChatSessions\" that currently just contain a list of messages per loaded conversation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "fcd69b3e-020d-4a15-8a0d-61c2d34e1ee1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import List\n",
+ "from langchain.chat_loaders.base import ChatSession\n",
+ "from langchain.chat_loaders.utils import (\n",
+ " map_ai_messages,\n",
+ " merge_chat_runs,\n",
+ ")\n",
+ "\n",
+ "raw_messages = loader.lazy_load()\n",
+ "# Merge consecutive messages from the same sender into a single message\n",
+ "merged_messages = merge_chat_runs(raw_messages)\n",
+ "# Convert messages from \"U0500003428\" to AI messages\n",
+ "messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"U0500003428\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7d033f87-cd0c-4f44-a753-41b871c1e919",
+ "metadata": {},
+ "source": [
+ "### Next Steps\n",
+ "\n",
+ "You can then use these messages how you see fit, such as finetuning a model, few-shot example selection, or directly make predictions for the next message. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "7d8a1629-5d9e-49b3-b978-3add57027d59",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Hi, \n",
+ "\n",
+ "I hope you're doing well. I wanted to reach out and ask if you'd be available to meet up for coffee sometime next week. I'd love to catch up and hear about what's been going on in your life. Let me know if you're interested and we can find a time that works for both of us. \n",
+ "\n",
+ "Looking forward to hearing from you!\n",
+ "\n",
+ "Best, [Your Name]"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "llm = ChatOpenAI()\n",
+ "\n",
+ "for chunk in llm.stream(messages[1]['messages']):\n",
+ " print(chunk.content, end=\"\", flush=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/integrations/chat_loaders/telegram.ipynb b/docs/extras/integrations/chat_loaders/telegram.ipynb
new file mode 100644
index 000000000..156472691
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/telegram.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "735455a6-f82e-4252-b545-27385ef883f4",
+ "metadata": {},
+ "source": [
+ "# Telegram\n",
+ "\n",
+ "This notebook shows how to use the Telegram chat loader. This class helps map exported Telegram conversations to LangChain chat messages.\n",
+ "\n",
+ "The process has three steps:\n",
+ "1. Export the chat history as a JSON file using the Telegram Desktop app\n",
+ "2. Create the `TelegramChatLoader` with the file path pointed to the json file or directory of JSON files\n",
+ "3. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion. Optionally use `merge_chat_runs` to combine messages from the same sender in sequence, and/or `map_ai_messages` to convert messages from the specified sender to the \"AIMessage\" class.\n",
+ "\n",
+ "## 1. Create message dump\n",
+ "\n",
+ "Currently (2023/08/23) this loader best supports json files in the format generated by exporting your chat history from the [Telegram Desktop App](https://desktop.telegram.org/).\n",
+ "\n",
+ "**Important:** There are 'lite' versions of telegram such as \"Telegram for MacOS\" that lack the export functionality. Please make sure you use the correct app to export the file.\n",
+ "\n",
+ "To make the export:\n",
+ "1. Download and open telegram desktop\n",
+ "2. Select a conversation\n",
+ "3. Navigate to the conversation settings (currently the three dots in the top right corner)\n",
+ "4. Click \"Export Chat History\"\n",
+ "5. Unselect photos and other media. Select \"Machine-readable JSON\" format to export.\n",
+ "\n",
+ "An example is below: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "285f2044-0f58-4b92-addb-9f8569076734",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overwriting telegram_conversation.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%writefile telegram_conversation.json\n",
+ "{\n",
+ " \"name\": \"Jiminy\",\n",
+ " \"type\": \"personal_chat\",\n",
+ " \"id\": 5965280513,\n",
+ " \"messages\": [\n",
+ " {\n",
+ " \"id\": 1,\n",
+ " \"type\": \"message\",\n",
+ " \"date\": \"2023-08-23T13:11:23\",\n",
+ " \"date_unixtime\": \"1692821483\",\n",
+ " \"from\": \"Jiminy Cricket\",\n",
+ " \"from_id\": \"user123450513\",\n",
+ " \"text\": \"You better trust your conscience\",\n",
+ " \"text_entities\": [\n",
+ " {\n",
+ " \"type\": \"plain\",\n",
+ " \"text\": \"You better trust your conscience\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"id\": 2,\n",
+ " \"type\": \"message\",\n",
+ " \"date\": \"2023-08-23T13:13:20\",\n",
+ " \"date_unixtime\": \"1692821600\",\n",
+ " \"from\": \"Batman & Robin\",\n",
+ " \"from_id\": \"user6565661032\",\n",
+ " \"text\": \"What did you just say?\",\n",
+ " \"text_entities\": [\n",
+ " {\n",
+ " \"type\": \"plain\",\n",
+ " \"text\": \"What did you just say?\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ " ]\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7cc109f4-4c92-4cd3-8143-c322776c3f03",
+ "metadata": {},
+ "source": [
+ "## 2. Create the Chat Loader\n",
+ "\n",
+ "All that's required is the file path. You can optionally specify the user name that maps to an AI message, as well as configure whether to merge message runs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "111f7767-573c-42d4-86f0-bd766bbaa071",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_loaders.telegram import TelegramChatLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "a4226efa-2640-4990-a20c-6861d1887329",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = TelegramChatLoader(\n",
+ " path=\"./telegram_conversation.json\", \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71699fb7-7815-4c89-8d96-30e8fada6923",
+ "metadata": {},
+ "source": [
+ "## 3. Load messages\n",
+ "\n",
+ "The `load()` (or `lazy_load`) methods return a list of \"ChatSessions\" that currently just contain a list of messages per loaded conversation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "81121efb-c875-4a77-ad1e-fe26b3d7e812",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import List\n",
+ "from langchain.chat_loaders.base import ChatSession\n",
+ "from langchain.chat_loaders.utils import (\n",
+ " map_ai_messages,\n",
+ " merge_chat_runs,\n",
+ ")\n",
+ "\n",
+ "raw_messages = loader.lazy_load()\n",
+ "# Merge consecutive messages from the same sender into a single message\n",
+ "merged_messages = merge_chat_runs(raw_messages)\n",
+ "# Convert messages from \"Jiminy Cricket\" to AI messages\n",
+ "messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"Jiminy Cricket\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9089c05-7375-41ca-a2f9-672a845314e4",
+ "metadata": {},
+ "source": [
+ "### Next Steps\n",
+ "\n",
+ "You can then use these messages how you see fit, such as finetuning a model, few-shot example selection, or directly make predictions for the next message "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "637a6f5d-6944-4722-9361-a76ef5e9dd2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "I said, \"You better trust your conscience.\""
+ ]
+ }
+ ],
+ "source": [
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "llm = ChatOpenAI()\n",
+ "\n",
+ "for chunk in llm.stream(messages[0]['messages']):\n",
+ " print(chunk.content, end=\"\", flush=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/integrations/chat_loaders/whatsapp.ipynb b/docs/extras/integrations/chat_loaders/whatsapp.ipynb
new file mode 100644
index 000000000..a08155843
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/whatsapp.ipynb
@@ -0,0 +1,204 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "735455a6-f82e-4252-b545-27385ef883f4",
+ "metadata": {},
+ "source": [
+ "# WhatsApp\n",
+ "\n",
+ "This notebook shows how to use the WhatsApp chat loader. This class helps map exported WhatsApp conversations to LangChain chat messages.\n",
+ "\n",
+ "The process has three steps:\n",
+ "1. Export the chat conversations to computer\n",
+ "2. Create the `WhatsAppChatLoader` with the file path pointed to the exported chat `.txt` file\n",
+ "3. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n",
+ "\n",
+ "## 1. Create message dump\n",
+ "\n",
+ "To make the export of your WhatsApp conversation(s), complete the following steps:\n",
+ "\n",
+ "1. Open the target conversation\n",
+ "2. Click the three dots in the top right corner and select \"More\".\n",
+ "3. Then select \"Export chat\" and choose \"Without media\".\n",
+ "\n",
+ "An example of the data format for each conversation is below: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "285f2044-0f58-4b92-addb-9f8569076734",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Writing whatsapp_chat.txt\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%writefile whatsapp_chat.txt\n",
+ "[8/15/23, 9:12:33 AM] Dr. Feather: Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.\n",
+ "[8/15/23, 9:12:43 AM] Dr. Feather: I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!\n",
+ "[8/15/23, 9:12:48 AM] Dr. Feather: image omitted\n",
+ "[8/15/23, 9:13:15 AM] Jungle Jane: That's stunning! Were you able to observe its behavior?\n",
+ "[8/15/23, 9:13:23 AM] Dr. Feather: image omitted\n",
+ "[8/15/23, 9:14:02 AM] Dr. Feather: Yes, it seemed quite social with other macaws. They're known for their playful nature.\n",
+ "[8/15/23, 9:14:15 AM] Jungle Jane: How's the research going on parrot communication?\n",
+ "[8/15/23, 9:14:30 AM] Dr. Feather: image omitted\n",
+ "[8/15/23, 9:14:50 AM] Dr. Feather: It's progressing well. We're learning so much about how they use sound and color to communicate.\n",
+ "[8/15/23, 9:15:10 AM] Jungle Jane: That's fascinating! Can't wait to read your paper on it.\n",
+ "[8/15/23, 9:15:20 AM] Dr. Feather: Thank you! I'll send you a draft soon.\n",
+ "[8/15/23, 9:25:16 PM] Jungle Jane: Looking forward to it! Keep up the great work."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7cc109f4-4c92-4cd3-8143-c322776c3f03",
+ "metadata": {},
+ "source": [
+ "## 2. Create the Chat Loader\n",
+ "\n",
+ "The WhatsAppChatLoader accepts the resulting zip file, unzipped directory, or the path to any of the chat `.txt` files therein.\n",
+ "\n",
+ "Provide that as well as the user name you want to take on the role of \"AI\" when finetuning."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "111f7767-573c-42d4-86f0-bd766bbaa071",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_loaders.whatsapp import WhatsAppChatLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "a4226efa-2640-4990-a20c-6861d1887329",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = WhatsAppChatLoader(\n",
+ " path=\"./whatsapp_chat.txt\", \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71699fb7-7815-4c89-8d96-30e8fada6923",
+ "metadata": {},
+ "source": [
+ "## 3. Load messages\n",
+ "\n",
+ "The `load()` (or `lazy_load`) methods return a list of \"ChatSessions\" that currently store the list of messages per loaded conversation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "81121efb-c875-4a77-ad1e-fe26b3d7e812",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'messages': [AIMessage(content='I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!', additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:12:43 AM'}]}, example=False),\n",
+ " HumanMessage(content=\"That's stunning! Were you able to observe its behavior?\", additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:13:15 AM'}]}, example=False),\n",
+ " AIMessage(content=\"Yes, it seemed quite social with other macaws. They're known for their playful nature.\", additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:14:02 AM'}]}, example=False),\n",
+ " HumanMessage(content=\"How's the research going on parrot communication?\", additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:14:15 AM'}]}, example=False),\n",
+ " AIMessage(content=\"It's progressing well. We're learning so much about how they use sound and color to communicate.\", additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:14:50 AM'}]}, example=False),\n",
+ " HumanMessage(content=\"That's fascinating! Can't wait to read your paper on it.\", additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:15:10 AM'}]}, example=False),\n",
+ " AIMessage(content=\"Thank you! I'll send you a draft soon.\", additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:15:20 AM'}]}, example=False),\n",
+ " HumanMessage(content='Looking forward to it! Keep up the great work.', additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:25:16 PM'}]}, example=False)]}]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from typing import List\n",
+ "from langchain.chat_loaders.base import ChatSession\n",
+ "from langchain.chat_loaders.utils import (\n",
+ " map_ai_messages,\n",
+ " merge_chat_runs,\n",
+ ")\n",
+ "\n",
+ "raw_messages = loader.lazy_load()\n",
+ "# Merge consecutive messages from the same sender into a single message\n",
+ "merged_messages = merge_chat_runs(raw_messages)\n",
+ "# Convert messages from \"Dr. Feather\" to AI messages\n",
+ "messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"Dr. Feather\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9089c05-7375-41ca-a2f9-672a845314e4",
+ "metadata": {},
+ "source": [
+ "### Next Steps\n",
+ "\n",
+ "You can then use these messages how you see fit, such as finetuning a model, few-shot example selection, or directly make predictions for the next message."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "637a6f5d-6944-4722-9361-a76ef5e9dd2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Thank you for the encouragement! I'll do my best to continue studying and sharing fascinating insights about parrot communication."
+ ]
+ }
+ ],
+ "source": [
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "llm = ChatOpenAI()\n",
+ "\n",
+ "for chunk in llm.stream(messages[0]['messages']):\n",
+ " print(chunk.content, end=\"\", flush=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16156643-cfbd-444f-b4ae-198eb44f0267",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/langchain/langchain/adapters/openai.py b/libs/langchain/langchain/adapters/openai.py
index e9f18408c..b846d4f46 100644
--- a/libs/langchain/langchain/adapters/openai.py
+++ b/libs/langchain/langchain/adapters/openai.py
@@ -15,6 +15,7 @@ from typing import (
from typing_extensions import Literal
+from langchain.chat_loaders.base import ChatSession
from langchain.schema.messages import (
AIMessage,
AIMessageChunk,
@@ -206,3 +207,19 @@ class ChatCompletion:
_convert_message_chunk_to_delta(c, i)
async for i, c in aenumerate(model_config.astream(converted_messages))
)
+
+
+def _has_assistant_message(session: ChatSession) -> bool:
+    """Check if chat session has an assistant message.
+
+    Args:
+        session: Chat session whose ``"messages"`` sequence is inspected.
+
+    Returns:
+        True if at least one message is an ``AIMessage``, otherwise False.
+    """
+    # A generator lets any() stop at the first AIMessage instead of first
+    # building a complete list of booleans.
+    return any(isinstance(m, AIMessage) for m in session["messages"])
+
+
+def convert_messages_for_finetuning(
+    sessions: Iterable[ChatSession],
+) -> List[List[dict]]:
+    """Convert chat sessions into lists of message dictionaries for fine-tuning.
+
+    Sessions that contain no assistant (AI) message are left out of the result.
+
+    Args:
+        sessions: The chat sessions to convert.
+
+    Returns:
+        One list per retained session, holding the result of
+        ``convert_message_to_dict`` for each of its messages, in order.
+    """
+    converted: List[List[dict]] = []
+    for session in sessions:
+        if not _has_assistant_message(session):
+            continue
+        converted.append([convert_message_to_dict(m) for m in session["messages"]])
+    return converted
diff --git a/libs/langchain/langchain/chat_loaders/__init__.py b/libs/langchain/langchain/chat_loaders/__init__.py
new file mode 100644
index 000000000..594d87344
--- /dev/null
+++ b/libs/langchain/langchain/chat_loaders/__init__.py
@@ -0,0 +1,6 @@
+"""Load chat messages from common communications platforms for finetuning.
+
+This module provides functions to load chat messages from various
+communications platforms such as Facebook Messenger, Telegram, and
+WhatsApp. The loaded chat messages can be used for finetuning models.
+"""
diff --git a/libs/langchain/langchain/chat_loaders/base.py b/libs/langchain/langchain/chat_loaders/base.py
new file mode 100644
index 000000000..418ba15d2
--- /dev/null
+++ b/libs/langchain/langchain/chat_loaders/base.py
@@ -0,0 +1,31 @@
+"""Base definitions for chat loaders.
+
+A chat loader is a class that loads chat messages from an external
+source such as a file or a database. The chat messages can then be
+used for finetuning.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Iterator, List, Sequence, TypedDict
+
+from langchain.schema.messages import BaseMessage
+
+
+class ChatSession(TypedDict):
+    """A chat session represents a single
+    conversation, channel, or other group of messages.
+
+    This is a ``TypedDict``, so a session is a plain dictionary that is read
+    by key, e.g. ``session["messages"]``.
+    """
+
+    messages: Sequence[BaseMessage]
+    """The LangChain chat messages loaded from the source."""
+
+
+class BaseChatLoader(ABC):
+    """Abstract interface shared by all chat loaders.
+
+    Subclasses implement ``lazy_load``; ``load`` is derived from it.
+    """
+
+    @abstractmethod
+    def lazy_load(self) -> Iterator[ChatSession]:
+        """Yield the chat sessions one at a time."""
+
+    def load(self) -> List[ChatSession]:
+        """Return every chat session produced by ``lazy_load`` as a list."""
+        sessions: List[ChatSession] = []
+        sessions.extend(self.lazy_load())
+        return sessions
diff --git a/libs/langchain/langchain/chat_loaders/facebook_messenger.py b/libs/langchain/langchain/chat_loaders/facebook_messenger.py
new file mode 100644
index 000000000..5864c3274
--- /dev/null
+++ b/libs/langchain/langchain/chat_loaders/facebook_messenger.py
@@ -0,0 +1,77 @@
+import json
+import logging
+from pathlib import Path
+from typing import Iterator, Union
+
+from langchain.chat_loaders.base import BaseChatLoader, ChatSession
+from langchain.schema.messages import HumanMessage
+
+# Name the logger after the module (not its file path) so it takes part in
+# the package's logger hierarchy, as the other chat loader modules do.
+logger = logging.getLogger(__name__)
+
+
+class SingleFileFacebookMessengerChatLoader(BaseChatLoader):
+    """A chat loader for loading Facebook Messenger chat data from a single file.
+
+    Args:
+        path (Union[Path, str]): The path to the chat file.
+
+    Attributes:
+        file_path (Path): The path to the chat file.
+
+    """
+
+    def __init__(self, path: Union[Path, str]) -> None:
+        super().__init__()
+        self.file_path = path if isinstance(path, Path) else Path(path)
+
+    def lazy_load(self) -> Iterator[ChatSession]:
+        """Lazy loads the chat data from the file.
+
+        Every entry of the file's ``"messages"`` list becomes a
+        ``HumanMessage`` whose ``additional_kwargs["sender"]`` holds the
+        entry's ``sender_name``. Messages are ordered by ascending
+        ``timestamp_ms``.
+
+        Yields:
+            ChatSession: A chat session containing the loaded messages.
+
+        Raises:
+            KeyError: If the file has no ``"messages"`` key, or an entry lacks
+                ``timestamp_ms``, ``content`` or ``sender_name``.
+
+        """
+        # JSON is UTF-8 by specification; do not depend on the locale's
+        # default encoding, which differs between platforms.
+        with open(self.file_path, encoding="utf-8") as f:
+            data = json.load(f)
+        sorted_data = sorted(data["messages"], key=lambda x: x["timestamp_ms"])
+        messages = [
+            HumanMessage(
+                content=m["content"], additional_kwargs={"sender": m["sender_name"]}
+            )
+            for m in sorted_data
+        ]
+        yield ChatSession(messages=messages)
+
+
+class FolderFacebookMessengerChatLoader(BaseChatLoader):
+    """Chat loader that reads every Facebook Messenger chat file under a folder.
+
+    Args:
+        path (Union[str, Path]): Directory that contains the ``inbox``
+            folder whose sub-directories hold the chat ``.json`` files.
+
+    Attributes:
+        directory_path (Path): The directory passed as ``path``.
+
+    """
+
+    def __init__(self, path: Union[str, Path]) -> None:
+        super().__init__()
+        if isinstance(path, str):
+            path = Path(path)
+        self.directory_path = path
+
+    def lazy_load(self) -> Iterator[ChatSession]:
+        """Lazily load the chat sessions found below ``<path>/inbox``.
+
+        Each ``.json`` file located directly inside a sub-directory of
+        ``inbox`` is handed to ``SingleFileFacebookMessengerChatLoader``.
+
+        Yields:
+            ChatSession: A chat session containing the loaded messages.
+
+        """
+        for conversation_dir in (self.directory_path / "inbox").iterdir():
+            if not conversation_dir.is_dir():
+                continue
+            for candidate in conversation_dir.iterdir():
+                if candidate.suffix.lower() != ".json":
+                    continue
+                yield from SingleFileFacebookMessengerChatLoader(
+                    path=candidate
+                ).lazy_load()
diff --git a/libs/langchain/langchain/chat_loaders/slack.py b/libs/langchain/langchain/chat_loaders/slack.py
new file mode 100644
index 000000000..261289bb4
--- /dev/null
+++ b/libs/langchain/langchain/chat_loaders/slack.py
@@ -0,0 +1,84 @@
+import json
+import logging
+import re
+import zipfile
+from pathlib import Path
+from typing import Dict, Iterator, List, Union
+
+from langchain import schema
+from langchain.chat_loaders import base as chat_loaders
+
+logger = logging.getLogger(__name__)
+
+
+class SlackChatLoader(chat_loaders.BaseChatLoader):
+    """Load chat sessions from an exported Slack dump zip file.
+
+    Every ``.json`` member of the archive is turned into one chat session.
+    """
+
+    # Channel-join notices carry no conversational content and are skipped.
+    # Compiled once here instead of once per message inside the loop.
+    _SKIP_PATTERN = re.compile(
+        r"<@U\d+> has joined the channel", flags=re.IGNORECASE
+    )
+
+    def __init__(
+        self,
+        path: Union[str, Path],
+    ):
+        """
+        Initialize the chat loader with the path to the exported Slack dump zip file.
+
+        :param path: Path to the exported Slack dump zip file.
+        :raises FileNotFoundError: If ``path`` does not exist.
+        """
+        self.zip_path = path if isinstance(path, Path) else Path(path)
+        if not self.zip_path.exists():
+            raise FileNotFoundError(f"File {self.zip_path} not found")
+
+    def _load_single_chat_session(
+        self, messages: List[Dict]
+    ) -> chat_loaders.ChatSession:
+        """Convert the raw message entries of one file into a chat session.
+
+        Non-dict entries, entries without a ``user`` and channel-join notices
+        are skipped. Consecutive messages from the same user are merged into
+        one message, joined by a blank line, keeping one ``events`` entry per
+        original message.
+
+        :param messages: Parsed JSON entries of one exported file.
+        :return: Chat session holding the resulting ``HumanMessage`` objects.
+        """
+        results: List[Union[schema.AIMessage, schema.HumanMessage]] = []
+        previous_sender = None
+        for message in messages:
+            if not isinstance(message, dict):
+                continue
+            text = message.get("text", "")
+            timestamp = message.get("ts", "")
+            sender = message.get("user", "")
+            if not sender:
+                continue
+            if self._SKIP_PATTERN.match(text):
+                continue
+            if sender == previous_sender:
+                # Same sender as the previous kept message: extend that run.
+                results[-1].content += "\n\n" + text
+                results[-1].additional_kwargs["events"].append(
+                    {"message_time": timestamp}
+                )
+            else:
+                results.append(
+                    schema.HumanMessage(
+                        role=sender,
+                        content=text,
+                        additional_kwargs={
+                            "sender": sender,
+                            "events": [{"message_time": timestamp}],
+                        },
+                    )
+                )
+            previous_sender = sender
+        return chat_loaders.ChatSession(messages=results)
+
+    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
+        """Read JSON data from a zip subfile.
+
+        :param zip_file: Open archive to read from.
+        :param file_path: Name of the member inside the archive.
+        :return: The parsed JSON document.
+        :raises ValueError: If the document's top level is not a list.
+        """
+        with zip_file.open(file_path, "r") as f:
+            data = json.load(f)
+        if not isinstance(data, list):
+            raise ValueError(f"Expected list of dictionaries, got {type(data)}")
+        return data
+
+    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
+        """
+        Lazy load the chat sessions from the Slack dump file and yield them
+        in the required format.
+
+        :return: Iterator of chat sessions containing messages.
+        """
+        with zipfile.ZipFile(str(self.zip_path), "r") as zip_file:
+            for file_path in zip_file.namelist():
+                if file_path.endswith(".json"):
+                    messages = self._read_json(zip_file, file_path)
+                    yield self._load_single_chat_session(messages)
diff --git a/libs/langchain/langchain/chat_loaders/telegram.py b/libs/langchain/langchain/chat_loaders/telegram.py
new file mode 100644
index 000000000..786dad727
--- /dev/null
+++ b/libs/langchain/langchain/chat_loaders/telegram.py
@@ -0,0 +1,152 @@
+import json
+import logging
+import os
+import zipfile
+from pathlib import Path
+from typing import Iterator, List, Union
+
+from langchain import schema
+from langchain.chat_loaders import base as chat_loaders
+
+logger = logging.getLogger(__name__)
+
+
+class TelegramChatLoader(chat_loaders.BaseChatLoader):
+    """A loading utility for converting telegram conversations
+    to LangChain chat messages.
+
+    To export, use the Telegram Desktop app from
+    https://desktop.telegram.org/, select a conversation, click the three dots
+    in the top right corner, and select "Export chat history". Then select
+    "Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
+    the desktop app (like "Telegram for MacOS") do not support exporting chat
+    history.
+    """
+
+    def __init__(
+        self,
+        path: Union[str, Path],
+    ):
+        """Initialize the TelegramChatLoader.
+
+        Args:
+            path (Union[str, Path]): Path to the exported Telegram chat zip,
+                directory, json, or HTML file.
+        """
+        self.path = path if isinstance(path, str) else str(path)
+
+    def _load_single_chat_session_html(
+        self, file_path: str
+    ) -> chat_loaders.ChatSession:
+        """Load a single chat session from an HTML file.
+
+        A message element without a sender name is attributed to the sender
+        of the previous message; if there is no previous sender it is skipped.
+
+        Args:
+            file_path (str): Path to the HTML file.
+
+        Returns:
+            chat_loaders.ChatSession: The loaded chat session.
+
+        Raises:
+            ImportError: If the ``beautifulsoup4`` package is not installed.
+        """
+        try:
+            from bs4 import BeautifulSoup
+        except ImportError:
+            raise ImportError(
+                "Please install the 'beautifulsoup4' package to load"
+                " Telegram HTML files. You can do this by running"
+                " 'pip install beautifulsoup4' in your terminal."
+            )
+        with open(file_path, "r", encoding="utf-8") as file:
+            soup = BeautifulSoup(file, "html.parser")
+
+        results: List[Union[schema.HumanMessage, schema.AIMessage]] = []
+        previous_sender = None
+        for message in soup.select(".message.default"):
+            from_name_element = message.select_one(".from_name")
+            if from_name_element is None and previous_sender is None:
+                logger.debug("from_name not found in message")
+                continue
+            elif from_name_element is None:
+                from_name = previous_sender
+            else:
+                from_name = from_name_element.text.strip()
+            # select_one() returns None when nothing matches. Fall back to ""
+            # (the same default the JSON loader uses) instead of failing on a
+            # message element that has no date or no text node.
+            timestamp_element = message.select_one(".pull_right.date.details")
+            timestamp = (
+                timestamp_element.get("title", "")
+                if timestamp_element is not None
+                else ""
+            )
+            text_element = message.select_one(".text")
+            text = text_element.text.strip() if text_element is not None else ""
+            results.append(
+                schema.HumanMessage(
+                    content=text,
+                    additional_kwargs={
+                        "sender": from_name,
+                        "events": [{"message_time": timestamp}],
+                    },
+                )
+            )
+            previous_sender = from_name
+
+        return chat_loaders.ChatSession(messages=results)
+
+    def _load_single_chat_session_json(
+        self, file_path: str
+    ) -> chat_loaders.ChatSession:
+        """Load a single chat session from a JSON file.
+
+        Reads the ``text``, ``date`` and ``from`` fields of every entry in the
+        document's ``messages`` list; a missing field defaults to ``""``.
+
+        Args:
+            file_path (str): Path to the JSON file.
+
+        Returns:
+            chat_loaders.ChatSession: The loaded chat session.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            data = json.load(file)
+
+        messages = data.get("messages", [])
+        results: List[schema.BaseMessage] = []
+        for message in messages:
+            text = message.get("text", "")
+            timestamp = message.get("date", "")
+            from_name = message.get("from", "")
+
+            results.append(
+                schema.HumanMessage(
+                    content=text,
+                    additional_kwargs={
+                        "sender": from_name,
+                        "events": [{"message_time": timestamp}],
+                    },
+                )
+            )
+
+        return chat_loaders.ChatSession(messages=results)
+
+    def _iterate_files(self, path: str) -> Iterator[str]:
+        """Iterate over files in a directory or zip file.
+
+        Only ``.html`` and ``.json`` files are yielded.
+
+        Args:
+            path (str): Path to the directory or zip file.
+
+        Yields:
+            str: Path to each file.
+        """
+        if os.path.isfile(path) and path.endswith((".html", ".json")):
+            yield path
+        elif os.path.isdir(path):
+            for root, _, files in os.walk(path):
+                for file in files:
+                    if file.endswith((".html", ".json")):
+                        yield os.path.join(root, file)
+        elif zipfile.is_zipfile(path):
+            with zipfile.ZipFile(path) as zip_file:
+                for file in zip_file.namelist():
+                    if file.endswith((".html", ".json")):
+                        # NOTE: extract() without a target path writes the
+                        # member into the current working directory.
+                        yield zip_file.extract(file)
+
+    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
+        """Lazy load the messages from the chat file and yield them
+        in as chat sessions.
+
+        Yields:
+            chat_loaders.ChatSession: The loaded chat session.
+        """
+        for file_path in self._iterate_files(self.path):
+            if file_path.endswith(".html"):
+                yield self._load_single_chat_session_html(file_path)
+            elif file_path.endswith(".json"):
+                yield self._load_single_chat_session_json(file_path)
diff --git a/libs/langchain/langchain/chat_loaders/utils.py b/libs/langchain/langchain/chat_loaders/utils.py
new file mode 100644
index 000000000..da41cfd23
--- /dev/null
+++ b/libs/langchain/langchain/chat_loaders/utils.py
@@ -0,0 +1,86 @@
+"""Utilities for chat loaders."""
+from copy import deepcopy
+from typing import Iterable, Iterator, List
+
+from langchain import schema
+from langchain.chat_loaders.base import ChatSession
+from langchain.schema.messages import BaseMessage
+
+
+def merge_chat_runs_in_session(
+    chat_session: ChatSession, delimiter: str = "\n\n"
+) -> ChatSession:
+    """Merge chat runs together in a chat session.
+
+    A chat run is a sequence of messages from the same sender.
+
+    Args:
+        chat_session: A chat session.
+        delimiter: Text placed between the contents of merged messages.
+
+    Returns:
+        A chat session with merged chat runs. Its messages are deep copies,
+        so the messages of ``chat_session`` are left untouched.
+    """
+    messages: List[BaseMessage] = []
+    for message in chat_session["messages"]:
+        previous = messages[-1] if messages else None
+        if (
+            previous is not None
+            and isinstance(message, type(previous))
+            and previous.additional_kwargs.get("sender") is not None
+            and previous.additional_kwargs["sender"]
+            == message.additional_kwargs.get("sender")
+        ):
+            previous.content = (previous.content + delimiter + message.content).strip()
+            new_events = message.additional_kwargs.get("events") or []
+            if new_events:
+                # Store the combined list explicitly: extending the result of
+                # ``.get("events", [])`` would discard the new events whenever
+                # the first message of the run has no "events" entry.
+                previous.additional_kwargs["events"] = [
+                    *(previous.additional_kwargs.get("events") or []),
+                    *deepcopy(new_events),
+                ]
+        else:
+            messages.append(deepcopy(message))
+    return ChatSession(messages=messages)
+
+
+def merge_chat_runs(chat_sessions: Iterable[ChatSession]) -> Iterator[ChatSession]:
+    """Merge the chat runs of every session in an iterable of sessions.
+
+    A chat run is a sequence of messages from the same sender.
+
+    Args:
+        chat_sessions: The chat sessions to process.
+
+    Yields:
+        One chat session with merged chat runs per input session, produced
+        lazily in input order.
+    """
+    yield from map(merge_chat_runs_in_session, chat_sessions)
+
+
+def map_ai_messages_in_session(chat_sessions: ChatSession, sender: str) -> ChatSession:
+    """Convert messages from the specified 'sender' to AI messages.
+
+    This is useful for fine-tuning the AI to adapt to your voice.
+
+    Args:
+        chat_sessions: A single chat session (the plural name is kept so
+            existing keyword callers continue to work).
+        sender: Value of ``additional_kwargs["sender"]`` that identifies the
+            messages to convert.
+
+    Returns:
+        A new chat session in which matching messages are replaced by
+        ``AIMessage`` objects; all other messages are passed through as is.
+    """
+    messages = []
+    for message in chat_sessions["messages"]:
+        if message.additional_kwargs.get("sender") == sender:
+            message = schema.AIMessage(
+                content=message.content,
+                additional_kwargs=message.additional_kwargs.copy(),
+                # ``example`` is a boolean flag; fall back to False (not None)
+                # for message types that do not define it.
+                example=getattr(message, "example", False),
+            )
+        messages.append(message)
+    return ChatSession(messages=messages)
+
+
+def map_ai_messages(
+    chat_sessions: Iterable[ChatSession], sender: str
+) -> Iterator[ChatSession]:
+    """Turn the given sender's messages into AI messages in every session.
+
+    This is useful for fine-tuning the AI to adapt to your voice.
+
+    Args:
+        chat_sessions: The chat sessions to process.
+        sender: Sender whose messages are converted.
+
+    Yields:
+        One converted chat session per input session, produced lazily in
+        input order.
+    """
+    yield from (
+        map_ai_messages_in_session(session, sender) for session in chat_sessions
+    )
diff --git a/libs/langchain/langchain/chat_loaders/whatsapp.py b/libs/langchain/langchain/chat_loaders/whatsapp.py
new file mode 100644
index 000000000..c911e262c
--- /dev/null
+++ b/libs/langchain/langchain/chat_loaders/whatsapp.py
@@ -0,0 +1,116 @@
+import logging
+import os
+import re
+import zipfile
+from typing import Iterator, List, Union
+
+from langchain import schema
+from langchain.chat_loaders import base as chat_loaders
+from langchain.schema import messages
+
+logger = logging.getLogger(__name__)
+
+
+class WhatsAppChatLoader(chat_loaders.BaseChatLoader):
+    """Load exported WhatsApp chats as LangChain chat sessions."""
+
+    def __init__(self, path: str):
+        """Initialize the WhatsAppChatLoader.
+
+        Args:
+            path (str): Path to the exported WhatsApp chat
+                zip directory, folder, or file.
+
+        To generate the dump, open the chat, click the three dots in the top
+        right corner, and select "More". Then select "Export chat" and
+        choose "Without media".
+        """
+        self.path = path
+        # Message bodies that carry no conversational content. No entry may
+        # be empty: an empty alternative matches (and so drops) every message.
+        ignore_lines = [
+            "This message was deleted",
+            "<Media omitted>",
+            "image omitted",
+            "Messages and calls are end-to-end encrypted. No one outside of this chat,"
+            " not even WhatsApp, can read or listen to them.",
+        ]
+        # re.escape keeps the phrases literal (e.g. "." must not match any
+        # character); each phrase may be preceded by left-to-right marks.
+        self._ignore_lines = re.compile(
+            r"("
+            + "|".join([r"\u200E*" + re.escape(line) for line in ignore_lines])
+            + r")",
+            flags=re.IGNORECASE,
+        )
+        # Groups: (1) timestamp, (2) sender, (3) message text.
+        self._message_line_regex = re.compile(
+            r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)",  # noqa
+            flags=re.IGNORECASE,
+        )
+
+    def _load_single_chat_session(self, file_path: str) -> chat_loaders.ChatSession:
+        """Load a single chat session from a file.
+
+        Lines that do not start a new message are appended (space separated)
+        to the message before them. Messages whose text starts with one of
+        the ignored placeholder phrases are dropped.
+
+        Args:
+            file_path (str): Path to the chat file.
+
+        Returns:
+            ChatSession: The loaded chat session.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            txt = file.read()
+
+        # Split messages by newlines, but keep multi-line messages grouped
+        chat_lines: List[str] = []
+        current_message = ""
+        for line in txt.split("\n"):
+            if self._message_line_regex.match(line):
+                if current_message:
+                    chat_lines.append(current_message)
+                current_message = line
+            else:
+                current_message += " " + line.strip()
+        if current_message:
+            chat_lines.append(current_message)
+        results: List[Union[messages.HumanMessage, messages.AIMessage]] = []
+        for line in chat_lines:
+            result = self._message_line_regex.match(line.strip())
+            if result:
+                timestamp, sender, text = result.groups()
+                if not self._ignore_lines.match(text.strip()):
+                    results.append(
+                        schema.HumanMessage(
+                            role=sender,
+                            content=text,
+                            additional_kwargs={
+                                "sender": sender,
+                                "events": [{"message_time": timestamp}],
+                            },
+                        )
+                    )
+            else:
+                logger.debug(f"Could not parse line: {line}")
+        return chat_loaders.ChatSession(messages=results)
+
+    def _iterate_files(self, path: str) -> Iterator[str]:
+        """Iterate over the files in a directory or zip file.
+
+        For a directory or zip file only ``.txt`` files are yielded; any
+        other path is yielded unchanged.
+
+        Args:
+            path (str): Path to the directory or zip file.
+
+        Yields:
+            str: The path to each file.
+        """
+        if os.path.isdir(path):
+            for root, _, files in os.walk(path):
+                for file in files:
+                    if file.endswith(".txt"):
+                        yield os.path.join(root, file)
+        # The zip check must precede the plain-file case: a zip archive is a
+        # regular file as well and would otherwise be read as chat text.
+        elif zipfile.is_zipfile(path):
+            with zipfile.ZipFile(path) as zip_file:
+                for file in zip_file.namelist():
+                    if file.endswith(".txt"):
+                        # NOTE: extract() without a target path writes the
+                        # member into the current working directory.
+                        yield zip_file.extract(file)
+        else:
+            # A single chat file. A missing path is passed on as well so that
+            # opening it raises FileNotFoundError instead of yielding nothing.
+            yield path
+
+    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
+        """Lazy load the messages from the chat file and yield
+        them as chat sessions.
+
+        ``path`` may be a single chat file, a directory or a zip file; one
+        chat session is produced per chat file found.
+
+        Yields:
+            Iterator[ChatSession]: The loaded chat sessions.
+        """
+        for file_path in self._iterate_files(self.path):
+            yield self._load_single_chat_session(file_path)
diff --git a/libs/langchain/tests/unit_tests/chat_loaders/__init__.py b/libs/langchain/tests/unit_tests/chat_loaders/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_html/messages.html b/libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_html/messages.html
new file mode 100644
index 000000000..a90ba5449
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_html/messages.html
@@ -0,0 +1,166 @@
+
+
+
+
+
+
+Exported Data
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Jimmeny Marvelton
+
+
+
+
+
+
+
+
+
+
+
+
+
+23 August 2023
+
+
+
+
+
+
+
+
+
+
+
+bA
+
+
+
+
+
+
+
+
+
+13:11
+
+
+
+Jimmeny Marvelton
+
+
+
+i refuse to converse with you
+
+
+
+
+
+
+
+
+
+
+
+
+
+WF
+
+
+
+
+
+
+
+
+
+13:13
+
+
+
+ Batman & Robin
+
+
+
+Hi nemesis
+
+
+
+
+
+
+
+
+
+
+
+
+
+bA
+
+
+
+
+
+
+
+
+
+13:15
+
+
+
+Jimmeny Marvelton
+
+
+
+we meet again
+
+
+
+
+
+
+
+
+
+
+
+13:15
+
+
+
+you will not trick me this time
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_json/result.json b/libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_json/result.json
new file mode 100644
index 000000000..e40d6a1f4
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/chat_loaders/data/telegram_chat_json/result.json
@@ -0,0 +1,67 @@
+{
+ "name": "Jimmeny",
+ "type": "personal_chat",
+ "id": 5965280513,
+ "messages": [
+ {
+ "id": 1,
+ "type": "message",
+ "date": "2023-08-23T13:11:23",
+ "date_unixtime": "1692821483",
+ "from": "Jimmeny Marvelton",
+ "from_id": "user123450513",
+ "text": "i refuse to converse with you",
+ "text_entities": [
+ {
+ "type": "plain",
+ "text": "i refuse to converse with you"
+ }
+ ]
+ },
+ {
+ "id": 2,
+ "type": "message",
+ "date": "2023-08-23T13:13:20",
+ "date_unixtime": "1692821600",
+ "from": "Batman & Robin",
+ "from_id": "user6565661032",
+ "text": "Hi nemesis",
+ "text_entities": [
+ {
+ "type": "plain",
+ "text": "Hi nemesis"
+ }
+ ]
+ },
+ {
+ "id": 3,
+ "type": "message",
+ "date": "2023-08-23T13:15:35",
+ "date_unixtime": "1692821735",
+ "from": "Jimmeny Marvelton",
+ "from_id": "user123450513",
+ "text": "we meet again",
+ "text_entities": [
+ {
+ "type": "plain",
+ "text": "we meet again"
+ }
+ ]
+ },
+ {
+ "id": 4,
+ "type": "message",
+ "date": "2023-08-23T13:15:53",
+ "date_unixtime": "1692821753",
+ "from": "Jimmeny Marvelton",
+ "from_id": "user123450513",
+ "text": "you will not trick me this time",
+ "text_entities": [
+ {
+ "type": "plain",
+ "text": "you will not trick me this time"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/libs/langchain/tests/unit_tests/chat_loaders/data/whatsapp_chat.txt b/libs/langchain/tests/unit_tests/chat_loaders/data/whatsapp_chat.txt
new file mode 100644
index 000000000..54056c435
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/chat_loaders/data/whatsapp_chat.txt
@@ -0,0 +1,12 @@
+[8/15/23, 9:12:33 AM] Dr. Feather: Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.
+[8/15/23, 9:12:43 AM] Dr. Feather: I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!
+[8/15/23, 9:12:48 AM] Dr. Feather: image omitted
+[8/15/23, 9:13:15 AM] Jungle Jane: That's stunning! Were you able to observe its behavior?
+[8/15/23, 9:13:23 AM] Dr. Feather: image omitted
+[8/15/23, 9:14:02 AM] Dr. Feather: Yes, it seemed quite social with other macaws. They're known for their playful nature.
+[8/15/23, 9:14:15 AM] Jungle Jane: How's the research going on parrot communication?
+[8/15/23, 9:14:30 AM] Dr. Feather: image omitted
+[8/15/23, 9:14:50 AM] Dr. Feather: It's progressing well. We're learning so much about how they use sound and color to communicate.
+[8/15/23, 9:15:10 AM] Jungle Jane: That's fascinating! Can't wait to read your paper on it.
+[8/15/23, 9:15:20 AM] Dr. Feather: Thank you! I'll send you a draft soon.
+[8/15/23, 9:25:16 PM] Jungle Jane: Looking forward to it! Keep up the great work.
diff --git a/libs/langchain/tests/unit_tests/chat_loaders/test_slack.py b/libs/langchain/tests/unit_tests/chat_loaders/test_slack.py
new file mode 100644
index 000000000..cdf569d60
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/chat_loaders/test_slack.py
@@ -0,0 +1,24 @@
+import pathlib
+
+from langchain.chat_loaders import slack, utils
+
+
+def test_slack_chat_loader() -> None:
+    """Load the example Slack export and check the second session's content."""
+    tests_root = pathlib.Path(__file__).parents[2]
+    export_zip = tests_root / "integration_tests" / "examples" / "slack_export.zip"
+    loader = slack.SlackChatLoader(str(export_zip))
+
+    mapped = utils.map_ai_messages(loader.lazy_load(), sender="U0500003428")
+    chat_sessions = list(mapped)
+    assert chat_sessions, "Chat sessions should not be empty"
+
+    second_session_messages = chat_sessions[1]["messages"]
+    assert second_session_messages, "Chat messages should not be empty"
+
+    first_message = second_session_messages[0]
+    assert "Example message" in first_message.content, "Chat content mismatch"
diff --git a/libs/langchain/tests/unit_tests/chat_loaders/test_telegram.py b/libs/langchain/tests/unit_tests/chat_loaders/test_telegram.py
new file mode 100644
index 000000000..7984adc19
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/chat_loaders/test_telegram.py
@@ -0,0 +1,97 @@
+"""Test the telegram chat loader."""
+import pathlib
+import tempfile
+import zipfile
+from typing import Sequence
+
+import pytest
+
+from langchain import schema
+from langchain.chat_loaders import telegram, utils
+
+
+def _assert_messages_are_equal(
+    actual_messages: Sequence[schema.BaseMessage],
+    expected_messages: Sequence[schema.BaseMessage],
+) -> None:
+    """Check that both message sequences match in length, content and sender."""
+    assert len(actual_messages) == len(expected_messages)
+    for got, want in zip(actual_messages, expected_messages):
+        assert got.content == want.content
+        got_sender = got.additional_kwargs["sender"]
+        want_sender = want.additional_kwargs["sender"]
+        assert got_sender == want_sender
+
+
+def _check_telegram_chat_loader(path: str) -> None:
+ """Load a Telegram export fixture and check the parsed chat session.
+
+ ``path`` is resolved relative to the ``data`` directory next to this file.
+ When it ends with ``.zip``, a zip archive is first built in a temporary
+ directory from the fixture directory of the same name (without the
+ ``.zip`` suffix), and that archive is loaded instead.
+ """
+ _data_dir = pathlib.Path(__file__).parent / "data"
+ source_path = _data_dir / path
+ # Work in a temp directory so a generated archive is cleaned up afterwards
+ with tempfile.TemporaryDirectory() as temp_dir_:
+ temp_dir = pathlib.Path(temp_dir_)
+ if path.endswith(".zip"):
+ # Make a new zip file holding every entry of the fixture directory,
+ # stored flat under its own file name
+ zip_path = temp_dir / "telegram_chat.zip"
+ with zipfile.ZipFile(zip_path, "w") as zip_file:
+ original_path = _data_dir / path.replace(".zip", "")
+ for file_path in original_path.iterdir():
+ zip_file.write(file_path, arcname=file_path.name)
+ source_path = zip_path
+ loader = telegram.TelegramChatLoader(str(source_path))
+ chat_sessions_ = loader.lazy_load()
+ # NOTE(review): presumably merges consecutive messages of one sender; the
+ # third expected message below joins two fixture messages with "\n\n".
+ chat_sessions_ = utils.merge_chat_runs(chat_sessions_)
+ chat_sessions = list(
+ utils.map_ai_messages(chat_sessions_, sender="Batman & Robin")
+ )
+ assert len(chat_sessions) == 1
+ session = chat_sessions[0]
+ assert len(session["messages"]) > 0
+ assert session["messages"][0].content == "i refuse to converse with you"
+ # Only ``content`` and the ``sender`` kwarg are compared by
+ # ``_assert_messages_are_equal``; the ``events`` values are not checked.
+ expected_content = [
+ schema.HumanMessage(
+ content="i refuse to converse with you",
+ additional_kwargs={
+ "sender": "Jimmeny Marvelton",
+ "events": [{"message_time": "23.08.2023 13:11:23 UTC-08:00"}],
+ },
+ ),
+ schema.AIMessage(
+ content="Hi nemesis",
+ additional_kwargs={
+ "sender": "Batman & Robin",
+ "events": [{"message_time": "23.08.2023 13:13:20 UTC-08:00"}],
+ },
+ ),
+ schema.HumanMessage(
+ content="we meet again\n\nyou will not trick me this time",
+ additional_kwargs={
+ "sender": "Jimmeny Marvelton",
+ "events": [{"message_time": "23.08.2023 13:15:35 UTC-08:00"}],
+ },
+ ),
+ ]
+ _assert_messages_are_equal(session["messages"], expected_content)
+
+
+@pytest.mark.parametrize(
+    "path",
+    (
+        "telegram_chat_json",
+        "telegram_chat_json.zip",
+        "telegram_chat_json/result.json",
+    ),
+)
+def test_telegram_chat_loader(path: str) -> None:
+    """Check the JSON export loaded from a directory, a zip archive and a file."""
+    _check_telegram_chat_loader(path=path)
+
+
+@pytest.mark.skip(reason="requires bs4 but marking it as such doesn't seem to work")
+@pytest.mark.parametrize(
+    "path",
+    [
+        # Use the HTML export fixture; the JSON fixture is already covered by
+        # ``test_telegram_chat_loader``.
+        "telegram_chat_html",
+        "telegram_chat_html.zip",
+        "telegram_chat_html/messages.html",
+    ],
+)
+def test_telegram_chat_loader_html(path: str) -> None:
+    """Check the HTML export loaded from a directory, a zip archive and a file."""
+    _check_telegram_chat_loader(path)
diff --git a/libs/langchain/tests/unit_tests/chat_loaders/test_whatsapp.py b/libs/langchain/tests/unit_tests/chat_loaders/test_whatsapp.py
new file mode 100644
index 000000000..9263a80b1
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/chat_loaders/test_whatsapp.py
@@ -0,0 +1,20 @@
+import pathlib
+
+from langchain.chat_loaders import utils, whatsapp
+
+
+def test_whatsapp_chat_loader() -> None:
+    """Load the sample WhatsApp export and check the first parsed message."""
+    data_file = pathlib.Path(__file__).parent / "data" / "whatsapp_chat.txt"
+    loader = whatsapp.WhatsAppChatLoader(str(data_file))
+
+    mapped = utils.map_ai_messages(loader.lazy_load(), sender="Dr. Feather")
+    chat_sessions = list(mapped)
+    assert chat_sessions, "Chat sessions should not be empty"
+
+    messages = chat_sessions[0]["messages"]
+    assert messages, "Chat messages should not be empty"
+
+    expected_text = (
+        "I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest."
+        " Such a magnificent creature!"
+    )
+    assert expected_text in messages[0].content, "Chat content mismatch"
From dc30edf51ca7f53d8786276326907597880674cd Mon Sep 17 00:00:00 2001
From: Tudor Golubenco
Date: Fri, 25 Aug 2023 01:37:46 +0100
Subject: [PATCH 119/143] Xata as a chat message memory store (#9719)
This adds Xata as a memory store also to the python version of
LangChain, similar to the [one for
LangChain.js](https://github.com/hwchase17/langchainjs/pull/2217).
I have added a Jupyter Notebook with a simple and a more complex example
using an agent.
To run the integration test, you need to execute something like:
```
XATA_API_KEY='xau_...' XATA_DB_URL="https://demo-uni3q8.eu-west-1.xata.sh/db/langchain" poetry run pytest tests/integration_tests/memory/test_xata.py
```
Where `langchain` is the database you create in Xata.
---
.../memory/xata_chat_message_history.ipynb | 326 ++++++++++++++++++
libs/langchain/langchain/memory/__init__.py | 2 +
.../memory/chat_message_histories/__init__.py | 2 +
.../memory/chat_message_histories/xata.py | 132 +++++++
.../integration_tests/memory/test_xata.py | 41 +++
5 files changed, 503 insertions(+)
create mode 100644 docs/extras/integrations/memory/xata_chat_message_history.ipynb
create mode 100644 libs/langchain/langchain/memory/chat_message_histories/xata.py
create mode 100644 libs/langchain/tests/integration_tests/memory/test_xata.py
diff --git a/docs/extras/integrations/memory/xata_chat_message_history.ipynb b/docs/extras/integrations/memory/xata_chat_message_history.ipynb
new file mode 100644
index 000000000..938f6c44b
--- /dev/null
+++ b/docs/extras/integrations/memory/xata_chat_message_history.ipynb
@@ -0,0 +1,326 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Xata chat memory\n",
+ "\n",
+ "[Xata](https://xata.io) is a serverless data platform, based on PostgreSQL and Elasticsearch. It provides a Python SDK for interacting with your database, and a UI for managing your data. With the `XataChatMessageHistory` class, you can use Xata databases for longer-term persistence of chat sessions.\n",
+ "\n",
+ "This notebook covers:\n",
+ "\n",
+ "* A simple example showing what `XataChatMessageHistory` does.\n",
+ "* A more complex example using a REACT agent that answers questions based on a knowledge base or documentation (stored in Xata as a vector store) and also has a long-term searchable history of its past messages (stored in Xata as a memory store)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup\n",
+ "\n",
+ "### Create a database\n",
+ "\n",
+ "In the [Xata UI](https://app.xata.io) create a new database. You can name it whatever you want; in this notebook we'll use `langchain`. The LangChain integration can auto-create the table used for storing the memory, and this is what we'll use in this example. If you want to pre-create the table, ensure it has the right schema and set `create_table` to `False` when creating the class. Pre-creating the table saves one round-trip to the database during each session initialization."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's first install our dependencies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install xata==1.0.0rc0 openai langchain"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next, we need to get the environment variables for Xata. You can create a new API key by visiting your [account settings](https://app.xata.io/settings). To find the database URL, go to the Settings page of the database that you have created. The database URL should look something like this: `https://demo-uni3q8.eu-west-1.xata.sh/db/langchain`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import getpass\n",
+ "\n",
+ "api_key = getpass.getpass(\"Xata API key: \")\n",
+ "db_url = input(\"Xata database URL (copy it from your DB settings):\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create a simple memory store\n",
+ "\n",
+ "To test the memory store functionality in isolation, let's use the following code snippet:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.memory import XataChatMessageHistory\n",
+ "\n",
+ "history = XataChatMessageHistory(\n",
+ " session_id=\"session-1\",\n",
+ " api_key=api_key,\n",
+ " db_url=db_url,\n",
+ " table_name=\"memory\"\n",
+ ")\n",
+ "\n",
+ "history.add_user_message(\"hi!\")\n",
+ "\n",
+ "history.add_ai_message(\"whats up?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The above code creates a session with the ID `session-1` and stores two messages in it. After running the above, if you visit the Xata UI, you should see a table named `memory` and the two messages added to it.\n",
+ "\n",
+ "You can retrieve the message history for a particular session with the following code:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history.messages"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Conversational Q&A chain on your data with memory\n",
+ "\n",
+ "Let's now see a more complex example in which we combine OpenAI, the Xata Vector Store integration, and the Xata memory store integration to create a Q&A chat bot on your data, with follow-up questions and history."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We're going to need to access the OpenAI API, so let's configure the API key:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To store the documents that the chatbot will search for answers, add a table named `docs` to your `langchain` database using the Xata UI, and add the following columns:\n",
+ "\n",
+ "* `content` of type \"Text\". This is used to store the `Document.page_content` values.\n",
+ "* `embedding` of type \"Vector\". Use the dimension used by the model you plan to use. In this notebook we use OpenAI embeddings, which have 1536 dimensions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's create the vector store and add some sample docs to it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+ "from langchain.vectorstores.xata import XataVectorStore\n",
+ "\n",
+ "embeddings = OpenAIEmbeddings()\n",
+ "\n",
+ "texts = [\n",
+ " \"Xata is a Serverless Data platform based on PostgreSQL\",\n",
+ " \"Xata offers a built-in vector type that can be used to store and query vectors\",\n",
+ " \"Xata includes similarity search\"\n",
+ "]\n",
+ "\n",
+ "vector_store = XataVectorStore.from_texts(texts, embeddings, api_key=api_key, db_url=db_url, table_name=\"docs\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After running the above command, if you go to the Xata UI, you should see the documents loaded together with their embeddings in the `docs` table."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's now create a ConversationBufferMemory to store the chat messages from both the user and the AI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.memory import ConversationBufferMemory\n",
+ "from uuid import uuid4\n",
+ "\n",
+ "chat_memory = XataChatMessageHistory(\n",
+ " session_id=str(uuid4()), # needs to be unique per user session\n",
+ " api_key=api_key,\n",
+ " db_url=db_url,\n",
+ " table_name=\"memory\"\n",
+ ")\n",
+ "memory = ConversationBufferMemory(memory_key=\"chat_history\", chat_memory=chat_memory, return_messages=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now it's time to create an Agent to use both the vector store and the chat memory together."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.agents import initialize_agent, AgentType\n",
+ "from langchain.agents.agent_toolkits import create_retriever_tool\n",
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "tool = create_retriever_tool(\n",
+ " vector_store.as_retriever(), \n",
+ " \"search_docs\",\n",
+ " \"Searches and returns documents from the Xata manual. Useful when you need to answer questions about Xata.\"\n",
+ ")\n",
+ "tools = [tool]\n",
+ "\n",
+ "llm = ChatOpenAI(temperature=0)\n",
+ "\n",
+ "agent = initialize_agent(\n",
+ " tools,\n",
+ " llm,\n",
+ " agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,\n",
+ " verbose=True,\n",
+ " memory=memory)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To test, let's tell the agent our name:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agent.run(input=\"My name is bob\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, let's ask the agent some questions about Xata:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agent.run(input=\"What is xata?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notice that it answers based on the data stored in the document store. And now, let's ask a follow up question:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agent.run(input=\"Does it support similarity search?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And now let's test its memory:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agent.run(input=\"Did I tell you my name? What is it?\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/libs/langchain/langchain/memory/__init__.py b/libs/langchain/langchain/memory/__init__.py
index e0ac6371f..f2a61ef2f 100644
--- a/libs/langchain/langchain/memory/__init__.py
+++ b/libs/langchain/langchain/memory/__init__.py
@@ -43,6 +43,7 @@ from langchain.memory.chat_message_histories import (
RedisChatMessageHistory,
SQLChatMessageHistory,
StreamlitChatMessageHistory,
+ XataChatMessageHistory,
ZepChatMessageHistory,
)
from langchain.memory.combined import CombinedMemory
@@ -90,6 +91,7 @@ __all__ = [
"SimpleMemory",
"StreamlitChatMessageHistory",
"VectorStoreRetrieverMemory",
+ "XataChatMessageHistory",
"ZepChatMessageHistory",
"ZepMemory",
]
diff --git a/libs/langchain/langchain/memory/chat_message_histories/__init__.py b/libs/langchain/langchain/memory/chat_message_histories/__init__.py
index 02241675b..ddd23de4f 100644
--- a/libs/langchain/langchain/memory/chat_message_histories/__init__.py
+++ b/libs/langchain/langchain/memory/chat_message_histories/__init__.py
@@ -17,6 +17,7 @@ from langchain.memory.chat_message_histories.sql import SQLChatMessageHistory
from langchain.memory.chat_message_histories.streamlit import (
StreamlitChatMessageHistory,
)
+from langchain.memory.chat_message_histories.xata import XataChatMessageHistory
from langchain.memory.chat_message_histories.zep import ZepChatMessageHistory
__all__ = [
@@ -33,5 +34,6 @@ __all__ = [
"RocksetChatMessageHistory",
"SQLChatMessageHistory",
"StreamlitChatMessageHistory",
+ "XataChatMessageHistory",
"ZepChatMessageHistory",
]
diff --git a/libs/langchain/langchain/memory/chat_message_histories/xata.py b/libs/langchain/langchain/memory/chat_message_histories/xata.py
new file mode 100644
index 000000000..de358888a
--- /dev/null
+++ b/libs/langchain/langchain/memory/chat_message_histories/xata.py
@@ -0,0 +1,132 @@
+import json
+from typing import List
+
+from langchain.schema import (
+ BaseChatMessageHistory,
+)
+from langchain.schema.messages import BaseMessage, _message_to_dict, messages_from_dict
+
+
+class XataChatMessageHistory(BaseChatMessageHistory):
+    """Chat message history stored in a Xata database.
+
+    Every message is stored as one record of ``table_name`` and tagged with
+    the ``session_id`` it belongs to.
+    """
+
+    def __init__(
+        self,
+        session_id: str,
+        db_url: str,
+        api_key: str,
+        branch_name: str = "main",
+        table_name: str = "messages",
+        create_table: bool = True,
+    ) -> None:
+        """Initialize with Xata client.
+
+        Args:
+            session_id: Identifier stored with every message of this history.
+            db_url: URL of the Xata database.
+            api_key: Xata API key.
+            branch_name: Database branch to use.
+            table_name: Table holding the messages.
+            create_table: Whether to create the table (and its schema) when
+                it does not exist yet.
+
+        Raises:
+            ValueError: If the ``xata`` package cannot be imported.
+        """
+        try:
+            from xata.client import XataClient  # noqa: F401
+        except ImportError as e:
+            raise ValueError(
+                "Could not import xata python package. "
+                "Please install it with `pip install xata`."
+            ) from e
+        self._client = XataClient(
+            api_key=api_key, db_url=db_url, branch_name=branch_name
+        )
+        self._table_name = table_name
+        self._session_id = session_id
+
+        if create_table:
+            self._create_table_if_not_exists()
+
+    def _create_table_if_not_exists(self) -> None:
+        """Create the message table and set its schema unless it already exists.
+
+        Raises:
+            Exception: If checking, creating or configuring the table fails.
+        """
+        r = self._client.table().get_schema(self._table_name)
+        if r.status_code <= 299:
+            # The table is already there; leave its schema untouched.
+            return
+        if r.status_code != 404:
+            raise Exception(
+                f"Error checking if table exists in Xata: {r.status_code} {r}"
+            )
+        r = self._client.table().create(self._table_name)
+        if r.status_code > 299:
+            raise Exception(f"Error creating table in Xata: {r.status_code} {r}")
+        r = self._client.table().set_schema(
+            self._table_name,
+            payload={
+                "columns": [
+                    {"name": "sessionId", "type": "string"},
+                    {"name": "type", "type": "string"},
+                    {"name": "role", "type": "string"},
+                    {"name": "content", "type": "text"},
+                    {"name": "name", "type": "string"},
+                    {"name": "additionalKwargs", "type": "text"},
+                ]
+            },
+        )
+        if r.status_code > 299:
+            raise Exception(f"Error setting table schema in Xata: {r.status_code} {r}")
+
+    def add_message(self, message: BaseMessage) -> None:
+        """Append the message to the Xata table.
+
+        Raises:
+            Exception: If the record cannot be inserted.
+        """
+        msg = _message_to_dict(message)
+        r = self._client.records().insert(
+            self._table_name,
+            {
+                "sessionId": self._session_id,
+                "type": msg["type"],
+                "content": message.content,
+                # Stored as JSON text; decoded again in ``messages``.
+                "additionalKwargs": json.dumps(message.additional_kwargs),
+                "role": msg["data"].get("role"),
+                "name": msg["data"].get("name"),
+            },
+        )
+        if r.status_code > 299:
+            raise Exception(f"Error adding message to Xata: {r.status_code} {r}")
+
+    @property
+    def messages(self) -> List[BaseMessage]:  # type: ignore
+        """Return the messages of this session, sorted by creation time.
+
+        Raises:
+            Exception: If the query fails.
+        """
+        # NOTE(review): only a single query is issued; confirm whether Xata
+        # pages the result for long sessions.
+        r = self._client.data().query(
+            self._table_name,
+            payload={
+                "filter": {
+                    "sessionId": self._session_id,
+                },
+                "sort": {"xata.createdAt": "asc"},
+            },
+        )
+        if r.status_code != 200:
+            raise Exception(f"Error running query: {r.status_code} {r}")
+        msgs = messages_from_dict(
+            [
+                {
+                    "type": m["type"],
+                    "data": {
+                        "content": m["content"],
+                        "role": m.get("role"),
+                        "name": m.get("name"),
+                        # The message field is ``additional_kwargs``; only the
+                        # Xata column is named ``additionalKwargs``.
+                        "additional_kwargs": json.loads(m["additionalKwargs"]),
+                    },
+                }
+                for m in r["records"]
+            ]
+        )
+        return msgs
+
+    def clear(self) -> None:
+        """Delete session from Xata table.
+
+        Raises:
+            Exception: If querying or deleting the records fails.
+        """
+        # Each query returns one batch of ids; repeat until none are left.
+        while True:
+            r = self._client.data().query(
+                self._table_name,
+                payload={
+                    "columns": ["id"],
+                    "filter": {
+                        "sessionId": self._session_id,
+                    },
+                },
+            )
+            if r.status_code != 200:
+                raise Exception(f"Error running query: {r.status_code} {r}")
+            ids = [rec["id"] for rec in r["records"]]
+            if not ids:
+                break
+            operations = [
+                {"delete": {"table": self._table_name, "id": record_id}}
+                for record_id in ids
+            ]
+            r = self._client.records().transaction(payload={"operations": operations})
+            # Without this check a failing delete would return the same ids
+            # on every query and the loop would never end.
+            if r.status_code > 299:
+                raise Exception(
+                    f"Error deleting records from Xata: {r.status_code} {r}"
+                )
diff --git a/libs/langchain/tests/integration_tests/memory/test_xata.py b/libs/langchain/tests/integration_tests/memory/test_xata.py
new file mode 100644
index 000000000..88bd158a2
--- /dev/null
+++ b/libs/langchain/tests/integration_tests/memory/test_xata.py
@@ -0,0 +1,41 @@
+"""Test Xata chat memory store functionality.
+
+Before running this test, please create a Xata database.
+"""
+
+import json
+import os
+
+from langchain.memory import ConversationBufferMemory
+from langchain.memory.chat_message_histories import XataChatMessageHistory
+from langchain.schema.messages import _message_to_dict
+
+
+class TestXata:
+ """Integration test for ``XataChatMessageHistory`` used as chat memory."""
+
+ @classmethod
+ def setup_class(cls) -> None:
+ """Fail early when the Xata credentials are missing from the environment."""
+ assert os.getenv("XATA_API_KEY"), "XATA_API_KEY environment variable is not set"
+ assert os.getenv("XATA_DB_URL"), "XATA_DB_URL environment variable is not set"
+
+ def test_xata_chat_memory(self) -> None:
+ """Store two messages, read them back, then clear the session."""
+ message_history = XataChatMessageHistory(
+ api_key=os.getenv("XATA_API_KEY", ""),
+ db_url=os.getenv("XATA_DB_URL", ""),
+ session_id="integration-test-session",
+ )
+ memory = ConversationBufferMemory(
+ memory_key="baz", chat_memory=message_history, return_messages=True
+ )
+ # add some messages
+ memory.chat_memory.add_ai_message("This is me, the AI")
+ memory.chat_memory.add_user_message("This is me, the human")
+
+ # get the message history from the memory store and turn it into a json
+ messages = memory.chat_memory.messages
+ messages_json = json.dumps([_message_to_dict(msg) for msg in messages])
+
+ assert "This is me, the AI" in messages_json
+ assert "This is me, the human" in messages_json
+
+ # remove the records from Xata, so the next test run won't pick them up
+ memory.chat_memory.clear()
From 3e5cda3405ec1aa369fe90253d88f3e26a03db10 Mon Sep 17 00:00:00 2001
From: Erick Friis
Date: Thu, 24 Aug 2023 17:41:54 -0700
Subject: [PATCH 120/143] Hub Push Ergonomics (#9731)
Improves the hub pushing experience, returning a url instead of just a
commit hash.
Requires hub sdk 0.1.8
---
libs/langchain/langchain/hub.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/libs/langchain/langchain/hub.py b/libs/langchain/langchain/hub.py
index 21e7348ea..0c0d2d0fe 100644
--- a/libs/langchain/langchain/hub.py
+++ b/libs/langchain/langchain/hub.py
@@ -34,7 +34,7 @@ def push(
new_repo_description: str = "",
) -> str:
"""
- Pushes an object to the hub and returns the new commit hash.
+ Pushes an object to the hub and returns the URL it can be viewed at in a browser.
:param repo_full_name: The full name of the repo to push to in the format of
`owner/repo`.
@@ -51,15 +51,14 @@ def push(
"""
client = _get_client(api_url=api_url, api_key=api_key)
manifest_json = dumps(object)
- resp = client.push(
+ message = client.push(
repo_full_name,
manifest_json,
parent_commit_hash=parent_commit_hash,
new_repo_is_public=new_repo_is_public,
new_repo_description=new_repo_description,
)
- commit_hash: str = resp["commit"]["commit_hash"]
- return commit_hash
+ return message
def pull(
From adb21782b8e881ed6e3b5d1388e5af710c4b70d1 Mon Sep 17 00:00:00 2001
From: Naama Magami <58214719+NaamaMagami@users.noreply.github.com>
Date: Fri, 25 Aug 2023 07:09:30 +0300
Subject: [PATCH 121/143] Add del vector pgvector + adding modification time to
confluence and google drive docs (#9604)
Description:
- adding implementation of delete for pgvector
- adding modification time in docs metadata for confluence and google
drive.
Issue:
https://github.com/langchain-ai/langchain/issues/9312
Tag maintainer: @baskaryan, @eyurtsev, @hwchase17, @rlancemartin.
---------
Co-authored-by: Eugene Yurtsev
---
.../langchain/document_loaders/confluence.py | 19 ++++++++----
.../langchain/document_loaders/googledrive.py | 7 ++++-
.../langchain/vectorstores/pgvector.py | 30 +++++++++++++++++++
.../vectorstores/test_pgvector.py | 28 +++++++++++++++++
4 files changed, 77 insertions(+), 7 deletions(-)
diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py
index 739f52f48..5a12e8984 100644
--- a/libs/langchain/langchain/document_loaders/confluence.py
+++ b/libs/langchain/langchain/document_loaders/confluence.py
@@ -338,7 +338,9 @@ class ConfluenceLoader(BaseLoader):
),
before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id)
- page = get_page(page_id=page_id, expand=content_format.value)
+ page = get_page(
+ page_id=page_id, expand=f"{content_format.value},version"
+ )
if not include_restricted_content and not self.is_public_page(page):
continue
doc = self.process_page(
@@ -505,13 +507,18 @@ class ConfluenceLoader(BaseLoader):
]
text = text + "".join(comment_texts)
+ metadata = {
+ "title": page["title"],
+ "id": page["id"],
+ "source": self.base_url.strip("/") + page["_links"]["webui"],
+ }
+
+ if "version" in page and "when" in page["version"]:
+ metadata["when"] = page["version"]["when"]
+
return Document(
page_content=text,
- metadata={
- "title": page["title"],
- "id": page["id"],
- "source": self.base_url.strip("/") + page["_links"]["webui"],
- },
+ metadata=metadata,
)
def process_attachment(
diff --git a/libs/langchain/langchain/document_loaders/googledrive.py b/libs/langchain/langchain/document_loaders/googledrive.py
index 881bafbd5..513f9bba7 100644
--- a/libs/langchain/langchain/document_loaders/googledrive.py
+++ b/libs/langchain/langchain/document_loaders/googledrive.py
@@ -200,7 +200,11 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
- file = service.files().get(fileId=id, supportsAllDrives=True).execute()
+ file = (
+ service.files()
+ .get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name")
+ .execute()
+ )
request = service.files().export_media(fileId=id, mimeType="text/plain")
fh = BytesIO()
downloader = MediaIoBaseDownload(fh, request)
@@ -219,6 +223,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
metadata = {
"source": f"https://docs.google.com/document/d/{id}/edit",
"title": f"{file.get('name')}",
+ "when": f"{file.get('modifiedTime')}",
}
return Document(page_content=text, metadata=metadata)
diff --git a/libs/langchain/langchain/vectorstores/pgvector.py b/libs/langchain/langchain/vectorstores/pgvector.py
index a86a88cb1..6b02fc19c 100644
--- a/libs/langchain/langchain/vectorstores/pgvector.py
+++ b/libs/langchain/langchain/vectorstores/pgvector.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import contextlib
import enum
import logging
import uuid
@@ -8,6 +9,7 @@ from typing import (
Any,
Callable,
Dict,
+ Generator,
Iterable,
List,
Optional,
@@ -16,6 +18,7 @@ from typing import (
)
import sqlalchemy
+from sqlalchemy import delete
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Session, declarative_base
@@ -170,6 +173,33 @@ class PGVector(VectorStore):
session.delete(collection)
session.commit()
+    @contextlib.contextmanager
+    def _make_session(self) -> Generator[Session, None, None]:
+        """Create a context manager for the session, bind to _conn string.
+
+        Yields:
+            Session: A session bound to ``self._conn``. It is closed when the
+                ``with`` block is left, so it is not leaked.
+        """
+        with Session(self._conn) as session:
+            yield session
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Remove the stored vectors whose custom id is listed in ``ids``.
+
+        Args:
+            ids: Custom ids of the vectors to delete. When ``None``, nothing
+                is deleted.
+        """
+        with Session(self._conn) as db_session:
+            if ids is not None:
+                self.logger.debug(
+                    "Trying to delete vectors by ids (represented by the model "
+                    "using the custom ids field)"
+                )
+                db_session.execute(
+                    delete(self.EmbeddingStore).where(
+                        self.EmbeddingStore.custom_id.in_(ids)
+                    )
+                )
+            db_session.commit()
+
def get_collection(self, session: Session) -> Optional["CollectionStore"]:
return self.CollectionStore.get_by_name(session, self.collection_name)
diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py b/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py
index 46c8f11e1..6d6028497 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py
@@ -186,6 +186,34 @@ def test_pgvector_with_filter_in_set() -> None:
]
+def test_pgvector_delete_docs() -> None:
+ """Add and delete documents."""
+ texts = ["foo", "bar", "baz"]
+ metadatas = [{"page": str(i)} for i in range(len(texts))]
+ docsearch = PGVector.from_texts(
+ texts=texts,
+ collection_name="test_collection_filter",
+ embedding=FakeEmbeddingsWithAdaDimension(),
+ metadatas=metadatas,
+ ids=["1", "2", "3"],
+ connection_string=CONNECTION_STRING,
+ pre_delete_collection=True,
+ )
+ docsearch.delete(["1", "2"])
+ with docsearch._make_session() as session:
+ records = list(session.query(docsearch.EmbeddingStore).all())
+ # ignoring type error since mypy cannot determine whether
+ # the list is sortable
+ assert sorted(record.custom_id for record in records) == ["3"] # type: ignore
+
+ docsearch.delete(["2", "3"]) # Should not raise on missing ids
+ with docsearch._make_session() as session:
+ records = list(session.query(docsearch.EmbeddingStore).all())
+ # ignoring type error since mypy cannot determine whether
+ # the list is sortable
+ assert sorted(record.custom_id for record in records) == [] # type: ignore
+
+
def test_pgvector_relevance_score() -> None:
"""Test to make sure the relevance score is scaled to 0-1."""
texts = ["foo", "bar", "baz"]
From 87da56fb1e11f6102ae1ffd3a8878afee2958248 Mon Sep 17 00:00:00 2001
From: Leonid Kuligin
Date: Fri, 25 Aug 2023 06:44:49 +0200
Subject: [PATCH 122/143] Added a pdf parser based on DocAI (#9579)
#9578
---------
Co-authored-by: Leonid Kuligin
Co-authored-by: Eugene Yurtsev
---
.../document_transformers/docai.ipynb | 283 +++++++++++++++++
.../document_loaders/parsers/__init__.py | 2 +
.../document_loaders/parsers/docai.py | 292 ++++++++++++++++++
.../parsers/test_public_api.py | 1 +
4 files changed, 578 insertions(+)
create mode 100644 docs/extras/integrations/document_transformers/docai.ipynb
create mode 100644 libs/langchain/langchain/document_loaders/parsers/docai.py
diff --git a/docs/extras/integrations/document_transformers/docai.ipynb b/docs/extras/integrations/document_transformers/docai.ipynb
new file mode 100644
index 000000000..8cf81ff67
--- /dev/null
+++ b/docs/extras/integrations/document_transformers/docai.ipynb
@@ -0,0 +1,283 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "48438efb-9f0d-473b-a91c-9f1e29c2539d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders.blob_loaders import Blob\n",
+ "from langchain.document_loaders.parsers import DocAIParser"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f95ac25b-f025-40c3-95b8-77919fc4da7f",
+ "metadata": {},
+ "source": [
+ "DocAI is a Google Cloud platform to transform unstructured data from documents into structured data, making it easier to understand, analyze, and consume. You can read more about it: https://cloud.google.com/document-ai/docs/overview "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "51946817-798c-4d11-abd6-db2ae53a0270",
+ "metadata": {},
+ "source": [
+ "First, you need to set up a GCS bucket and create your own OCR processor as described here: https://cloud.google.com/document-ai/docs/create-processor\n",
+ "The GCS_OUTPUT_PATH should be a path to a folder on GCS (starting with `gs://`) and a processor name should look like `projects/PROJECT_NUMBER/locations/LOCATION/processors/PROCESSOR_ID`. You can get it either programmatically or copy from the `Prediction endpoint` section of the `Processor details` tab in the Google Cloud Console."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "ac85f7f3-3ef6-41d5-920a-b55f2939c202",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "PROJECT = \"PUT_SOMETHING_HERE\"\n",
+ "GCS_OUTPUT_PATH = \"PUT_SOMETHING_HERE\"\n",
+ "PROCESSOR_NAME = \"PUT_SOMETHING_HERE\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fad2bcca-1c0e-4888-b82d-15823ba57e60",
+ "metadata": {},
+ "source": [
+ "Now, let's create a parser:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "dcc0c65a-86c5-448d-8b21-2e564b1903b7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "parser = DocAIParser(location=\"us\", processor_name=PROCESSOR_NAME, gcs_output_path=GCS_OUTPUT_PATH)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b8b5a3ff-650a-4ad3-a73a-395f86e4c9e1",
+ "metadata": {},
+ "source": [
+ "Let's parse Alphabet's Q1 2023 Exhibit 99.1 PDF from here: https://abc.xyz/assets/a7/5b/9e5ae0364b12b4c883f3cf748226/goog-exhibit-99-1-q1-2023-19.pdf. Copy it to your GCS bucket first, and adjust the path below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "373cc18e-a311-4c8d-8180-47e4ade1d2ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "blob = Blob(path=\"gs://vertex-pgt/examples/goog-exhibit-99-1-q1-2023-19.pdf\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "6ef84fad-2981-456d-a6b4-3a6a1a46d511",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = list(parser.lazy_parse(blob))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f8e4ee1-e07d-4c29-a120-4d56aae91859",
+ "metadata": {},
+ "source": [
+ "We'll get one document per page, 11 in total:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "343919f5-35d2-47fb-9790-de464649ebdf",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(docs))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b104ae56-011b-4abe-ac07-e999c69494c5",
+ "metadata": {},
+ "source": [
+ "You can run end-to-end parsing one blob at a time. If you have many documents, it may be better to batch them together, and perhaps even decouple parsing from handling the parsing results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "9ecc1b99-5cef-47b0-a125-dbb2c41d2224",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['projects/543079149601/locations/us/operations/16447136779727347991']\n"
+ ]
+ }
+ ],
+ "source": [
+ "operations = parser.docai_parse([blob])\n",
+ "print([op.operation.name for op in operations])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a2d24d63-c2c7-454c-9df3-2a9cf51309a6",
+ "metadata": {},
+ "source": [
+ "You can check whether operations are finished:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "ab11efb0-e514-4f44-9ba5-3d638a59c9e6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "parser.is_running(operations)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "602ca0bc-080a-4a4e-a413-0e705aeab189",
+ "metadata": {},
+ "source": [
+ "And when they're finished, you can parse the results:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "ec1e6041-bc10-47d4-ba64-d09055c14f27",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "parser.is_running(operations)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "95d89da4-1c8a-413d-8473-ddd4a39375a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "DocAIParsingResults(source_path='gs://vertex-pgt/examples/goog-exhibit-99-1-q1-2023-19.pdf', parsed_path='gs://vertex-pgt/test/run1/16447136779727347991/0')\n"
+ ]
+ }
+ ],
+ "source": [
+ "results = parser.get_results(operations)\n",
+ "print(results[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "87e5b606-1679-46c7-9577-4cf9bc93a752",
+ "metadata": {},
+ "source": [
+ "And now we can finally generate Documents from parsed results:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "08e8878d-889b-41ad-9500-2f772d38782f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = list(parser.parse_from_results(results))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "c59525fb-448d-444b-8f12-c4aea791e19b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(docs))"
+ ]
+ }
+ ],
+ "metadata": {
+ "environment": {
+ "kernel": "python3",
+ "name": "common-cpu.m109",
+ "type": "gcloud",
+ "uri": "gcr.io/deeplearning-platform-release/base-cpu:m109"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/langchain/langchain/document_loaders/parsers/__init__.py b/libs/langchain/langchain/document_loaders/parsers/__init__.py
index 5d4843e9a..e2233e5cc 100644
--- a/libs/langchain/langchain/document_loaders/parsers/__init__.py
+++ b/libs/langchain/langchain/document_loaders/parsers/__init__.py
@@ -1,4 +1,5 @@
from langchain.document_loaders.parsers.audio import OpenAIWhisperParser
+from langchain.document_loaders.parsers.docai import DocAIParser
from langchain.document_loaders.parsers.grobid import GrobidParser
from langchain.document_loaders.parsers.html import BS4HTMLParser
from langchain.document_loaders.parsers.language import LanguageParser
@@ -12,6 +13,7 @@ from langchain.document_loaders.parsers.pdf import (
__all__ = [
"BS4HTMLParser",
+ "DocAIParser",
"GrobidParser",
"LanguageParser",
"OpenAIWhisperParser",
diff --git a/libs/langchain/langchain/document_loaders/parsers/docai.py b/libs/langchain/langchain/document_loaders/parsers/docai.py
new file mode 100644
index 000000000..dd6913ac6
--- /dev/null
+++ b/libs/langchain/langchain/document_loaders/parsers/docai.py
@@ -0,0 +1,292 @@
+"""Module contains a PDF parser based on DocAI from Google Cloud.
+
+You need to install two libraries to use this parser:
+pip install google-cloud-documentai
+pip install google-cloud-documentai-toolbox
+"""
+import logging
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.utils.iter import batch_iterate
+
+if TYPE_CHECKING:
+ from google.api_core.operation import Operation
+ from google.cloud.documentai import DocumentProcessorServiceClient
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DocAIParsingResults:
+ """A dataclass to store DocAI parsing results."""
+
+ source_path: str
+ parsed_path: str
+
+
+class DocAIParser(BaseBlobParser):
+ def __init__(
+ self,
+ *,
+ client: Optional["DocumentProcessorServiceClient"] = None,
+ location: Optional[str] = None,
+ gcs_output_path: Optional[str] = None,
+ processor_name: Optional[str] = None,
+ ):
+ """Initializes the parser.
+
+ Args:
+ client: a DocumentProcessorServiceClient to use
+ location: a GCP location where a DocAI parser is located
+ gcs_output_path: a path on GCS to store parsing results
+ processor_name: name of a processor
+
+ You should provide either a client or location (and then a client
+ would be instantiated).
+ """
+ if client and location:
+ raise ValueError(
+ "You should provide either a client or a location but not both "
+ "of them."
+ )
+ if not client and not location:
+ raise ValueError(
+ "You must specify either a client or a location to instantiate "
+ "a client."
+ )
+
+ self._gcs_output_path = gcs_output_path
+ self._processor_name = processor_name
+ if client:
+ self._client = client
+ else:
+ try:
+ from google.api_core.client_options import ClientOptions
+ from google.cloud.documentai import DocumentProcessorServiceClient
+ except ImportError:
+ raise ImportError(
+ "documentai package not found, please install it with"
+ " `pip install google-cloud-documentai`"
+ )
+ options = ClientOptions(
+ api_endpoint=f"{location}-documentai.googleapis.com"
+ )
+ self._client = DocumentProcessorServiceClient(client_options=options)
+
+ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+ """Parses a blob lazily.
+
+ Args:
+ blob: a Blob to parse
+
+ This is a long-running operation! The recommended way is to batch
+ documents together and use the `batch_parse` method.
+ """
+ yield from self.batch_parse([blob], gcs_output_path=self._gcs_output_path)
+
+ def batch_parse(
+ self,
+ blobs: Sequence[Blob],
+ gcs_output_path: Optional[str] = None,
+ timeout_sec: int = 3600,
+ check_in_interval_sec: int = 60,
+ ) -> Iterator[Document]:
+ """Parses a list of blobs lazily.
+
+ Args:
+ blobs: a list of blobs to parse
+ gcs_output_path: a path on GCS to store parsing results
+ timeout_sec: a timeout to wait for DocAI to complete, in seconds
+ check_in_interval_sec: an interval to wait until next check
+ whether parsing operations have been completed, in seconds
+ This is a long-running operation! The recommended way is to decouple
+ parsing from creating Langchain Documents:
+ >>> operations = parser.docai_parse(blobs, gcs_output_path=gcs_path)
+ >>> parser.is_running(operations)
+ You can get operation names and save them:
+ >>> operation_names = [op.operation.name for op in operations]
+ And when all operations are finished, you can use their results:
+ >>> operations = parser.operations_from_names(operation_names)
+ >>> results = parser.get_results(operations)
+ >>> docs = parser.parse_from_results(results)
+ """
+ output_path = gcs_output_path if gcs_output_path else self._gcs_output_path
+ if output_path is None:
+ raise ValueError("An output path on GCS should be provided!")
+ operations = self.docai_parse(blobs, gcs_output_path=output_path)
+ operation_names = [op.operation.name for op in operations]
+ logger.debug(
+ f"Started parsing with DocAI, submitted operations {operation_names}"
+ )
+ is_running, time_elapsed = True, 0
+ while is_running:
+ is_running = self.is_running(operations)
+ if not is_running:
+ break
+ time.sleep(check_in_interval_sec)
+ time_elapsed += check_in_interval_sec
+ if time_elapsed > timeout_sec:
+ raise ValueError(
+ "Timeout exceeded! Check operations " f"{operation_names} later!"
+ )
+ logger.debug(".")
+
+ results = self.get_results(operations=operations)
+ yield from self.parse_from_results(results)
+
+ def parse_from_results(
+ self, results: List[DocAIParsingResults]
+ ) -> Iterator[Document]:
+ try:
+ from google.cloud.documentai_toolbox.wrappers.document import _get_shards
+ from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
+ except ImportError:
+ raise ImportError(
+ "documentai_toolbox package not found, please install it with"
+ " `pip install google-cloud-documentai-toolbox`"
+ )
+ for result in results:
+ output_gcs = result.parsed_path.split("/")
+ gcs_bucket_name = output_gcs[2]
+ gcs_prefix = "/".join(output_gcs[3:]) + "/"
+ shards = _get_shards(gcs_bucket_name, gcs_prefix)
+ docs, page_number = [], 1
+ for shard in shards:
+ for page in shard.pages:
+ docs.append(
+ Document(
+ page_content=_text_from_layout(page.layout, shard.text),
+ metadata={
+ "page": page_number,
+ "source": result.source_path,
+ },
+ )
+ )
+ page_number += 1
+ yield from docs
+
+ def operations_from_names(self, operation_names: List[str]) -> List["Operation"]:
+ """Initializes Long-Running Operations from their names."""
+ try:
+ from google.longrunning.operations_pb2 import (
+ GetOperationRequest, # type: ignore
+ )
+ except ImportError:
+ raise ImportError(
+ "documentai package not found, please install it with"
+ " `pip install gapic-google-longrunning`"
+ )
+
+ operations = []
+ for name in operation_names:
+ request = GetOperationRequest(name=name)
+ operations.append(self._client.get_operation(request=request))
+ return operations
+
+ def is_running(self, operations: List["Operation"]) -> bool:
+ for op in operations:
+ if not op.done():
+ return True
+ return False
+
+ def docai_parse(
+ self,
+ blobs: Sequence[Blob],
+ *,
+ gcs_output_path: Optional[str] = None,
+ batch_size: int = 4000,
+ enable_native_pdf_parsing: bool = True,
+ ) -> List["Operation"]:
+ """Runs Google DocAI PDF parser on a list of blobs.
+
+ Args:
+ blobs: a list of blobs to be parsed
+ gcs_output_path: a path (folder) on GCS to store results
+ batch_size: amount of documents per batch
+ enable_native_pdf_parsing: a config option for the parser
+
+ DocAI limits the number of documents per batch, so the input is split into
+ mini-batches. Parsing is an async long-running operation
+ on Google Cloud and results are stored in an output GCS bucket.
+ """
+ try:
+ from google.cloud import documentai
+ from google.cloud.documentai_v1.types import OcrConfig, ProcessOptions
+ except ImportError:
+ raise ImportError(
+ "documentai package not found, please install it with"
+ " `pip install google-cloud-documentai`"
+ )
+
+ if not self._processor_name:
+ raise ValueError("Processor name is not defined, aborting!")
+ output_path = gcs_output_path if gcs_output_path else self._gcs_output_path
+ if output_path is None:
+ raise ValueError("An output path on GCS should be provided!")
+
+ operations = []
+ for batch in batch_iterate(size=batch_size, iterable=blobs):
+ documents = []
+ for blob in batch:
+ gcs_document = documentai.GcsDocument(
+ gcs_uri=blob.path, mime_type="application/pdf"
+ )
+ documents.append(gcs_document)
+ gcs_documents = documentai.GcsDocuments(documents=documents)
+
+ input_config = documentai.BatchDocumentsInputConfig(
+ gcs_documents=gcs_documents
+ )
+
+ gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
+ gcs_uri=output_path, field_mask=None
+ )
+ output_config = documentai.DocumentOutputConfig(
+ gcs_output_config=gcs_output_config
+ )
+
+ if enable_native_pdf_parsing:
+ process_options = ProcessOptions(
+ ocr_config=OcrConfig(
+ enable_native_pdf_parsing=enable_native_pdf_parsing
+ )
+ )
+ else:
+ process_options = ProcessOptions()
+ request = documentai.BatchProcessRequest(
+ name=self._processor_name,
+ input_documents=input_config,
+ document_output_config=output_config,
+ process_options=process_options,
+ )
+ operations.append(self._client.batch_process_documents(request))
+ return operations
+
+ def get_results(self, operations: List["Operation"]) -> List[DocAIParsingResults]:
+ try:
+ from google.cloud.documentai_v1 import BatchProcessMetadata
+ except ImportError:
+ raise ImportError(
+ "documentai package not found, please install it with"
+ " `pip install google-cloud-documentai`"
+ )
+
+ results = []
+ for op in operations:
+ if isinstance(op.metadata, BatchProcessMetadata):
+ metadata = op.metadata
+ else:
+ metadata = BatchProcessMetadata.deserialize(op.metadata.value)
+ for status in metadata.individual_process_statuses:
+ source = status.input_gcs_source
+ output = status.output_gcs_destination
+ results.append(
+ DocAIParsingResults(source_path=source, parsed_path=output)
+ )
+ return results
diff --git a/libs/langchain/tests/unit_tests/document_loaders/parsers/test_public_api.py b/libs/langchain/tests/unit_tests/document_loaders/parsers/test_public_api.py
index 84f2db36b..f1037064b 100644
--- a/libs/langchain/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -5,6 +5,7 @@ def test_parsers_public_api_correct() -> None:
"""Test public API of parsers for breaking changes."""
assert set(__all__) == {
"BS4HTMLParser",
+ "DocAIParser",
"GrobidParser",
"LanguageParser",
"OpenAIWhisperParser",
From ade482c17e07782daf43ad9f0e72a65065578374 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Thu, 24 Aug 2023 21:55:22 -0700
Subject: [PATCH 123/143] add twitter chat loader doc (#9737)
---
...itter-scraper_2023-08-23_22-13-19-740.json | 2635 +++++++++++++++++
.../integrations/chat_loaders/twitter.ipynb | 77 +
2 files changed, 2712 insertions(+)
create mode 100644 docs/extras/integrations/chat_loaders/example_data/dataset_twitter-scraper_2023-08-23_22-13-19-740.json
create mode 100644 docs/extras/integrations/chat_loaders/twitter.ipynb
diff --git a/docs/extras/integrations/chat_loaders/example_data/dataset_twitter-scraper_2023-08-23_22-13-19-740.json b/docs/extras/integrations/chat_loaders/example_data/dataset_twitter-scraper_2023-08-23_22-13-19-740.json
new file mode 100644
index 000000000..9d7ce57b5
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/example_data/dataset_twitter-scraper_2023-08-23_22-13-19-740.json
@@ -0,0 +1,2635 @@
+[{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519480761749016577",
+ "conversation_id": "1519480761749016577",
+ "full_text": "Next I’m buying Coca-Cola to put the cocaine back in",
+ "reply_count": 187291,
+ "retweet_count": 648962,
+ "favorite_count": 4596262,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1519480761749016577",
+ "created_at": "2022-04-28T00:56:58.000Z",
+ "quote_count": 171980,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1518623997054918657",
+ "conversation_id": "1518623997054918657",
+ "full_text": "I hope that even my worst critics remain on Twitter, because that is what free speech means",
+ "reply_count": 174468,
+ "retweet_count": 351409,
+ "favorite_count": 3105543,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1518623997054918657",
+ "created_at": "2022-04-25T16:12:30.000Z",
+ "quote_count": 70717,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519495072802390016",
+ "conversation_id": "1519495072802390016",
+ "full_text": "Let’s make Twitter maximum fun!",
+ "reply_count": 110542,
+ "retweet_count": 184310,
+ "favorite_count": 2542681,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1519495072802390016",
+ "created_at": "2022-04-28T01:53:50.000Z",
+ "quote_count": 34654,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1518677066325053441",
+ "conversation_id": "1518677066325053441",
+ "full_text": "🚀💫♥️ Yesss!!! ♥️💫🚀 https://t.co/0T9HzUHuh6",
+ "reply_count": 145150,
+ "retweet_count": 330889,
+ "favorite_count": 2505986,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FRNsuSFWUAUW6aP.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1518677066325053441",
+ "created_at": "2022-04-25T19:43:22.000Z",
+ "quote_count": 61362,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519495982723084290",
+ "conversation_id": "1519495982723084290",
+ "full_text": "Listen, I can’t do miracles ok https://t.co/z7dvLMUXy8",
+ "reply_count": 74894,
+ "retweet_count": 202182,
+ "favorite_count": 2473727,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FRZViwWX0AMsqQ1.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1519495982723084290",
+ "created_at": "2022-04-28T01:57:27.000Z",
+ "quote_count": 25525,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1585841080431321088",
+ "conversation_id": "1585841080431321088",
+ "full_text": "the bird is freed",
+ "reply_count": 137792,
+ "retweet_count": 330476,
+ "favorite_count": 2370568,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1585841080431321088",
+ "created_at": "2022-10-28T03:49:11.000Z",
+ "quote_count": 52481,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1586104694421659648",
+ "conversation_id": "1586104694421659648",
+ "full_text": "Comedy is now legal on Twitter",
+ "reply_count": 87869,
+ "retweet_count": 236840,
+ "favorite_count": 2274201,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1586104694421659648",
+ "created_at": "2022-10-28T21:16:42.000Z",
+ "quote_count": 39559,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1593459801966538755",
+ "conversation_id": "1593459801966538755",
+ "full_text": "https://t.co/rbwbsLA1ZG",
+ "reply_count": 67716,
+ "retweet_count": 218714,
+ "favorite_count": 2030318,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fh0bPd7VQAAU31D.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1593459801966538755",
+ "created_at": "2022-11-18T04:23:16.000Z",
+ "quote_count": 165205,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1523465632502906880",
+ "conversation_id": "1523465632502906880",
+ "full_text": "If I die under mysterious circumstances, it’s been nice knowin ya",
+ "reply_count": 142816,
+ "retweet_count": 164576,
+ "favorite_count": 1814691,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1523465632502906880",
+ "created_at": "2022-05-09T00:51:26.000Z",
+ "quote_count": 40125,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1587894226695884800",
+ "conversation_id": "1587894226695884800",
+ "full_text": "https://t.co/kGncG7Hs3M",
+ "reply_count": 71915,
+ "retweet_count": 167091,
+ "favorite_count": 1796047,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FglVYVmXkAIWB5w.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1587894226695884800",
+ "created_at": "2022-11-02T19:47:39.000Z",
+ "quote_count": 63550,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1236029449042198528",
+ "conversation_id": "1236029449042198528",
+ "full_text": "The coronavirus panic is dumb",
+ "reply_count": 39483,
+ "retweet_count": 267574,
+ "favorite_count": 1453833,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1236029449042198528",
+ "created_at": "2020-03-06T20:42:39.000Z",
+ "quote_count": 38185,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519020176884305920",
+ "conversation_id": "1519020176884305920",
+ "full_text": "The extreme antibody reaction from those who fear free speech says it all",
+ "reply_count": 76483,
+ "retweet_count": 182948,
+ "favorite_count": 1586156,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1519020176884305920",
+ "created_at": "2022-04-26T18:26:46.000Z",
+ "quote_count": 17843,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1685096284275802112",
+ "conversation_id": "1685096284275802112",
+ "full_text": "https://t.co/XEydRiST9D",
+ "reply_count": 129650,
+ "retweet_count": 100921,
+ "favorite_count": 1672309,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F2KqI_ZXUAAGRCD.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1685096284275802112",
+ "created_at": "2023-07-29T01:13:56.000Z",
+ "view_count": 153952239,
+ "quote_count": 65149,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519735033950470144",
+ "conversation_id": "1519735033950470144",
+ "full_text": "https://t.co/Q9OjlJhi7f",
+ "reply_count": 88482,
+ "retweet_count": 199755,
+ "favorite_count": 1498481,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FRcu9TeXEAMjvTM.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1519735033950470144",
+ "created_at": "2022-04-28T17:47:22.000Z",
+ "quote_count": 43234,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519415674111672325",
+ "conversation_id": "1519415674111672325",
+ "full_text": "For Twitter to deserve public trust, it must be politically neutral, which effectively means upsetting the far right and the far left equally",
+ "reply_count": 77096,
+ "retweet_count": 137275,
+ "favorite_count": 1489409,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1519415674111672325",
+ "created_at": "2022-04-27T20:38:20.000Z",
+ "quote_count": 28306,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1479236333516165121",
+ "conversation_id": "1479236333516165121",
+ "full_text": "Starlinks with “lasers” deployed to orbit https://t.co/Y1eg9gl7sJ",
+ "reply_count": 12544,
+ "retweet_count": 9849,
+ "favorite_count": 915843,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FIdNmXtVkAEF0Ss.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1479236333516165121",
+ "created_at": "2022-01-06T23:39:59.000Z",
+ "quote_count": 1185,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1670234980776132608",
+ "conversation_id": "1670234980776132608",
+ "full_text": "Oh hi lol https://t.co/pLxkLDu0Qs",
+ "reply_count": 56500,
+ "retweet_count": 125873,
+ "favorite_count": 1546285,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fy3d3Q4XsAAPSAN.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1670234980776132608",
+ "created_at": "2023-06-18T01:00:25.000Z",
+ "view_count": 80352350,
+ "quote_count": 9952,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519469891455234048",
+ "conversation_id": "1519469891455234048",
+ "full_text": "Twitter DMs should have end to end encryption like Signal, so no one can spy on or hack your messages",
+ "reply_count": 40662,
+ "retweet_count": 101908,
+ "favorite_count": 1403300,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1519469891455234048",
+ "created_at": "2022-04-28T00:13:47.000Z",
+ "quote_count": 16661,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1625368108461613057",
+ "conversation_id": "1625368108461613057",
+ "full_text": "https://t.co/iZUukCVrl5",
+ "reply_count": 46125,
+ "retweet_count": 80611,
+ "favorite_count": 1436396,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fo53ramacAAigCq.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1625368108461613057",
+ "created_at": "2023-02-14T05:35:29.000Z",
+ "view_count": 177548664,
+ "quote_count": 20288,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1517707521343082496",
+ "conversation_id": "1517707521343082496",
+ "full_text": "in case u need to lose a boner fast https://t.co/fcHiaXKCJi",
+ "reply_count": 68690,
+ "retweet_count": 128285,
+ "favorite_count": 1353674,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FQ_68lnWQAIuYMM.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1517707521343082496",
+ "created_at": "2022-04-23T03:30:45.000Z",
+ "quote_count": 30414,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1531647849599057921",
+ "conversation_id": "1531647849599057921",
+ "full_text": "https://t.co/G83vCrHHJf",
+ "reply_count": 44573,
+ "retweet_count": 128520,
+ "favorite_count": 1349498,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FUGBmevWYAM-V_M.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1531647849599057921",
+ "created_at": "2022-05-31T14:44:38.000Z",
+ "quote_count": 21742,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1594191387519373313",
+ "conversation_id": "1594191387519373313",
+ "full_text": "Twitter is ALIVE",
+ "reply_count": 86483,
+ "retweet_count": 106216,
+ "favorite_count": 1377837,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1594191387519373313",
+ "created_at": "2022-11-20T04:50:20.000Z",
+ "quote_count": 21561,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1667289678612156416",
+ "conversation_id": "1667289678612156416",
+ "full_text": "https://t.co/g9gS4MUIVL",
+ "reply_count": 37176,
+ "retweet_count": 178234,
+ "favorite_count": 1383503,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FyNnICoaUAEE9Xv.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1667289678612156416",
+ "created_at": "2023-06-09T21:56:50.000Z",
+ "view_count": 87691832,
+ "quote_count": 13850,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1585341984679469056",
+ "conversation_id": "1585341984679469056",
+ "full_text": "Entering Twitter HQ – let that sink in! https://t.co/D68z4K2wq7",
+ "reply_count": 65658,
+ "retweet_count": 172649,
+ "favorite_count": 1331840,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/ext_tw_video_thumb/1585341912877146112/pu/img/DwJ7wlGIe9iryk6N.jpg",
+ "type": "video",
+ "video_url": "https://video.twimg.com/ext_tw_video/1585341912877146112/pu/vid/1920x1080/aeoVUvTgj4wHShhN.mp4?tag=14"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1585341984679469056",
+ "created_at": "2022-10-26T18:45:58.000Z",
+ "view_count": 48607062,
+ "quote_count": 42432,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1521202951230046210",
+ "conversation_id": "1521202951230046210",
+ "full_text": "As I was saying … https://t.co/tsGz6fCWuW",
+ "reply_count": 52348,
+ "retweet_count": 97069,
+ "favorite_count": 1273109,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FRxmBNeXwAMVTOg.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1521202951230046210",
+ "created_at": "2022-05-02T19:00:20.000Z",
+ "quote_count": 9562,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1588750686006947840",
+ "conversation_id": "1588750686006947840",
+ "full_text": "Trash me all day, but it’ll cost $8",
+ "reply_count": 104121,
+ "retweet_count": 89130,
+ "favorite_count": 1300597,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1588750686006947840",
+ "created_at": "2022-11-05T04:30:55.000Z",
+ "quote_count": 28347,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1053390822991790083",
+ "conversation_id": "1053390822991790083",
+ "full_text": "Had to been done ur welcome https://t.co/7jT0f9lqIS",
+ "reply_count": 15864,
+ "retweet_count": 322101,
+ "favorite_count": 1102018,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Dp5lXiYUUAAngKq.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1053390822991790083",
+ "created_at": "2018-10-19T21:01:57.000Z",
+ "quote_count": 24554,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1590755506112823296",
+ "conversation_id": "1590755506112823296",
+ "full_text": "I love when people complain about Twitter … on Twitter 🤣🤣",
+ "reply_count": 78232,
+ "retweet_count": 101956,
+ "favorite_count": 1268496,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1590755506112823296",
+ "created_at": "2022-11-10T17:17:22.000Z",
+ "quote_count": 28192,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1601894132573605888",
+ "conversation_id": "1601894132573605888",
+ "full_text": "My pronouns are Prosecute/Fauci",
+ "reply_count": 110524,
+ "retweet_count": 180237,
+ "favorite_count": 1231419,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1601894132573605888",
+ "created_at": "2022-12-11T10:58:17.000Z",
+ "quote_count": 34573,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1625695877326340102",
+ "conversation_id": "1625695877326340102",
+ "full_text": "The new CEO of Twitter is amazing https://t.co/yBqWFUDIQH",
+ "reply_count": 42614,
+ "retweet_count": 87988,
+ "favorite_count": 1240934,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fo-hx39aIAABMKW.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1625695877326340102",
+ "created_at": "2023-02-15T03:17:55.000Z",
+ "view_count": 140213026,
+ "quote_count": 14042,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1601624795585486848",
+ "conversation_id": "1601624795585486848",
+ "full_text": "🇲🇦🇲🇦 Congrats Morocco!! 🇲🇦🇲🇦",
+ "reply_count": 32666,
+ "retweet_count": 132562,
+ "favorite_count": 1219545,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1601624795585486848",
+ "created_at": "2022-12-10T17:08:02.000Z",
+ "quote_count": 12875,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1607997591870124032",
+ "conversation_id": "1607997591870124032",
+ "full_text": "I’m not brainwashed!! https://t.co/4kx61uu4yy",
+ "reply_count": 71411,
+ "retweet_count": 151179,
+ "favorite_count": 1188252,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FlDBSYAXgAAlX8i.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1607997591870124032",
+ "created_at": "2022-12-28T07:11:15.000Z",
+ "view_count": 130353016,
+ "quote_count": 39375,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1503287788652871680",
+ "conversation_id": "1503287788652871680",
+ "full_text": "https://t.co/Gw6xaw1u0N",
+ "reply_count": 30439,
+ "retweet_count": 137446,
+ "favorite_count": 1141043,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FNzARriXsAMoSur.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1503287788652871680",
+ "created_at": "2022-03-14T08:31:53.000Z",
+ "quote_count": 24278,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1514720245113577473",
+ "conversation_id": "1514720245113577473",
+ "full_text": "i♥️u",
+ "reply_count": 84542,
+ "retweet_count": 79423,
+ "favorite_count": 1139093,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1514720245113577473",
+ "created_at": "2022-04-14T21:40:23.000Z",
+ "quote_count": 15750,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1520017094007476224",
+ "conversation_id": "1520017094007476224",
+ "full_text": "The far left hates everyone, themselves included!",
+ "reply_count": 59438,
+ "retweet_count": 114302,
+ "favorite_count": 1128704,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1520017094007476224",
+ "created_at": "2022-04-29T12:28:10.000Z",
+ "quote_count": 16725,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1530380264966434823",
+ "conversation_id": "1530380264966434823",
+ "full_text": "https://t.co/USLO967YsJ",
+ "reply_count": 35224,
+ "retweet_count": 92904,
+ "favorite_count": 1146612,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FT0AuTZWAAEgQHU.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1530380264966434823",
+ "created_at": "2022-05-28T02:47:42.000Z",
+ "quote_count": 8393,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1523658010241155073",
+ "conversation_id": "1523658010241155073",
+ "full_text": "Chocolate milk is insanely good. Just had some.",
+ "reply_count": 68723,
+ "retweet_count": 71066,
+ "favorite_count": 1090118,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1523658010241155073",
+ "created_at": "2022-05-09T13:35:52.000Z",
+ "quote_count": 14543,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1511982702819520512",
+ "conversation_id": "1511982702819520512",
+ "full_text": "https://t.co/TW2lLQakE5",
+ "reply_count": 31984,
+ "retweet_count": 103366,
+ "favorite_count": 1088637,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FPukQSkXEAAtaQb.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1511982702819520512",
+ "created_at": "2022-04-07T08:22:22.000Z",
+ "quote_count": 11547,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1688485935816581120",
+ "conversation_id": "1688485935816581120",
+ "full_text": "https://t.co/hDSTKPdQnG",
+ "reply_count": 29410,
+ "retweet_count": 67082,
+ "favorite_count": 1176975,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F261AH-WUAAuyrT.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1688485935816581120",
+ "created_at": "2023-08-07T09:43:12.000Z",
+ "view_count": 66638614,
+ "quote_count": 4715,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1546344529460174849",
+ "conversation_id": "1546344529460174849",
+ "full_text": "https://t.co/JcLMee61wj",
+ "reply_count": 36837,
+ "retweet_count": 120770,
+ "favorite_count": 1088822,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FXW4J4xXgAAXFKs.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1546344529460174849",
+ "created_at": "2022-07-11T04:04:00.000Z",
+ "quote_count": 16457,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1686050455468621831",
+ "conversation_id": "1686050455468621831",
+ "full_text": "I ♥️ Canada https://t.co/95321VIi8r",
+ "reply_count": 73993,
+ "retweet_count": 102243,
+ "favorite_count": 1156524,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F2YN81pXMAAjF1e.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1686050455468621831",
+ "created_at": "2023-07-31T16:25:28.000Z",
+ "view_count": 130861294,
+ "quote_count": 51439,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1520650036865949696",
+ "conversation_id": "1520650036865949696",
+ "full_text": "Since I’ve been asked a lot:\n\nBuy stock in several companies that make products & services that *you* believe in.\n\nOnly sell if you think their products & services are trending worse. Don’t panic when the market does.\n\nThis will serve you well in the long-term.",
+ "reply_count": 42044,
+ "retweet_count": 104025,
+ "favorite_count": 1056890,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1520650036865949696",
+ "created_at": "2022-05-01T06:23:15.000Z",
+ "quote_count": 9050,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1587297137099931649",
+ "conversation_id": "1587297137099931649",
+ "full_text": "Halloween with my Mom https://t.co/xOAgNeeiNN",
+ "reply_count": 36632,
+ "retweet_count": 44968,
+ "favorite_count": 1073096,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fgc2U1AXkAA99Ay.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1587297137099931649",
+ "created_at": "2022-11-01T04:15:02.000Z",
+ "quote_count": 6724,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1388693126206918658",
+ "conversation_id": "1388693126206918658",
+ "full_text": "I love Art Deco",
+ "reply_count": 25352,
+ "retweet_count": 11784,
+ "favorite_count": 758325,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1388693126206918658",
+ "created_at": "2021-05-02T03:13:36.000Z",
+ "quote_count": 3020,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1685384125836849153",
+ "conversation_id": "1685384125836849153",
+ "full_text": "https://t.co/5YdlVQifRn",
+ "reply_count": 38306,
+ "retweet_count": 56775,
+ "favorite_count": 1120840,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F2Ov7dOWcAAylqk.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1685384125836849153",
+ "created_at": "2023-07-29T20:17:43.000Z",
+ "view_count": 77682952,
+ "quote_count": 10273,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1524883482836623373",
+ "conversation_id": "1524883482836623373",
+ "full_text": "Biden’s mistake is that he thinks he was elected to transform the country, but actually everyone just wanted less drama",
+ "reply_count": 58436,
+ "retweet_count": 88667,
+ "favorite_count": 1035375,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1524883482836623373",
+ "created_at": "2022-05-12T22:45:27.000Z",
+ "quote_count": 13152,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1587627120355934208",
+ "conversation_id": "1587627120355934208",
+ "full_text": "To all complainers, please continue complaining, but it will cost $8",
+ "reply_count": 77567,
+ "retweet_count": 80134,
+ "favorite_count": 1051207,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1587627120355934208",
+ "created_at": "2022-11-02T02:06:16.000Z",
+ "quote_count": 29667,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1594500655724609536",
+ "conversation_id": "1594500655724609536",
+ "full_text": "And lead us not into temptation … https://t.co/8qNOXzwXS9",
+ "reply_count": 67690,
+ "retweet_count": 86645,
+ "favorite_count": 1041472,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FiDN441XEAEq5lc.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1594500655724609536",
+ "created_at": "2022-11-21T01:19:15.000Z",
+ "quote_count": 26030,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1587911540770222081",
+ "conversation_id": "1587647032457449473",
+ "full_text": "@AOC Your feedback is appreciated, now pay $8",
+ "reply_count": 43326,
+ "retweet_count": 75097,
+ "favorite_count": 1028618,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [
+ {
+ "id_str": "138203134",
+ "name": "Alexandria Ocasio-Cortez",
+ "screen_name": "AOC",
+ "profile": "https://twitter.com/AOC"
+ }
+ ],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1587911540770222081",
+ "created_at": "2022-11-02T20:56:27.000Z",
+ "quote_count": 17086,
+ "is_quote_tweet": false,
+ "replying_to_tweet": "https://twitter.com/AOC/status/1587647032457449473",
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1625377144137461761",
+ "conversation_id": "1625377144137461761",
+ "full_text": "There are no coincidences https://t.co/92Ny452J9B",
+ "reply_count": 25034,
+ "retweet_count": 87106,
+ "favorite_count": 1034478,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fo5_5eWaMAMEPaf.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1625377144137461761",
+ "created_at": "2023-02-14T06:11:23.000Z",
+ "view_count": 115414305,
+ "quote_count": 11381,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1608828315581976576",
+ "conversation_id": "1608828315581976576",
+ "full_text": "https://t.co/v1rrSsdwdg",
+ "reply_count": 56860,
+ "retweet_count": 137588,
+ "favorite_count": 984592,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FlO00p-aYAE5h8J.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1608828315581976576",
+ "created_at": "2022-12-30T14:12:15.000Z",
+ "view_count": 88845731,
+ "quote_count": 15482,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1587297730631696384",
+ "conversation_id": "1587297730631696384",
+ "full_text": "😉 https://t.co/eaIYaDRBnu",
+ "reply_count": 33970,
+ "retweet_count": 67582,
+ "favorite_count": 969459,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fgc23kFXkAEJOas.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1587297730631696384",
+ "created_at": "2022-11-01T04:17:24.000Z",
+ "quote_count": 7822,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1357236825589432322",
+ "conversation_id": "1357236825589432322",
+ "full_text": "ur welcome https://t.co/e2KF57KLxb",
+ "reply_count": 21387,
+ "retweet_count": 129294,
+ "favorite_count": 906408,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/EtXfpgGWYAEIa7y.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1357236825589432322",
+ "created_at": "2021-02-04T07:57:30.000Z",
+ "quote_count": 18283,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1595207476936413187",
+ "conversation_id": "1595207476936413187",
+ "full_text": "Wasn’t Twitter supposed to die by now or something … ?",
+ "reply_count": 65614,
+ "retweet_count": 64613,
+ "favorite_count": 932862,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1595207476936413187",
+ "created_at": "2022-11-23T00:07:54.000Z",
+ "quote_count": 12541,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1585966869122457600",
+ "conversation_id": "1585966869122457600",
+ "full_text": "🎶 let the good times roll 🎶",
+ "reply_count": 44899,
+ "retweet_count": 82648,
+ "favorite_count": 942793,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1585966869122457600",
+ "created_at": "2022-10-28T12:09:02.000Z",
+ "quote_count": 7638,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1604650028999405568",
+ "conversation_id": "1604650028999405568",
+ "full_text": "Those who want power are the ones who least deserve it",
+ "reply_count": 89824,
+ "retweet_count": 92573,
+ "favorite_count": 945701,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1604650028999405568",
+ "created_at": "2022-12-19T01:29:14.000Z",
+ "view_count": 103726233,
+ "quote_count": 26405,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1594131768298315777",
+ "conversation_id": "1594131768298315777",
+ "full_text": "The people have spoken. \n\nTrump will be reinstated.\n\nVox Populi, Vox Dei.",
+ "reply_count": 128348,
+ "retweet_count": 118481,
+ "favorite_count": 913495,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1594131768298315777",
+ "created_at": "2022-11-20T00:53:25.000Z",
+ "quote_count": 39411,
+ "is_quote_tweet": true,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "quoted_tweet": {
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1593767953706921985",
+ "conversation_id": "1593767953706921985",
+ "full_text": "Reinstate former President Trump",
+ "reply_count": 210109,
+ "retweet_count": 213875,
+ "favorite_count": 794653,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1593767953706921985",
+ "created_at": "2022-11-19T00:47:45.000Z",
+ "#sort_index": "1694472769204387754",
+ "quote_count": 75130,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false
+ },
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1593767953706921985",
+ "conversation_id": "1593767953706921985",
+ "full_text": "Reinstate former President Trump",
+ "reply_count": 210109,
+ "retweet_count": 213875,
+ "favorite_count": 794653,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1593767953706921985",
+ "created_at": "2022-11-19T00:47:45.000Z",
+ "quote_count": 75130,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1587129795732770824",
+ "conversation_id": "1587129795732770824",
+ "full_text": "If I had a dollar for every time someone asked me if Trump is coming back on this platform, Twitter would be minting money!",
+ "reply_count": 69021,
+ "retweet_count": 55275,
+ "favorite_count": 907907,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1587129795732770824",
+ "created_at": "2022-10-31T17:10:05.000Z",
+ "quote_count": 6873,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1511011921495011328",
+ "conversation_id": "1511011921495011328",
+ "full_text": "Oh hi lol",
+ "reply_count": 64749,
+ "retweet_count": 50654,
+ "favorite_count": 884878,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1511011921495011328",
+ "created_at": "2022-04-04T16:04:49.000Z",
+ "quote_count": 10090,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1683378289031761920",
+ "conversation_id": "1683378289031761920",
+ "full_text": "Our headquarters tonight https://t.co/GO6yY8R7fO",
+ "reply_count": 48576,
+ "retweet_count": 74788,
+ "favorite_count": 943571,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F1yPk5VXoAA3rGZ.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1683378289031761920",
+ "created_at": "2023-07-24T07:27:14.000Z",
+ "view_count": 110928423,
+ "quote_count": 20062,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1347978218494513152",
+ "conversation_id": "1347978218494513152",
+ "full_text": "My 14-year-old son, Saxon, said he feels like 2021 will be a good year. I agree. Let us all make it so.",
+ "reply_count": 26148,
+ "retweet_count": 57425,
+ "favorite_count": 841624,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1347978218494513152",
+ "created_at": "2021-01-09T18:47:06.000Z",
+ "quote_count": 10610,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1641858340752875535",
+ "conversation_id": "1641858340752875535",
+ "full_text": "https://t.co/qviPxhX7n8",
+ "reply_count": 49591,
+ "retweet_count": 41093,
+ "favorite_count": 918203,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FskNdX5WYAkmETe.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1641858340752875535",
+ "created_at": "2023-03-31T17:41:47.000Z",
+ "view_count": 77587609,
+ "quote_count": 10737,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1514564966564651008",
+ "conversation_id": "1514564966564651008",
+ "full_text": "I made an offer \nhttps://t.co/VvreuPMeLu",
+ "reply_count": 75744,
+ "retweet_count": 101232,
+ "favorite_count": 864685,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [
+ {
+ "url": "https://t.co/VvreuPMeLu",
+ "expanded_url": "https://www.sec.gov/Archives/edgar/data/0001418091/000110465922045641/tm2212748d1_sc13da.htm",
+ "display_url": "sec.gov/Archives/edgar…"
+ }
+ ],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1514564966564651008",
+ "created_at": "2022-04-14T11:23:21.000Z",
+ "quote_count": 30137,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1597405399040217088",
+ "conversation_id": "1597405399040217088",
+ "full_text": "This is a battle for the future of civilization. If free speech is lost even in America, tyranny is all that lies ahead.",
+ "reply_count": 82874,
+ "retweet_count": 136750,
+ "favorite_count": 878560,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1597405399040217088",
+ "created_at": "2022-11-29T01:41:40.000Z",
+ "quote_count": 16765,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1609254628113420290",
+ "conversation_id": "1609254628113420290",
+ "full_text": "Sometimes it’s just better to make pizza at home",
+ "reply_count": 51068,
+ "retweet_count": 42308,
+ "favorite_count": 804635,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1609254628113420290",
+ "created_at": "2022-12-31T18:26:16.000Z",
+ "view_count": 81906590,
+ "quote_count": 7397,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1349286488618491904",
+ "conversation_id": "1349286488618491904",
+ "full_text": "Legalize comedy",
+ "reply_count": 16426,
+ "retweet_count": 75729,
+ "favorite_count": 804936,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1349286488618491904",
+ "created_at": "2021-01-13T09:25:42.000Z",
+ "quote_count": 8341,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1618371072486936578",
+ "conversation_id": "1618371072486936578",
+ "full_text": "Changed my name to Mr. Tweet, now Twitter won’t let me change it back 🤣",
+ "reply_count": 61260,
+ "retweet_count": 53130,
+ "favorite_count": 882792,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1618371072486936578",
+ "created_at": "2023-01-25T22:11:46.000Z",
+ "view_count": 82457457,
+ "quote_count": 11770,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1642962756906418176",
+ "conversation_id": "1642962756906418176",
+ "full_text": "https://t.co/wmN5WxUhfQ",
+ "reply_count": 31801,
+ "retweet_count": 80651,
+ "favorite_count": 887233,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fsz562paMAAB_nP.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1642962756906418176",
+ "created_at": "2023-04-03T18:50:20.000Z",
+ "view_count": 76577794,
+ "quote_count": 16600,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1680423042873278465",
+ "conversation_id": "1680423042873278465",
+ "full_text": "https://t.co/LCXD4QPsNW",
+ "reply_count": 21342,
+ "retweet_count": 79920,
+ "favorite_count": 901835,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F1IP2Z9WYAA-AR0.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1680423042873278465",
+ "created_at": "2023-07-16T03:44:08.000Z",
+ "view_count": 95252197,
+ "quote_count": 8094,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1374617643446063105",
+ "conversation_id": "1374617643446063105",
+ "full_text": "You can now buy a Tesla with Bitcoin",
+ "reply_count": 32923,
+ "retweet_count": 100977,
+ "favorite_count": 816205,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1374617643446063105",
+ "created_at": "2021-03-24T07:02:40.000Z",
+ "quote_count": 21989,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1666964082363371520",
+ "conversation_id": "1666964082363371520",
+ "full_text": "https://t.co/kf7VYDgOra",
+ "reply_count": 21723,
+ "retweet_count": 102762,
+ "favorite_count": 874592,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FyI-_vraEAEfW6O.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1666964082363371520",
+ "created_at": "2023-06-09T00:23:02.000Z",
+ "view_count": 75132462,
+ "quote_count": 8403,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1266811094527508481",
+ "conversation_id": "1266811094527508481",
+ "full_text": "5 mins to T-0",
+ "reply_count": 15420,
+ "retweet_count": 44012,
+ "favorite_count": 766204,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1266811094527508481",
+ "created_at": "2020-05-30T19:17:55.000Z",
+ "quote_count": 3961,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1597165510595989504",
+ "conversation_id": "1597165510595989504",
+ "full_text": "My bedside table https://t.co/sIdRYJcLTK",
+ "reply_count": 85849,
+ "retweet_count": 53759,
+ "favorite_count": 851738,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FipFkIsVsAAM0O_.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1597165510595989504",
+ "created_at": "2022-11-28T09:48:26.000Z",
+ "quote_count": 28524,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1517215066550116354",
+ "conversation_id": "1517215066550116354",
+ "full_text": "If our twitter bid succeeds, we will defeat the spam bots or die trying!",
+ "reply_count": 32433,
+ "retweet_count": 69710,
+ "favorite_count": 833196,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1517215066550116354",
+ "created_at": "2022-04-21T18:53:55.000Z",
+ "quote_count": 12602,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1276396101872922625",
+ "conversation_id": "1276396101872922625",
+ "full_text": "https://t.co/e9dPKVSjjl",
+ "reply_count": 5811,
+ "retweet_count": 116826,
+ "favorite_count": 777176,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/EbarfO6U4AA-7c5.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1276396101872922625",
+ "created_at": "2020-06-26T06:05:19.000Z",
+ "quote_count": 7287,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1559691922725281800",
+ "conversation_id": "1559690651687608321",
+ "full_text": "Also, I’m buying Manchester United ur welcome",
+ "reply_count": 54593,
+ "retweet_count": 112817,
+ "favorite_count": 847886,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1559691922725281800",
+ "created_at": "2022-08-17T00:01:46.000Z",
+ "quote_count": 52581,
+ "is_quote_tweet": false,
+ "replying_to_tweet": "https://twitter.com/elonmusk/status/1559690651687608321",
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1519036983137509376",
+ "conversation_id": "1519020176884305920",
+ "full_text": "By “free speech”, I simply mean that which matches the law. \n\nI am against censorship that goes far beyond the law. \n\nIf people want less free speech, they will ask government to pass laws to that effect.\n\nTherefore, going beyond the law is contrary to the will of the people.",
+ "reply_count": 58411,
+ "retweet_count": 85586,
+ "favorite_count": 804143,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1519036983137509376",
+ "created_at": "2022-04-26T19:33:33.000Z",
+ "quote_count": 16481,
+ "is_quote_tweet": false,
+ "replying_to_tweet": "https://twitter.com/elonmusk/status/1519020176884305920",
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1526997132858822658",
+ "conversation_id": "1526997132858822658",
+ "full_text": "In the past I voted Democrat, because they were (mostly) the kindness party.\n\nBut they have become the party of division & hate, so I can no longer support them and will vote Republican.\n\nNow, watch their dirty tricks campaign against me unfold … 🍿",
+ "reply_count": 106371,
+ "retweet_count": 119882,
+ "favorite_count": 804876,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1526997132858822658",
+ "created_at": "2022-05-18T18:44:21.000Z",
+ "quote_count": 32796,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1688022163574439937",
+ "conversation_id": "1688022163574439937",
+ "full_text": "If you were unfairly treated by your employer due to posting or liking something on this platform, we will fund your legal bill.\n\nNo limit. \n\nPlease let us know.",
+ "reply_count": 46614,
+ "retweet_count": 142357,
+ "favorite_count": 867887,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1688022163574439937",
+ "created_at": "2023-08-06T03:00:20.000Z",
+ "view_count": 137457648,
+ "quote_count": 27757,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1432818021836357634",
+ "conversation_id": "1432818021836357634",
+ "full_text": "https://t.co/YUt6Ltz2B6",
+ "reply_count": 6742,
+ "retweet_count": 13738,
+ "favorite_count": 515958,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/E-JkZaKVIAcbTdW.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1432818021836357634",
+ "created_at": "2021-08-31T21:30:11.000Z",
+ "quote_count": 788,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1649052609590992901",
+ "conversation_id": "1649052609590992901",
+ "full_text": "https://t.co/vX3M7B3J1G",
+ "reply_count": 40115,
+ "retweet_count": 74183,
+ "favorite_count": 838276,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/ext_tw_video_thumb/1649047801446400000/pu/img/e2X_U3_Ti1mhf0fD.jpg",
+ "type": "video",
+ "video_url": "https://video.twimg.com/ext_tw_video/1649047801446400000/pu/vid/540x634/iek2j2lOnDvsuctV.mp4?tag=12"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1649052609590992901",
+ "created_at": "2023-04-20T14:09:14.000Z",
+ "view_count": 95752230,
+ "quote_count": 9531,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1597336812732575744",
+ "conversation_id": "1597336812732575744",
+ "full_text": "The Twitter Files on free speech suppression soon to be published on Twitter itself. The public deserves to know what really happened …",
+ "reply_count": 45806,
+ "retweet_count": 122369,
+ "favorite_count": 810965,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1597336812732575744",
+ "created_at": "2022-11-28T21:09:07.000Z",
+ "quote_count": 14868,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1361252063926251521",
+ "conversation_id": "1361252063926251521",
+ "full_text": "https://t.co/w11m1IAG0z",
+ "reply_count": 10498,
+ "retweet_count": 68594,
+ "favorite_count": 756594,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/EuQjiWeXAAEYUts.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1361252063926251521",
+ "created_at": "2021-02-15T09:52:37.000Z",
+ "quote_count": 5088,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1672582593638957056",
+ "conversation_id": "1672582593638957056",
+ "full_text": "Don’t even trust nobody https://t.co/VHa1zVGI71",
+ "reply_count": 22047,
+ "retweet_count": 72713,
+ "favorite_count": 832630,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FzY0_SvaIAAb9Xr.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1672582593638957056",
+ "created_at": "2023-06-24T12:29:00.000Z",
+ "view_count": 69581275,
+ "quote_count": 4599,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1686058966705487875",
+ "conversation_id": "1686058966705487875",
+ "full_text": "Wow, I’m glad so many people love Canada too 🤗 https://t.co/5oOL05zawB",
+ "reply_count": 35449,
+ "retweet_count": 44669,
+ "favorite_count": 846460,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F2YVsVIXwBMdxRO.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1686058966705487875",
+ "created_at": "2023-07-31T16:59:17.000Z",
+ "view_count": 65689425,
+ "quote_count": 8190,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1662654838398697472",
+ "conversation_id": "1662654838398697472",
+ "full_text": "Sorry this app takes up so much space https://t.co/bCCfcOhNJt",
+ "reply_count": 48357,
+ "retweet_count": 63774,
+ "favorite_count": 825999,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FxLvvm1XoAEkCaK.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1662654838398697472",
+ "created_at": "2023-05-28T02:59:38.000Z",
+ "view_count": 104719593,
+ "quote_count": 14271,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1525305145239781377",
+ "conversation_id": "1525305145239781377",
+ "full_text": "The bots are angry at being counted 🤣",
+ "reply_count": 33782,
+ "retweet_count": 52651,
+ "favorite_count": 778264,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1525305145239781377",
+ "created_at": "2022-05-14T02:41:00.000Z",
+ "quote_count": 5713,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1520021098934554624",
+ "conversation_id": "1520017094007476224",
+ "full_text": "But I’m no fan of the far right either. \n\nLet’s have less hate and more love.",
+ "reply_count": 40853,
+ "retweet_count": 44229,
+ "favorite_count": 771649,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1520021098934554624",
+ "created_at": "2022-04-29T12:44:05.000Z",
+ "quote_count": 6144,
+ "is_quote_tweet": false,
+ "replying_to_tweet": "https://twitter.com/elonmusk/status/1520017094007476224",
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1607590239874211847",
+ "conversation_id": "1607590239874211847",
+ "full_text": "Some nights … https://t.co/BLAUsJr4wb",
+ "reply_count": 41223,
+ "retweet_count": 47150,
+ "favorite_count": 777302,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fk9Oy_iWIAEx8Qd.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1607590239874211847",
+ "created_at": "2022-12-27T04:12:35.000Z",
+ "view_count": 74451033,
+ "quote_count": 10569,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1518614732839735304",
+ "conversation_id": "1518614732839735304",
+ "full_text": "And be my love in the rain",
+ "reply_count": 38409,
+ "retweet_count": 49713,
+ "favorite_count": 766115,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1518614732839735304",
+ "created_at": "2022-04-25T15:35:41.000Z",
+ "quote_count": 6798,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1497701484003213317",
+ "conversation_id": "1497543633293266944",
+ "full_text": "@FedorovMykhailo Starlink service is now active in Ukraine. More terminals en route.",
+ "reply_count": 26104,
+ "retweet_count": 127689,
+ "favorite_count": 769963,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [
+ {
+ "id_str": "1331528215899344896",
+ "name": "Mykhailo Fedorov",
+ "screen_name": "FedorovMykhailo",
+ "profile": "https://twitter.com/FedorovMykhailo"
+ }
+ ],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1497701484003213317",
+ "created_at": "2022-02-26T22:33:54.000Z",
+ "quote_count": 23725,
+ "is_quote_tweet": false,
+ "replying_to_tweet": "https://twitter.com/FedorovMykhailo/status/1497543633293266944",
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1505100708256825347",
+ "conversation_id": "1505100708256825347",
+ "full_text": "https://t.co/qZSX2up9W0",
+ "reply_count": 25006,
+ "retweet_count": 63264,
+ "favorite_count": 752723,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FOMxHZwXEAIreox.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1505100708256825347",
+ "created_at": "2022-03-19T08:35:46.000Z",
+ "quote_count": 4722,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1658960642445910017",
+ "conversation_id": "1658960642445910017",
+ "full_text": "https://t.co/FxOptt5Rgb",
+ "reply_count": 30576,
+ "retweet_count": 54781,
+ "favorite_count": 792357,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/FwXP5iKWcAEecKA.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1658960642445910017",
+ "created_at": "2023-05-17T22:20:13.000Z",
+ "view_count": 66684167,
+ "quote_count": 5027,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1358319935978496001",
+ "conversation_id": "1358319935978496001",
+ "full_text": "So … it’s finally come to this … https://t.co/Gf0Rg2QOaF",
+ "reply_count": 27511,
+ "retweet_count": 83980,
+ "favorite_count": 720596,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Etm4yFZUcAAoN5u.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1358319935978496001",
+ "created_at": "2021-02-07T07:41:23.000Z",
+ "quote_count": 7616,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1129274835173908481",
+ "conversation_id": "1129274835173908481",
+ "full_text": "And I am forever grateful https://t.co/kU1pT8t0yv",
+ "reply_count": 2904,
+ "retweet_count": 117714,
+ "favorite_count": 667576,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/D6v9ed6UwAAoKg2.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1129274835173908481",
+ "created_at": "2019-05-17T06:37:56.000Z",
+ "quote_count": 3819,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1647629006089461761",
+ "conversation_id": "1647629006089461761",
+ "full_text": "Launch attempt tomorrow https://t.co/czFsQ53Xsa",
+ "reply_count": 26620,
+ "retweet_count": 51862,
+ "favorite_count": 783936,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Ft2N2IxX0AkIbLf.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1647629006089461761",
+ "created_at": "2023-04-16T15:52:21.000Z",
+ "view_count": 75845428,
+ "quote_count": 4885,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1375033483148451842",
+ "conversation_id": "1375033483148451842",
+ "full_text": "If there’s ever a scandal about me, *please* call it Elongate",
+ "reply_count": 20973,
+ "retweet_count": 53774,
+ "favorite_count": 723756,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [],
+ "url": "https://twitter.com/elonmusk/status/1375033483148451842",
+ "created_at": "2021-03-25T10:35:03.000Z",
+ "quote_count": 8794,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1677470862436450308",
+ "conversation_id": "1677470862436450308",
+ "full_text": "Just drove Cybertruck around Austin! https://t.co/QN19Agqa7R",
+ "reply_count": 49034,
+ "retweet_count": 48651,
+ "favorite_count": 792030,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/F0eS2dyXgAAIqng.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1677470862436450308",
+ "created_at": "2023-07-08T00:13:14.000Z",
+ "view_count": 75117791,
+ "quote_count": 7125,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+},
+{
+ "username": "elonmusk",
+ "user_id": "44196397",
+ "id": "1629598417159692288",
+ "conversation_id": "1629598417159692288",
+ "full_text": "https://t.co/5wIbOXFs1e",
+ "reply_count": 14937,
+ "retweet_count": 67980,
+ "favorite_count": 774099,
+ "hashtags": [],
+ "symbols": [],
+ "user_mentions": [],
+ "urls": [],
+ "media": [
+ {
+ "media_url": "https://pbs.twimg.com/media/Fp1_H34WwAI3n1j.jpg",
+ "type": "photo"
+ }
+ ],
+ "url": "https://twitter.com/elonmusk/status/1629598417159692288",
+ "created_at": "2023-02-25T21:45:13.000Z",
+ "view_count": 96013117,
+ "quote_count": 6278,
+ "is_quote_tweet": false,
+ "is_retweet": false,
+ "is_pinned": false,
+ "is_truncated": false,
+ "startUrl": "https://twitter.com/elonmusk/with_replies"
+}]
\ No newline at end of file
diff --git a/docs/extras/integrations/chat_loaders/twitter.ipynb b/docs/extras/integrations/chat_loaders/twitter.ipynb
new file mode 100644
index 000000000..61dc650b8
--- /dev/null
+++ b/docs/extras/integrations/chat_loaders/twitter.ipynb
@@ -0,0 +1,77 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "d86853d2",
+ "metadata": {},
+ "source": [
+ "# Twitter (via Apify)\n",
+ "\n",
+ "This notebook shows how to load chat messages from Twitter to finetune on. We do this by utilizing Apify. \n",
+ "\n",
+ "First, use Apify to export tweets. An example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e5034b4e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "from langchain.schema import AIMessage\n",
+ "from langchain.adapters.openai import convert_message_to_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8bf0fb93",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('example_data/dataset_twitter-scraper_2023-08-23_22-13-19-740.json') as f:\n",
+ " data = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "468124fa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Filter out tweets that reference other tweets, because it's a bit weird\n",
+ "tweets = [d[\"full_text\"] for d in data if \"t.co\" not in d['full_text']]\n",
+ "# Create them as AI messages\n",
+ "messages = [AIMessage(content=t) for t in tweets]\n",
+ "# Add in a system message at the start\n",
+ "# TODO: we could try to extract the subject from the tweets, and put that in the system message.\n",
+ "system_message = {\"role\": \"system\", \"content\": \"write a tweet\"}\n",
+ "data = [[system_message, convert_message_to_dict(m)] for m in messages]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From 30151c99c72903320d7d865a9d5bf49237a4f267 Mon Sep 17 00:00:00 2001
From: Margaret Qian
Date: Thu, 24 Aug 2023 22:13:17 -0700
Subject: [PATCH 124/143] Update Mosaic endpoint input/output api (#7391)
As noted in prior PRs (https://github.com/hwchase17/langchain/pull/6060,
https://github.com/hwchase17/langchain/pull/7348), the input/output
format has changed a few times as we've stabilized our inference API.
This PR updates the API to the latest stable version as indicated in our
docs: https://docs.mosaicml.com/en/latest/inference.html
The input format looks like this:
`{"inputs": []}
`
The output format looks like this:
`
{"outputs": []}
`
---------
Co-authored-by: Bagatur
---
docs/extras/integrations/llms/mosaicml.ipynb | 2 +-
.../langchain/embeddings/mosaicml.py | 37 ++++++-------------
libs/langchain/langchain/llms/mosaicml.py | 32 +++++-----------
.../embeddings/test_mosaicml.py | 4 +-
.../integration_tests/llms/test_mosaicml.py | 20 ++++++----
5 files changed, 36 insertions(+), 59 deletions(-)
diff --git a/docs/extras/integrations/llms/mosaicml.ipynb b/docs/extras/integrations/llms/mosaicml.ipynb
index 596ee2d7b..cd9be156f 100644
--- a/docs/extras/integrations/llms/mosaicml.ipynb
+++ b/docs/extras/integrations/llms/mosaicml.ipynb
@@ -63,7 +63,7 @@
"metadata": {},
"outputs": [],
"source": [
- "llm = MosaicML(inject_instruction_format=True, model_kwargs={\"do_sample\": False})"
+ "llm = MosaicML(inject_instruction_format=True, model_kwargs={\"max_new_tokens\": 128})"
]
},
{
diff --git a/libs/langchain/langchain/embeddings/mosaicml.py b/libs/langchain/langchain/embeddings/mosaicml.py
index 8346bf7cf..6a3c3e11c 100644
--- a/libs/langchain/langchain/embeddings/mosaicml.py
+++ b/libs/langchain/langchain/embeddings/mosaicml.py
@@ -79,14 +79,8 @@ class MosaicMLInstructorEmbeddings(BaseModel, Embeddings):
raise ValueError(f"Error raised by inference endpoint: {e}")
try:
- parsed_response = response.json()
-
- if "error" in parsed_response:
- # if we get rate limited, try sleeping for 1 second
- if (
- not is_retry
- and "rate limit exceeded" in parsed_response["error"].lower()
- ):
+ if response.status_code == 429:
+ if not is_retry:
import time
time.sleep(self.retry_sleep)
@@ -94,16 +88,20 @@ class MosaicMLInstructorEmbeddings(BaseModel, Embeddings):
return self._embed(input, is_retry=True)
raise ValueError(
- f"Error raised by inference API: {parsed_response['error']}"
+ f"Error raised by inference API: rate limit exceeded.\nResponse: "
+ f"{response.text}"
)
+ parsed_response = response.json()
+
# The inference API has changed a couple of times, so we add some handling
# to be robust to multiple response formats.
if isinstance(parsed_response, dict):
- if "data" in parsed_response:
- output_item = parsed_response["data"]
- elif "output" in parsed_response:
- output_item = parsed_response["output"]
+ output_keys = ["data", "output", "outputs"]
+ for key in output_keys:
+ if key in parsed_response:
+ output_item = parsed_response[key]
+ break
else:
raise ValueError(
f"No key data or output in response: {parsed_response}"
@@ -113,19 +111,6 @@ class MosaicMLInstructorEmbeddings(BaseModel, Embeddings):
embeddings = output_item
else:
embeddings = [output_item]
- elif isinstance(parsed_response, list):
- first_item = parsed_response[0]
- if isinstance(first_item, list):
- embeddings = parsed_response
- elif isinstance(first_item, dict):
- if "output" in first_item:
- embeddings = [item["output"] for item in parsed_response]
- else:
- raise ValueError(
- f"No key data or output in response: {parsed_response}"
- )
- else:
- raise ValueError(f"Unexpected response format: {parsed_response}")
else:
raise ValueError(f"Unexpected response type: {parsed_response}")
diff --git a/libs/langchain/langchain/llms/mosaicml.py b/libs/langchain/langchain/llms/mosaicml.py
index 780e7a8b4..718466178 100644
--- a/libs/langchain/langchain/llms/mosaicml.py
+++ b/libs/langchain/langchain/llms/mosaicml.py
@@ -138,14 +138,8 @@ class MosaicML(LLM):
raise ValueError(f"Error raised by inference endpoint: {e}")
try:
- parsed_response = response.json()
-
- if "error" in parsed_response:
- # if we get rate limited, try sleeping for 1 second
- if (
- not is_retry
- and "rate limit exceeded" in parsed_response["error"].lower()
- ):
+ if response.status_code == 429:
+ if not is_retry:
import time
time.sleep(self.retry_sleep)
@@ -153,9 +147,12 @@ class MosaicML(LLM):
return self._call(prompt, stop, run_manager, is_retry=True)
raise ValueError(
- f"Error raised by inference API: {parsed_response['error']}"
+ f"Error raised by inference API: rate limit exceeded.\nResponse: "
+ f"{response.text}"
)
+ parsed_response = response.json()
+
# The inference API has changed a couple of times, so we add some handling
# to be robust to multiple response formats.
if isinstance(parsed_response, dict):
@@ -173,23 +170,12 @@ class MosaicML(LLM):
text = output_item[0]
else:
text = output_item
- elif isinstance(parsed_response, list):
- first_item = parsed_response[0]
- if isinstance(first_item, str):
- text = first_item
- elif isinstance(first_item, dict):
- if "output" in parsed_response:
- text = first_item["output"]
- else:
- raise ValueError(
- f"No key data or output in response: {parsed_response}"
- )
- else:
- raise ValueError(f"Unexpected response format: {parsed_response}")
else:
raise ValueError(f"Unexpected response type: {parsed_response}")
- text = text[len(prompt) :]
+ # Older versions of the API include the input in the output response
+ if text.startswith(prompt):
+ text = text[len(prompt) :]
except requests.exceptions.JSONDecodeError as e:
raise ValueError(
diff --git a/libs/langchain/tests/integration_tests/embeddings/test_mosaicml.py b/libs/langchain/tests/integration_tests/embeddings/test_mosaicml.py
index a04c6f2c1..ae0bec3dd 100644
--- a/libs/langchain/tests/integration_tests/embeddings/test_mosaicml.py
+++ b/libs/langchain/tests/integration_tests/embeddings/test_mosaicml.py
@@ -34,7 +34,9 @@ def test_mosaicml_embedding_endpoint() -> None:
"""Test MosaicML embeddings with a different endpoint"""
documents = ["foo bar"]
embedding = MosaicMLInstructorEmbeddings(
- endpoint_url="https://models.hosted-on.mosaicml.hosting/instructor-xl/v1/predict"
+ endpoint_url=(
+ "https://models.hosted-on.mosaicml.hosting/instructor-xl/v1/predict"
+ )
)
output = embedding.embed_documents(documents)
assert len(output) == 1
diff --git a/libs/langchain/tests/integration_tests/llms/test_mosaicml.py b/libs/langchain/tests/integration_tests/llms/test_mosaicml.py
index 2b532ab66..e15fce0fe 100644
--- a/libs/langchain/tests/integration_tests/llms/test_mosaicml.py
+++ b/libs/langchain/tests/integration_tests/llms/test_mosaicml.py
@@ -1,4 +1,6 @@
"""Test MosaicML API wrapper."""
+import re
+
import pytest
from langchain.llms.mosaicml import PROMPT_FOR_GENERATION_FORMAT, MosaicML
@@ -13,7 +15,7 @@ def test_mosaicml_llm_call() -> None:
def test_mosaicml_endpoint_change() -> None:
"""Test valid call to MosaicML."""
- new_url = "https://models.hosted-on.mosaicml.hosting/dolly-12b/v1/predict"
+ new_url = "https://models.hosted-on.mosaicml.hosting/mpt-30b-instruct/v1/predict"
llm = MosaicML(endpoint_url=new_url)
assert llm.endpoint_url == new_url
output = llm("Say foo:")
@@ -34,7 +36,7 @@ def test_mosaicml_extra_kwargs() -> None:
def test_instruct_prompt() -> None:
"""Test instruct prompt."""
- llm = MosaicML(inject_instruction_format=True, model_kwargs={"do_sample": False})
+ llm = MosaicML(inject_instruction_format=True, model_kwargs={"max_new_tokens": 10})
instruction = "Repeat the word foo"
prompt = llm._transform_prompt(instruction)
expected_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction)
@@ -45,7 +47,7 @@ def test_instruct_prompt() -> None:
def test_retry_logic() -> None:
"""Tests that two queries (which would usually exceed the rate limit) works"""
- llm = MosaicML(inject_instruction_format=True, model_kwargs={"do_sample": False})
+ llm = MosaicML(inject_instruction_format=True, model_kwargs={"max_new_tokens": 10})
instruction = "Repeat the word foo"
prompt = llm._transform_prompt(instruction)
expected_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction)
@@ -70,9 +72,11 @@ def test_short_retry_does_not_loop() -> None:
with pytest.raises(
ValueError,
- match="Error raised by inference API: Rate limit exceeded: 1 per 1 second",
+ match=re.escape(
+ "Error raised by inference API: rate limit exceeded.\nResponse: You have "
+ "reached maximum request limit.\n"
+ ),
):
- output = llm(prompt)
- assert isinstance(output, str)
- output = llm(prompt)
- assert isinstance(output, str)
+ for _ in range(10):
+ output = llm(prompt)
+ assert isinstance(output, str)
From d04fe0d3ea55fb8cac418050da1bf3e205a2ac18 Mon Sep 17 00:00:00 2001
From: Jurik-001 <123458520+Jurik-001@users.noreply.github.com>
Date: Fri, 25 Aug 2023 07:18:55 +0200
Subject: [PATCH 125/143] =?UTF-8?q?remove=20Value=20error=20"pyspark=20is?=
=?UTF-8?q?=20not=20installed.=20Please=20install=20it=20with=20`pip=20i?=
=?UTF-8?q?=E2=80=A6=20(#9723)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Description: You cannot execute spark_sql with pyspark versions prior to 3.4
due to the introduction of pyspark.errors in version 3.4.
If you are below 3.4, you get "pyspark is not installed. Please
install it with `pip install pyspark`", which is not helpful. Also, if you
do not have pyspark installed, you already get the error in init. I would
return all errors, but if you have a different idea, feel free to
comment.
Issue: None
Dependencies: None
Maintainer:
---------
Co-authored-by: Bagatur
---
libs/langchain/langchain/utilities/spark_sql.py | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/libs/langchain/langchain/utilities/spark_sql.py b/libs/langchain/langchain/utilities/spark_sql.py
index ffecbe511..20c1e8e5b 100644
--- a/libs/langchain/langchain/utilities/spark_sql.py
+++ b/libs/langchain/langchain/utilities/spark_sql.py
@@ -179,14 +179,8 @@ class SparkSQL:
If the statement throws an error, the error message is returned.
"""
- try:
- from pyspark.errors import PySparkException
- except ImportError:
- raise ValueError(
- "pyspark is not installed. Please install it with `pip install pyspark`"
- )
try:
return self.run(command, fetch)
- except PySparkException as e:
+ except Exception as e:
"""Format the error message"""
return f"Error: {e}"
From 135cb862152068a8efb8dc61af740012b1be8940 Mon Sep 17 00:00:00 2001
From: Sergey Kozlov
Date: Fri, 25 Aug 2023 14:47:17 +0600
Subject: [PATCH 126/143] Fix QuestionListOutputParser (#9738)
This PR fixes `QuestionListOutputParser` text splitting.
`QuestionListOutputParser` incorrectly splits numbered list text into
lines. If the text doesn't end with `\n`, the regex doesn't capture the
last item. In that case it returns `n - 1` items, and
`WebResearchRetriever.llm_chain` generates fewer queries than requested
in the search prompt.
How to reproduce:
```python
from langchain.retrievers.web_research import QuestionListOutputParser
parser = QuestionListOutputParser()
good = parser.parse(
"""1. This is line one.
2. This is line two.
""" # <-- !
)
bad = parser.parse(
"""1. This is line one.
2. This is line two.""" # <-- No new line.
)
assert good.lines == ['1. This is line one.\n', '2. This is line two.\n'], good.lines
assert bad.lines == ['1. This is line one.\n', '2. This is line two.'], bad.lines
```
NOTE: The last item will not contain a line break, but this seems OK
because the items are stripped in
`WebResearchRetriever.clean_search_query()`.
---
.../langchain/retrievers/web_research.py | 2 +-
.../retrievers/test_web_research.py | 36 +++++++++++++++++++
2 files changed, 37 insertions(+), 1 deletion(-)
create mode 100644 libs/langchain/tests/unit_tests/retrievers/test_web_research.py
diff --git a/libs/langchain/langchain/retrievers/web_research.py b/libs/langchain/langchain/retrievers/web_research.py
index f51dbe8b8..30d9c04cb 100644
--- a/libs/langchain/langchain/retrievers/web_research.py
+++ b/libs/langchain/langchain/retrievers/web_research.py
@@ -61,7 +61,7 @@ class QuestionListOutputParser(PydanticOutputParser):
super().__init__(pydantic_object=LineList)
def parse(self, text: str) -> LineList:
- lines = re.findall(r"\d+\..*?\n", text)
+ lines = re.findall(r"\d+\..*?(?:\n|$)", text)
return LineList(lines=lines)
diff --git a/libs/langchain/tests/unit_tests/retrievers/test_web_research.py b/libs/langchain/tests/unit_tests/retrievers/test_web_research.py
new file mode 100644
index 000000000..a052e59b7
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/retrievers/test_web_research.py
@@ -0,0 +1,36 @@
+from typing import List
+
+import pytest
+
+from langchain.retrievers.web_research import QuestionListOutputParser
+
+
+@pytest.mark.parametrize(
+ "text,expected",
+ (
+ (
+ "1. Line one.\n",
+ ["1. Line one.\n"],
+ ),
+ (
+ "1. Line one.",
+ ["1. Line one."],
+ ),
+ (
+ "1. Line one.\n2. Line two.\n",
+ ["1. Line one.\n", "2. Line two.\n"],
+ ),
+ (
+ "1. Line one.\n2. Line two.",
+ ["1. Line one.\n", "2. Line two."],
+ ),
+ (
+ "1. Line one.\n2. Line two.\n3. Line three.",
+ ["1. Line one.\n", "2. Line two.\n", "3. Line three."],
+ ),
+ ),
+)
+def test_list_output_parser(text: str, expected: List[str]) -> None:
+ parser = QuestionListOutputParser()
+ result = parser.parse(text)
+ assert result.lines == expected
From cacaf487c38785520b40c9efc53a556ddb6dc3d0 Mon Sep 17 00:00:00 2001
From: Fabrizio Ruocco
Date: Fri, 25 Aug 2023 11:34:09 +0200
Subject: [PATCH 127/143] Azure Cognitive Search - update sdk b8, mod user
agent, search with scores (#9191)
Description: Update the Azure Cognitive Search SDK to version 11.4.0b8
(breaking change).
Made the user agent customizable.
Implemented similarity search with scores.
@baskaryan
---------
Co-authored-by: Bagatur
---
.../vectorstores/azuresearch.ipynb | 71 ++++++++++++++-----
.../langchain/vectorstores/azuresearch.py | 66 +++++++++++++----
libs/langchain/poetry.lock | 8 +--
libs/langchain/pyproject.toml | 2 +-
4 files changed, 111 insertions(+), 36 deletions(-)
diff --git a/docs/extras/integrations/vectorstores/azuresearch.ipynb b/docs/extras/integrations/vectorstores/azuresearch.ipynb
index fe6462136..fc9bb75b5 100644
--- a/docs/extras/integrations/vectorstores/azuresearch.ipynb
+++ b/docs/extras/integrations/vectorstores/azuresearch.ipynb
@@ -6,7 +6,9 @@
"source": [
"# Azure Cognitive Search\n",
"\n",
- "[Azure Cognitive Search](https://learn.microsoft.com/azure/search/search-what-is-azure-search) (formerly known as `Azure Search`) is a cloud search service that gives developers infrastructure, APIs, and tools for building a rich search experience over private, heterogeneous content in web, mobile, and enterprise applications.\n"
+ "[Azure Cognitive Search](https://learn.microsoft.com/azure/search/search-what-is-azure-search) (formerly known as `Azure Search`) is a cloud search service that gives developers infrastructure, APIs, and tools for building a rich search experience over private, heterogeneous content in web, mobile, and enterprise applications.\n",
+ "\n",
+ "Vector search is currently in public preview. It's available through the Azure portal, preview REST API and beta client libraries. [More info](https://learn.microsoft.com/en-us/azure/search/vector-search-overview) Beta client libraries are subject to potential breaking changes, please be sure to use the SDK package version identified below. azure-search-documents==11.4.0b8"
]
},
{
@@ -22,7 +24,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!pip install azure-search-documents==11.4.0b6\n",
+ "!pip install azure-search-documents==11.4.0b8\n",
"!pip install azure-identity"
]
},
@@ -36,13 +38,13 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"import os\n",
- "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+ "from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores.azuresearch import AzureSearch"
]
},
@@ -57,7 +59,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -79,7 +81,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -98,7 +100,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -151,7 +153,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -178,6 +180,41 @@
"print(docs[0].page_content)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Perform a vector similarity search with relevance scores\n",
+ " \n",
+ "Execute a pure vector similarity search using the similarity_search_with_relevance_scores() method:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': 'C:\\\\repos\\\\langchain-fruocco-acs\\\\langchain\\\\docs\\\\extras\\\\modules\\\\state_of_the_union.txt'}),\n",
+ " 0.8441472),\n",
+ " (Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': 'C:\\\\repos\\\\langchain-fruocco-acs\\\\langchain\\\\docs\\\\extras\\\\modules\\\\state_of_the_union.txt'}),\n",
+ " 0.8441472),\n",
+ " (Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': 'C:\\\\repos\\\\langchain-fruocco-acs\\\\langchain\\\\docs\\\\extras\\\\modules\\\\state_of_the_union.txt'}),\n",
+ " 0.82153815),\n",
+ " (Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': 'C:\\\\repos\\\\langchain-fruocco-acs\\\\langchain\\\\docs\\\\extras\\\\modules\\\\state_of_the_union.txt'}),\n",
+ " 0.82153815)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "docs_and_scores = vector_store.similarity_search_with_relevance_scores(query=\"What did the president say about Ketanji Brown Jackson\", k=4, score_threshold=0.80)\n",
+ "from pprint import pprint\n",
+ "pprint(docs_and_scores)"
+ ]
+ },
{
"attachments": {},
"cell_type": "markdown",
@@ -190,7 +227,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -219,7 +256,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -254,7 +291,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -328,7 +365,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -348,7 +385,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -371,7 +408,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
@@ -400,7 +437,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -494,7 +531,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
@@ -530,7 +567,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
diff --git a/libs/langchain/langchain/vectorstores/azuresearch.py b/libs/langchain/langchain/vectorstores/azuresearch.py
index 32b6d03f1..2fad466ad 100644
--- a/libs/langchain/langchain/vectorstores/azuresearch.py
+++ b/libs/langchain/langchain/vectorstores/azuresearch.py
@@ -73,6 +73,7 @@ def _get_search_client(
scoring_profiles: Optional[List[ScoringProfile]] = None,
default_scoring_profile: Optional[str] = None,
default_fields: Optional[List[SearchField]] = None,
+ user_agent: Optional[str] = "langchain",
) -> SearchClient:
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceNotFoundError
@@ -80,13 +81,13 @@ def _get_search_client(
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
+ HnswVectorSearchAlgorithmConfiguration,
PrioritizedFields,
SearchIndex,
SemanticConfiguration,
SemanticField,
SemanticSettings,
VectorSearch,
- VectorSearchAlgorithmConfiguration,
)
default_fields = default_fields or []
@@ -95,7 +96,7 @@ def _get_search_client(
else:
credential = AzureKeyCredential(key)
index_client: SearchIndexClient = SearchIndexClient(
- endpoint=endpoint, credential=credential, user_agent="langchain"
+ endpoint=endpoint, credential=credential, user_agent=user_agent
)
try:
index_client.get_index(name=index_name)
@@ -130,10 +131,10 @@ def _get_search_client(
if vector_search is None:
vector_search = VectorSearch(
algorithm_configurations=[
- VectorSearchAlgorithmConfiguration(
+ HnswVectorSearchAlgorithmConfiguration(
name="default",
kind="hnsw",
- hnsw_parameters={ # type: ignore
+ parameters={ # type: ignore
"m": 4,
"efConstruction": 400,
"efSearch": 500,
@@ -171,7 +172,7 @@ def _get_search_client(
endpoint=endpoint,
index_name=index_name,
credential=credential,
- user_agent="langchain",
+ user_agent=user_agent,
)
@@ -227,6 +228,9 @@ class AzureSearch(VectorStore):
type=SearchFieldDataType.String,
),
]
+ user_agent = "langchain"
+ if "user_agent" in kwargs and kwargs["user_agent"]:
+ user_agent += " " + kwargs["user_agent"]
self.client = _get_search_client(
azure_search_endpoint,
azure_search_key,
@@ -238,6 +242,7 @@ class AzureSearch(VectorStore):
scoring_profiles=scoring_profiles,
default_scoring_profile=default_scoring_profile,
default_fields=default_fields,
+ user_agent=user_agent,
)
self.search_type = search_type
self.semantic_configuration_name = semantic_configuration_name
@@ -321,6 +326,17 @@ class AzureSearch(VectorStore):
raise ValueError(f"search_type of {search_type} not allowed.")
return docs
+ def similarity_search_with_relevance_scores(
+ self, query: str, k: int = 4, **kwargs: Any
+ ) -> List[Tuple[Document, float]]:
+ score_threshold = kwargs.pop("score_threshold", None)
+ result = self.vector_search_with_score(query, k=k, **kwargs)
+ return (
+ result
+ if score_threshold is None
+ else [r for r in result if r[1] >= score_threshold]
+ )
+
def vector_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
"""
Returns the most similar indexed documents to the query text.
@@ -349,12 +365,19 @@ class AzureSearch(VectorStore):
Returns:
List of Documents most similar to the query and score for each
"""
+ from azure.search.documents.models import Vector
results = self.client.search(
search_text="",
- vector=np.array(self.embedding_function(query), dtype=np.float32).tolist(),
- top_k=k,
- vector_fields=FIELDS_CONTENT_VECTOR,
+ vectors=[
+ Vector(
+ value=np.array(
+ self.embedding_function(query), dtype=np.float32
+ ).tolist(),
+ k=k,
+ fields=FIELDS_CONTENT_VECTOR,
+ )
+ ],
select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA],
filter=filters,
)
@@ -399,12 +422,19 @@ class AzureSearch(VectorStore):
Returns:
List of Documents most similar to the query and score for each
"""
+ from azure.search.documents.models import Vector
results = self.client.search(
search_text=query,
- vector=np.array(self.embedding_function(query), dtype=np.float32).tolist(),
- top_k=k,
- vector_fields=FIELDS_CONTENT_VECTOR,
+ vectors=[
+ Vector(
+ value=np.array(
+ self.embedding_function(query), dtype=np.float32
+ ).tolist(),
+ k=k,
+ fields=FIELDS_CONTENT_VECTOR,
+ )
+ ],
select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA],
filter=filters,
top=k,
@@ -452,11 +482,19 @@ class AzureSearch(VectorStore):
Returns:
List of Documents most similar to the query and score for each
"""
+ from azure.search.documents.models import Vector
+
results = self.client.search(
search_text=query,
- vector=np.array(self.embedding_function(query), dtype=np.float32).tolist(),
- top_k=50, # Hardcoded value to maximize L2 retrieval
- vector_fields=FIELDS_CONTENT_VECTOR,
+ vectors=[
+ Vector(
+ value=np.array(
+ self.embedding_function(query), dtype=np.float32
+ ).tolist(),
+ k=50,
+ fields=FIELDS_CONTENT_VECTOR,
+ )
+ ],
select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA],
filter=filters,
query_type="semantic",
diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock
index badf9ec89..e21acf426 100644
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
@@ -719,13 +719,13 @@ msal-extensions = ">=0.3.0,<2.0.0"
[[package]]
name = "azure-search-documents"
-version = "11.4.0b6"
+version = "11.4.0b8"
description = "Microsoft Azure Cognitive Search Client Library for Python"
optional = true
python-versions = ">=3.7"
files = [
- {file = "azure-search-documents-11.4.0b6.zip", hash = "sha256:c9ebd7d99d3c7b879f48acad66141e1f50eae4468cfb8389a4b25d4c620e8df1"},
- {file = "azure_search_documents-11.4.0b6-py3-none-any.whl", hash = "sha256:24ff85bf2680c36b38d8092bcbbe2d90699aac7c4a228b0839c0ce595a41628c"},
+ {file = "azure-search-documents-11.4.0b8.zip", hash = "sha256:b178ff52918590191a9cb7f411a9ab3cb517663666a501a3e84b715d19b0d93b"},
+ {file = "azure_search_documents-11.4.0b8-py3-none-any.whl", hash = "sha256:4137daa2db75bff9484d394c16c0604822a51281cad2f50e11d7c48dd8d4b4cf"},
]
[package.dependencies]
@@ -10447,4 +10447,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
-content-hash = "88e479307b19d991105360780f67ed3258ef1a0151f70b9e91c86c8153751e83"
+content-hash = "43a6bd42efc0baf917418087f788aaf3b1bc793cb4aa81de99c52ed6a7d54d26"
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index bc626b415..bab5c36c1 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -105,7 +105,7 @@ nebula3-python = {version = "^3.4.0", optional = true}
mwparserfromhell = {version = "^0.6.4", optional = true}
mwxml = {version = "^0.3.3", optional = true}
awadb = {version = "^0.3.9", optional = true}
-azure-search-documents = {version = "11.4.0b6", optional = true}
+azure-search-documents = {version = "11.4.0b8", optional = true}
esprima = {version = "^4.0.1", optional = true}
streamlit = {version = "^1.18.0", optional = true, python = ">=3.8.1,<3.9.7 || >3.9.7,<4.0"}
psychicapi = {version = "^0.8.0", optional = true}
From 9731ce5a406d5a7bb1878a54b265a6f7c728effc Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Fri, 25 Aug 2023 03:05:04 -0700
Subject: [PATCH 128/143] bump 273 (#9751)
---
libs/experimental/pyproject.toml | 2 +-
libs/langchain/pyproject.toml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/libs/experimental/pyproject.toml b/libs/experimental/pyproject.toml
index 461aceedf..a90522199 100644
--- a/libs/experimental/pyproject.toml
+++ b/libs/experimental/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-experimental"
-version = "0.0.10"
+version = "0.0.11"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index bab5c36c1..63597a4be 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain"
-version = "0.0.272"
+version = "0.0.273"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
From 709a67d9bfcff475356924d8461140052dd418f7 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Fri, 25 Aug 2023 07:07:27 -0700
Subject: [PATCH 129/143] multivector notebook (#9740)
---
.../retrievers/multi_vector.ipynb | 257 ++++++++++++++++--
1 file changed, 235 insertions(+), 22 deletions(-)
diff --git a/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb b/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
index 4a7587562..9197b5168 100644
--- a/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
+++ b/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
@@ -13,7 +13,10 @@
"\n",
"- smaller chunks: split a document into smaller chunks, and embed those (this is ParentDocumentRetriever)\n",
"- summary: create a summary for each document, embed that along with (or instead of) the document\n",
- "- hypothetical questions: create hypothetical questions that each document would be appropriate to answer, embed those along with (or instead of) the document"
+ "- hypothetical questions: create hypothetical questions that each document would be appropriate to answer, embed those along with (or instead of) the document\n",
+ "\n",
+ "\n",
+ "Note that this also enables another method of adding embeddings - manually. This is great because you can explicitly add questions or queries that should lead to a document being retrieved, giving you more control."
]
},
{
@@ -106,7 +109,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "5d23247d",
"metadata": {},
"outputs": [],
@@ -122,7 +125,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "92ed5861",
"metadata": {},
"outputs": [],
@@ -133,17 +136,17 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"id": "8afed60c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': 'b4ca7817-e3fe-4103-ac81-574fb41439ef', 'source': '../../state_of_the_union.txt'})"
+ "Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': '10e9cbc0-4ba5-4d79-a09b-c033d1ba7b01', 'source': '../../state_of_the_union.txt'})"
]
},
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -155,7 +158,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"id": "3c9017f1",
"metadata": {},
"outputs": [
@@ -165,7 +168,7 @@
"9874"
]
},
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -187,7 +190,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 10,
"id": "1433dff4",
"metadata": {},
"outputs": [],
@@ -201,7 +204,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 11,
"id": "35b30390",
"metadata": {},
"outputs": [],
@@ -216,17 +219,17 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 12,
"id": "41a2a738",
"metadata": {},
"outputs": [],
"source": [
- "summaries = [chain.invoke(d) for d in docs]"
+ "summaries = chain.batch(docs, {\"max_concurrency\": 5})"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 13,
"id": "7ac5e4b1",
"metadata": {},
"outputs": [],
@@ -250,7 +253,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 15,
"id": "0d93309f",
"metadata": {},
"outputs": [],
@@ -260,7 +263,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 16,
"id": "6d5edf0d",
"metadata": {},
"outputs": [],
@@ -271,7 +274,20 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 17,
+ "id": "862ae920",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # We can also add the original chunks to the vectorstore if we so want\n",
+ "# for i, doc in enumerate(docs):\n",
+ "# doc.metadata[id_key] = doc_ids[i]\n",
+ "# retriever.vectorstore.add_documents(docs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
"id": "299232d6",
"metadata": {},
"outputs": [],
@@ -281,17 +297,17 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 19,
"id": "10e404c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Document(page_content='The document discusses various topics and proposals put forth by the President in a State of the Union address. These include the nomination of a judge for the Supreme Court, securing the border and fixing the immigration system, advancing liberty and justice for women and LGBTQ+ individuals, passing bipartisan legislation, addressing the opioid epidemic and mental health issues, supporting veterans, and ending cancer. The President expresses optimism about the future of the country and emphasizes the strength of the American people.', metadata={'doc_id': '8c7a707d-615d-42d5-919d-bc5178dd1ae4'})"
+ "Document(page_content=\"The document is a transcript of a speech given by the President of the United States. The President discusses several important issues and initiatives, including the nomination of a Supreme Court Justice, border security and immigration reform, protecting women's rights, advancing LGBTQ+ equality, bipartisan legislation, addressing the opioid epidemic and mental health, supporting veterans, investigating the health effects of burn pits on military personnel, ending cancer, and the strength and resilience of the American people.\", metadata={'doc_id': '79fa2e9f-28d9-4372-8af3-2caf4f1de312'})"
]
},
- "execution_count": 20,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -302,7 +318,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 20,
"id": "e4cce5c2",
"metadata": {},
"outputs": [],
@@ -312,7 +328,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 21,
"id": "c8570dbb",
"metadata": {},
"outputs": [
@@ -322,7 +338,7 @@
"9194"
]
},
- "execution_count": 24,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -340,6 +356,203 @@
"\n",
"An LLM can also be used to generate a list of hypothetical questions that could be asked of a particular document. These questions can then be embedded"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "5219b085",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "functions = [\n",
+ " {\n",
+ " \"name\": \"hypothetical_questions\",\n",
+ " \"description\": \"Generate hypothetical questions\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"questions\": {\n",
+ " \"type\": \"array\",\n",
+ " \"items\": {\n",
+ " \"type\": \"string\"\n",
+ " },\n",
+ " },\n",
+ " },\n",
+ " \"required\": [\"questions\"]\n",
+ " }\n",
+ " }\n",
+ " ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "523deb92",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser\n",
+ "chain = (\n",
+ " {\"doc\": lambda x: x.page_content}\n",
+ " # Only asking for 3 hypothetical questions, but this could be adjusted\n",
+ " | ChatPromptTemplate.from_template(\"Generate a list of 3 hypothetical questions that the below document could be used to answer:\\n\\n{doc}\")\n",
+ " | ChatOpenAI(max_retries=0, model=\"gpt-4\").bind(functions=functions, function_call={\"name\": \"hypothetical_questions\"})\n",
+ " | JsonKeyOutputFunctionsParser(key_name=\"questions\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "11d30554",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[\"What was the author's initial impression of philosophy as a field of study, and how did it change when they got to college?\",\n",
+ " 'Why did the author decide to switch their focus to Artificial Intelligence (AI)?',\n",
+ " \"What led to the author's disillusionment with the field of AI as it was practiced at the time?\"]"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chain.invoke(docs[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "3eb2e48c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hypothetical_questions = chain.batch(docs, {\"max_concurrency\": 5})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "b2cd6e75",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The vectorstore to use to index the child chunks\n",
+ "vectorstore = Chroma(\n",
+ " collection_name=\"hypo-questions\",\n",
+ " embedding_function=OpenAIEmbeddings()\n",
+ ")\n",
+ "# The storage layer for the parent documents\n",
+ "store = InMemoryStore()\n",
+ "id_key = \"doc_id\"\n",
+ "# The retriever (empty to start)\n",
+ "retriever = MultiVectorRetriever(\n",
+ " vectorstore=vectorstore, \n",
+ " docstore=store, \n",
+ " id_key=id_key,\n",
+ ")\n",
+ "doc_ids = [str(uuid.uuid4()) for _ in docs]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "18831b3b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "question_docs = []\n",
+ "for i, question_list in enumerate(hypothetical_questions):\n",
+ " question_docs.extend([Document(page_content=s,metadata={id_key: doc_ids[i]}) for s in question_list])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "224b24c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "retriever.vectorstore.add_documents(question_docs)\n",
+ "retriever.docstore.mset(list(zip(doc_ids, docs)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "7b442b90",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sub_docs = vectorstore.similarity_search(\"justice breyer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "089b5ad0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '505d73e3-8350-46ec-a58e-3af032f04ab3'}),\n",
+ " Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '1c9618f0-7660-4b4f-a37c-509cbbbf6dba'}),\n",
+ " Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'}),\n",
+ " Document(page_content='What measures is the President proposing to protect the rights of LGBTQ+ Americans?', metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'})]"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sub_docs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "7594b24e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "retrieved_docs = retriever.get_relevant_documents(\"justice breyer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "4c120c65",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9194"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(retrieved_docs[0].page_content)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "616cfeeb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
From 985873c49715d3b03eeede245f25eec500470339 Mon Sep 17 00:00:00 2001
From: Lance Martin <122662504+rlancemartin@users.noreply.github.com>
Date: Fri, 25 Aug 2023 11:27:27 -0700
Subject: [PATCH 130/143] Update RAG use case (move to ntbk) (#9340)
---
.../how_to/local_retrieval_qa.ipynb | 7 +-
.../use_cases/question_answering/index.mdx | 342 ---------
.../question_answering.ipynb | 686 ++++++++++++++++++
3 files changed, 689 insertions(+), 346 deletions(-)
delete mode 100644 docs/extras/use_cases/question_answering/index.mdx
create mode 100644 docs/extras/use_cases/question_answering/question_answering.ipynb
diff --git a/docs/extras/use_cases/question_answering/how_to/local_retrieval_qa.ipynb b/docs/extras/use_cases/question_answering/how_to/local_retrieval_qa.ipynb
index 9eea135a6..d01b0bb3e 100644
--- a/docs/extras/use_cases/question_answering/how_to/local_retrieval_qa.ipynb
+++ b/docs/extras/use_cases/question_answering/how_to/local_retrieval_qa.ipynb
@@ -25,8 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
- "! pip install gpt4all\n",
- "! pip install chromadb"
+ "pip install gpt4all chromadb"
]
},
{
@@ -157,7 +156,7 @@
"metadata": {},
"outputs": [],
"source": [
- "! pip install llama-cpp-python"
+ "pip install llama-cpp-python"
]
},
{
@@ -736,7 +735,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.3"
+ "version": "3.9.16"
}
},
"nbformat": 4,
diff --git a/docs/extras/use_cases/question_answering/index.mdx b/docs/extras/use_cases/question_answering/index.mdx
deleted file mode 100644
index a9d98d264..000000000
--- a/docs/extras/use_cases/question_answering/index.mdx
+++ /dev/null
@@ -1,342 +0,0 @@
----
-sidebar_position: -1
----
-
-# QA over Documents
-
-## Use case
-Suppose you have some text documents (PDF, blog, Notion pages, etc.) and want to ask questions related to the contents of those documents. LLMs, given their proficiency in understanding text, are a great tool for this.
-
-In this walkthrough we'll go over how to build a question-answering over documents application using LLMs. Two very related use cases which we cover elsewhere are:
-- [QA over structured data](/docs/use_cases/tabular) (e.g., SQL)
-- [QA over code](/docs/use_cases/code) (e.g., Python)
-
-
-
-## Overview
-The pipeline for converting raw unstructured data into a QA chain looks like this:
-1. `Loading`: First we need to load our data. Unstructured data can be loaded from many sources. Use the [LangChain integration hub](https://integrations.langchain.com/) to browse the full set of loaders.
-Each loader returns data as a LangChain [`Document`](https://docs.langchain.com/docs/components/schema/document).
-2. `Splitting`: [Text splitters](/docs/modules/data_connection/document_transformers/) break `Documents` into splits of specified size
-3. `Storage`: Storage (e.g., often a [vectorstore](/docs/modules/data_connection/vectorstores/)) will house [and often embed](https://www.pinecone.io/learn/vector-embeddings/) the splits
-4. `Retrieval`: The app retrieves splits from storage (e.g., often [with similar embeddings](https://www.pinecone.io/learn/k-nearest-neighbor/) to the input question)
-5. `Generation`: An [LLM](/docs/modules/model_io/models/llms/) produces an answer using a prompt that includes the question and the retrieved data
-6. `Conversation` (Extension): Hold a multi-turn conversation by adding [Memory](/docs/modules/memory/) to your QA chain.
-
-
-
-## Quickstart
-To give you a sneak preview, the above pipeline can be all be wrapped in a single object: `VectorstoreIndexCreator`. Suppose we want a QA app over this [blog post](https://lilianweng.github.io/posts/2023-06-23-agent/). We can create this in a few lines of code:
-
-First set environment variables and install packages:
-```bash
-pip install openai chromadb
-export OPENAI_API_KEY="..."
-```
-
-Then run:
-```python
-from langchain.document_loaders import WebBaseLoader
-from langchain.indexes import VectorstoreIndexCreator
-
-loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
-index = VectorstoreIndexCreator().from_loaders([loader])
-```
-
-And now ask your questions:
-```python
-index.query("What is Task Decomposition?")
-```
-
- ' Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be done using LLM with simple prompting, task-specific instructions, or human inputs. Tree of Thoughts (Yao et al. 2023) is an example of a task decomposition technique that explores multiple reasoning possibilities at each step and generates multiple thoughts per step, creating a tree structure.'
-
-Ok, but what's going on under the hood, and how could we customize this for our specific use case? For that, let's take a look at how we can construct this pipeline piece by piece.
-
-## Step 1. Load
-
-Specify a `DocumentLoader` to load in your unstructured data as `Documents`. A `Document` is a piece of text (the `page_content`) and associated metadata.
-
-```python
-from langchain.document_loaders import WebBaseLoader
-
-loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
-data = loader.load()
-```
-
-### Go deeper
-- Browse the > 120 data loader integrations [here](https://integrations.langchain.com/).
-- See further documentation on loaders [here](/docs/modules/data_connection/document_loaders/).
-
-## Step 2. Split
-
-Split the `Document` into chunks for embedding and vector storage.
-
-```python
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-
-text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
-all_splits = text_splitter.split_documents(data)
-```
-
-### Go deeper
-
-- `DocumentSplitters` are just one type of the more generic `DocumentTransformers`, which can all be useful in this preprocessing step.
-- See further documentation on transformers [here](/docs/modules/data_connection/document_transformers/).
-- `Context-aware splitters` keep the location ("context") of each split in the original `Document`:
- - [Markdown files](/docs/use_cases/question_answering/document-context-aware-QA)
- - [Code (py or js)](/docs/modules/data_connection/document_loaders/integrations/source_code)
- - [Documents](/docs/modules/data_connection/document_loaders/integrations/grobid)
-
-## Step 3. Store
-
-To be able to look up our document splits, we first need to store them where we can later look them up.
-The most common way to do this is to embed the contents of each document then store the embedding and document in a vector store, with the embedding being used to index the document.
-
-```python
-from langchain.embeddings import OpenAIEmbeddings
-from langchain.vectorstores import Chroma
-
-vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
-```
-
-### Go deeper
-- Browse the > 40 vectorstores integrations [here](https://integrations.langchain.com/).
-- See further documentation on vectorstores [here](/docs/modules/data_connection/vectorstores/).
-- Browse the > 30 text embedding integrations [here](https://integrations.langchain.com/).
-- See further documentation on embedding models [here](/docs/modules/data_connection/text_embedding/).
-
- Here are Steps 1-3:
-
-
-
-## Step 4. Retrieve
-
-Retrieve relevant splits for any question using [similarity search](https://www.pinecone.io/learn/what-is-similarity-search/).
-
-```python
-question = "What are the approaches to Task Decomposition?"
-docs = vectorstore.similarity_search(question)
-len(docs)
-```
-
- 4
-
-### Go deeper
-
-Vectorstores are commonly used for retrieval, but they are not the only option. For example, SVMs (see thread [here](https://twitter.com/karpathy/status/1647025230546886658?s=20)) can also be used.
-
-LangChain [has many retrievers](/docs/modules/data_connection/retrievers/) including, but not limited to, vectorstores. All retrievers implement a common method `get_relevant_documents()` (and its asynchronous variant `aget_relevant_documents()`).
-
-```python
-from langchain.retrievers import SVMRetriever
-
-svm_retriever = SVMRetriever.from_documents(all_splits,OpenAIEmbeddings())
-docs_svm=svm_retriever.get_relevant_documents(question)
-len(docs_svm)
-```
-
- 4
-
-Some common ways to improve on vector similarity search include:
-- `MultiQueryRetriever` [generates variants of the input question](/docs/modules/data_connection/retrievers/MultiQueryRetriever) to improve retrieval.
-- `Max marginal relevance` selects for [relevance and diversity](https://www.cs.cmu.edu/~jgc/publication/The_Use_MMR_Diversity_Based_LTMIR_1998.pdf) among the retrieved documents.
-- Documents can be filtered during retrieval using [`metadata` filters](/docs/use_cases/question_answering/how_to/document-context-aware-QA).
-
-
-```python
-import logging
-
-from langchain.chat_models import ChatOpenAI
-from langchain.retrievers.multi_query import MultiQueryRetriever
-
-logging.basicConfig()
-logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)
-
-retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(),
- llm=ChatOpenAI(temperature=0))
-unique_docs = retriever_from_llm.get_relevant_documents(query=question)
-len(unique_docs)
-```
-
- INFO:langchain.retrievers.multi_query:Generated queries: ['1. How can Task Decomposition be approached?', '2. What are the different methods for Task Decomposition?', '3. What are the various approaches to decomposing tasks?']
- 5
-
-## Step 5. Generate
-
-Distill the retrieved documents into an answer using an LLM/Chat model (e.g., `gpt-3.5-turbo`) with `RetrievalQA` chain.
-
-```python
-from langchain.chains import RetrievalQA
-from langchain.chat_models import ChatOpenAI
-
-llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
-qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever())
-qa_chain({"query": question})
-```
-
- {
- 'query': 'What are the approaches to Task Decomposition?',
- 'result': 'The approaches to task decomposition include:\n\n1. Simple prompting: This approach involves using simple prompts or questions to guide the agent in breaking down a task into smaller subgoals. For example, the agent can be prompted with "Steps for XYZ" and asked to list the subgoals for achieving XYZ.\n\n2. Task-specific instructions: In this approach, task-specific instructions are provided to the agent to guide the decomposition process. For example, if the task is to write a novel, the agent can be instructed to "Write a story outline" as a subgoal.\n\n3. Human inputs: This approach involves incorporating human inputs in the task decomposition process. Humans can provide guidance, feedback, and suggestions to help the agent break down complex tasks into manageable subgoals.\n\nThese approaches aim to enable efficient handling of complex tasks by breaking them down into smaller, more manageable parts.'
- }
-
-Note, you can pass in an `LLM` or a `ChatModel` (like we did here) to the `RetrievalQA` chain.
-
-### Go deeper
-
-#### Choosing LLMs
-- Browse the > 55 LLM and chat model integrations [here](https://integrations.langchain.com/).
-- See further documentation on LLMs and chat models [here](/docs/modules/model_io/models/).
-- Use local LLMS: The popularity of [PrivateGPT](https://github.com/imartinez/privateGPT) and [GPT4All](https://github.com/nomic-ai/gpt4all) underscore the importance of running LLMs locally.
-Using `GPT4All` is as simple as [downloading the binary]((/docs/integrations/llms/gpt4all)) and then:
-
- from langchain.llms import GPT4All
- from langchain.chains import RetrievalQA
-
- llm = GPT4All(model="/Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin",max_tokens=2048)
- qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
-
-#### Customizing the prompt
-
-The prompt in `RetrievalQA` chain can be easily customized.
-
-```python
-from langchain.chains import RetrievalQA
-from langchain.prompts import PromptTemplate
-
-template = """Use the following pieces of context to answer the question at the end.
-If you don't know the answer, just say that you don't know, don't try to make up an answer.
-Use three sentences maximum and keep the answer as concise as possible.
-Always say "thanks for asking!" at the end of the answer.
-{context}
-Question: {question}
-Helpful Answer:"""
-QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
-
-llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
-qa_chain = RetrievalQA.from_chain_type(
- llm,
- retriever=vectorstore.as_retriever(),
- chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
-)
-result = qa_chain({"query": question})
-result["result"]
-```
-
- 'The approaches to Task Decomposition are (1) using simple prompting by LLM, (2) using task-specific instructions, and (3) with human inputs. Thanks for asking!'
-
-
-#### Return source documents
-
-The full set of retrieved documents used for answer distillation can be returned using `return_source_documents=True`.
-
-```python
-from langchain.chains import RetrievalQA
-
-qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever(),
- return_source_documents=True)
-result = qa_chain({"query": question})
-print(len(result['source_documents']))
-result['source_documents'][0]
-```
-
- 4
- Document(page_content='Task decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en'})
-
-
-
-#### Return citations
-
-Answer citations can be returned using `RetrievalQAWithSourcesChain`.
-
-
-```python
-from langchain.chains import RetrievalQAWithSourcesChain
-
-qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm,retriever=vectorstore.as_retriever())
-
-result = qa_chain({"question": question})
-result
-```
-
- {
- 'question': 'What are the approaches to Task Decomposition?',
- 'answer': 'The approaches to Task Decomposition include (1) using LLM with simple prompting, (2) using task-specific instructions, and (3) incorporating human inputs.\n',
- 'sources': 'https://lilianweng.github.io/posts/2023-06-23-agent/'
- }
-
-#### Customizing retrieved document processing
-
-Retrieved documents can be fed to an LLM for answer distillation in a few different ways.
-
-`stuff`, `refine`, `map-reduce`, and `map-rerank` chains for passing documents to an LLM prompt are well summarized [here](/docs/modules/chains/document/).
-
-`stuff` is commonly used because it simply "stuffs" all retrieved documents into the prompt.
-
-The [load_qa_chain](/docs/use_cases/question_answering/how_to/question_answering.html) is an easy way to pass documents to an LLM using these various approaches (e.g., see `chain_type`).
-
-
-```python
-from langchain.chains.question_answering import load_qa_chain
-
-chain = load_qa_chain(llm, chain_type="stuff")
-chain({"input_documents": unique_docs, "question": question},return_only_outputs=True)
-```
-
- {'output_text': 'The approaches to task decomposition include (1) using simple prompting to break down tasks into subgoals, (2) providing task-specific instructions to guide the decomposition process, and (3) incorporating human inputs for task decomposition.'}
-
-We can also pass the `chain_type` to `RetrievalQA`.
-
-
-```python
-qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever(),
- chain_type="stuff")
-result = qa_chain({"query": question})
-```
-
-In summary, the user can choose the desired level of abstraction for QA:
-
-
-
-## Step 6. Converse (Extension)
-
-To hold a conversation, a chain needs to be able to refer to past interactions. Chain `Memory` allows us to do this. To keep chat history, we can specify a Memory buffer to track the conversation inputs / outputs.
-
-```python
-from langchain.memory import ConversationBufferMemory
-
-memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-```
-
-The `ConversationalRetrievalChain` uses chat in the `Memory buffer`.
-
-```python
-from langchain.chains import ConversationalRetrievalChain
-
-retriever = vectorstore.as_retriever()
-chat = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
-```
-
-```python
-result = chat({"question": "What are some of the main ideas in self-reflection?"})
-result['answer']
-```
-
- "Some of the main ideas in self-reflection include:\n1. Iterative improvement: Self-reflection allows autonomous agents to improve by refining past action decisions and correcting mistakes.\n2. Trial and error: Self-reflection is crucial in real-world tasks where trial and error are inevitable.\n3. Two-shot examples: Self-reflection is created by showing pairs of failed trajectories and ideal reflections for guiding future changes in the plan.\n4. Working memory: Reflections are added to the agent's working memory, up to three, to be used as context for querying.\n5. Performance evaluation: Self-reflection involves continuously reviewing and analyzing actions, self-criticizing behavior, and reflecting on past decisions and strategies to refine approaches.\n6. Efficiency: Self-reflection encourages being smart and efficient, aiming to complete tasks in the least number of steps."
-
-The Memory buffer has context to resolve `"it"` ("self-reflection") in the below question.
-
-```python
-result = chat({"question": "How does the Reflexion paper handle it?"})
-result['answer']
-```
-
- "The Reflexion paper handles self-reflection by showing two-shot examples to the Learning Language Model (LLM). Each example consists of a failed trajectory and an ideal reflection that guides future changes in the agent's plan. These reflections are then added to the agent's working memory, up to a maximum of three, to be used as context for querying the LLM. This allows the agent to iteratively improve its reasoning skills by refining past action decisions and correcting previous mistakes."
-
-### Go deeper
-
-The [documentation](/docs/use_cases/question_answering/how_to/chat_vector_db) on `ConversationalRetrievalChain` offers a few extensions, such as streaming and source documents.
-
-
-## Further reading
-- Check out the [How to](/docs/use_cases/question_answer/how_to/) section for all the variations of chains that can be used for QA over docs in different settings.
-- Check out the [Integrations-specific](/docs/use_cases/question_answer/integrations/) section for chains that use specific integrations.
diff --git a/docs/extras/use_cases/question_answering/question_answering.ipynb b/docs/extras/use_cases/question_answering/question_answering.ipynb
new file mode 100644
index 000000000..035ea5e2b
--- /dev/null
+++ b/docs/extras/use_cases/question_answering/question_answering.ipynb
@@ -0,0 +1,686 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "5151afed",
+ "metadata": {},
+ "source": [
+ "# Question Answering\n",
+ "\n",
+ "[Open In Colab](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/question_answering/question_answering.ipynb)\n",
+ "\n",
+ "## Use case\n",
+ "Suppose you have some text documents (PDF, blog, Notion pages, etc.) and want to ask questions related to the contents of those documents. LLMs, given their proficiency in understanding text, are a great tool for this.\n",
+ "\n",
+ "In this walkthrough we'll go over how to build a question-answering over documents application using LLMs. Two very related use cases which we cover elsewhere are:\n",
+ "- [QA over structured data](/docs/use_cases/sql) (e.g., SQL)\n",
+ "- [QA over code](/docs/use_cases/code) (e.g., Python)\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Overview\n",
+ "The pipeline for converting raw unstructured data into a QA chain looks like this:\n",
+ "1. `Loading`: First we need to load our data. Unstructured data can be loaded from many sources. Use the [LangChain integration hub](https://integrations.langchain.com/) to browse the full set of loaders.\n",
+ "Each loader returns data as a LangChain [`Document`](/docs/components/schema/document).\n",
+ "2. `Splitting`: [Text splitters](/docs/modules/data_connection/document_transformers/) break `Documents` into splits of specified size\n",
+ "3. `Storage`: Storage (e.g., often a [vectorstore](/docs/modules/data_connection/vectorstores/)) will house [and often embed](https://www.pinecone.io/learn/vector-embeddings/) the splits\n",
+ "4. `Retrieval`: The app retrieves splits from storage (e.g., often [with similar embeddings](https://www.pinecone.io/learn/k-nearest-neighbor/) to the input question)\n",
+ "5. `Generation`: An [LLM](/docs/modules/model_io/models/llms/) produces an answer using a prompt that includes the question and the retrieved data\n",
+ "6. `Conversation` (Extension): Hold a multi-turn conversation by adding [Memory](/docs/modules/memory/) to your QA chain.\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Quickstart\n",
+ "\n",
+ "To give you a sneak preview, the above pipeline can all be wrapped in a single object: `VectorstoreIndexCreator`. Suppose we want a QA app over this [blog post](https://lilianweng.github.io/posts/2023-06-23-agent/). We can create this in a few lines of code. First set environment variables and install packages:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e14b744b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pip install openai chromadb\n",
+ "\n",
+ "# Set env var OPENAI_API_KEY or load from a .env file\n",
+ "# import dotenv\n",
+ "\n",
+ "# dotenv.load_dotenv()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "046cefc0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders import WebBaseLoader\n",
+ "from langchain.indexes import VectorstoreIndexCreator\n",
+ "\n",
+ "loader = WebBaseLoader(\"https://lilianweng.github.io/posts/2023-06-23-agent/\")\n",
+ "index = VectorstoreIndexCreator().from_loaders([loader])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f4bf8740",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "' Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be done using LLM with simple prompting, task-specific instructions, or with human inputs. Tree of Thoughts (Yao et al. 2023) is an extension of Chain of Thought (Wei et al. 2022) which explores multiple reasoning possibilities at each step.'"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "index.query(\"What is Task Decomposition?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8224aad6",
+ "metadata": {},
+ "source": [
+ "Ok, but what's going on under the hood, and how could we customize this for our specific use case? For that, let's take a look at how we can construct this pipeline piece by piece."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ba5daed6",
+ "metadata": {},
+ "source": [
+ "## Step 1. Load\n",
+ "\n",
+ "Specify a `DocumentLoader` to load in your unstructured data as `Documents`. A `Document` is a piece of text (the `page_content`) and associated metadata."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "cf4d5c72",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders import WebBaseLoader\n",
+ "\n",
+ "loader = WebBaseLoader(\"https://lilianweng.github.io/posts/2023-06-23-agent/\")\n",
+ "data = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd2cc9a7",
+ "metadata": {},
+ "source": [
+ "### Go deeper\n",
+ "- Browse the > 120 data loader integrations [here](https://integrations.langchain.com/).\n",
+ "- See further documentation on loaders [here](/docs/modules/data_connection/document_loaders/).\n",
+ "\n",
+ "## Step 2. Split\n",
+ "\n",
+ "Split the `Document` into chunks for embedding and vector storage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "4b11c01d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+ "\n",
+ "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)\n",
+ "all_splits = text_splitter.split_documents(data)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a33bd4d",
+ "metadata": {},
+ "source": [
+ "### Go deeper\n",
+ "\n",
+ "- `DocumentSplitters` are just one type of the more generic `DocumentTransformers`, which can all be useful in this preprocessing step.\n",
+ "- See further documentation on transformers [here](/docs/modules/data_connection/document_transformers/).\n",
+ "- `Context-aware splitters` keep the location (\"context\") of each split in the original `Document`:\n",
+ " - [Markdown files](/docs/use_cases/question_answering/how_to/document-context-aware-QA)\n",
+ " - [Code (py or js)](/docs/integrations/document_loaders/source_code)\n",
+ " - [Documents](/docs/integrations/document_loaders/grobid)\n",
+ "\n",
+ "## Step 3. Store\n",
+ "\n",
+ "To be able to look up our document splits, we first need to store them where we can later look them up.\n",
+ "The most common way to do this is to embed the contents of each document then store the embedding and document in a vector store, with the embedding being used to index the document."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e9c302c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.embeddings import OpenAIEmbeddings\n",
+ "from langchain.vectorstores import Chroma\n",
+ "\n",
+ "vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc6f22b0",
+ "metadata": {},
+ "source": [
+ "### Go deeper\n",
+ "- Browse the > 40 vectorstores integrations [here](https://integrations.langchain.com/).\n",
+ "- See further documentation on vectorstores [here](/docs/modules/data_connection/vectorstores/).\n",
+ "- Browse the > 30 text embedding integrations [here](https://integrations.langchain.com/).\n",
+ "- See further documentation on embedding models [here](/docs/modules/data_connection/text_embedding/).\n",
+ "\n",
+ " Here are Steps 1-3:\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Step 4. Retrieve\n",
+ "\n",
+ "Retrieve relevant splits for any question using [similarity search](https://www.pinecone.io/learn/what-is-similarity-search/)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "e2c26b7d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "question = \"What are the approaches to Task Decomposition?\"\n",
+ "docs = vectorstore.similarity_search(question)\n",
+ "len(docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5d5a113b",
+ "metadata": {},
+ "source": [
+ "### Go deeper\n",
+ "\n",
+ "Vectorstores are commonly used for retrieval, but they are not the only option. For example, SVMs (see thread [here](https://twitter.com/karpathy/status/1647025230546886658?s=20)) can also be used.\n",
+ "\n",
+ "LangChain [has many retrievers](/docs/modules/data_connection/retrievers/) including, but not limited to, vectorstores. All retrievers implement a common method `get_relevant_documents()` (and its asynchronous variant `aget_relevant_documents()`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "c901eaee",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from langchain.retrievers import SVMRetriever\n",
+ "\n",
+ "svm_retriever = SVMRetriever.from_documents(all_splits,OpenAIEmbeddings())\n",
+ "docs_svm=svm_retriever.get_relevant_documents(question)\n",
+ "len(docs_svm)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69de3d54",
+ "metadata": {},
+ "source": [
+ "Some common ways to improve on vector similarity search include:\n",
+ "- `MultiQueryRetriever` [generates variants of the input question](/docs/modules/data_connection/retrievers/MultiQueryRetriever) to improve retrieval.\n",
+ "- `Max marginal relevance` selects for [relevance and diversity](https://www.cs.cmu.edu/~jgc/publication/The_Use_MMR_Diversity_Based_LTMIR_1998.pdf) among the retrieved documents.\n",
+ "- Documents can be filtered during retrieval using [`metadata` filters](/docs/use_cases/question_answering/how_to/document-context-aware-QA)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "c690f01a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:langchain.retrievers.multi_query:Generated queries: ['1. How can Task Decomposition be approached?', '2. What are the different methods for Task Decomposition?', '3. What are the various approaches to decomposing tasks?']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "4"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import logging\n",
+ "\n",
+ "from langchain.chat_models import ChatOpenAI\n",
+ "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
+ "\n",
+ "logging.basicConfig()\n",
+ "logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)\n",
+ "\n",
+ "retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(),\n",
+ " llm=ChatOpenAI(temperature=0))\n",
+ "unique_docs = retriever_from_llm.get_relevant_documents(query=question)\n",
+ "len(unique_docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "415d6824",
+ "metadata": {},
+ "source": [
+ "## Step 5. Generate\n",
+ "\n",
+ "Distill the retrieved documents into an answer using an LLM/Chat model (e.g., `gpt-3.5-turbo`) with `RetrievalQA` chain.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "99fa1aec",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'query': 'What are the approaches to Task Decomposition?',\n",
+ " 'result': 'There are three approaches to task decomposition:\\n\\n1. Using Language Model with simple prompting: This approach involves using a Language Model (LLM) with simple prompts like \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to guide the task decomposition process.\\n\\n2. Using task-specific instructions: In this approach, task-specific instructions are provided to guide the task decomposition. For example, for the task of writing a novel, an instruction like \"Write a story outline\" can be given to help decompose the task into smaller subtasks.\\n\\n3. Human inputs: Task decomposition can also be done with the help of human inputs. This involves getting input and guidance from humans to break down a complex task into smaller, more manageable subtasks.'}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from langchain.chains import RetrievalQA\n",
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n",
+ "qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever())\n",
+ "qa_chain({\"query\": question})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7d52c84",
+ "metadata": {},
+ "source": [
+ "Note, you can pass in an `LLM` or a `ChatModel` (like we did here) to the `RetrievalQA` chain.\n",
+ "\n",
+ "### Go deeper\n",
+ "\n",
+ "#### Choosing LLMs\n",
+ "- Browse the > 55 LLM and chat model integrations [here](https://integrations.langchain.com/).\n",
+ "- See further documentation on LLMs and chat models [here](/docs/modules/model_io/models/).\n",
+ "- Use local LLMs: The popularity of [PrivateGPT](https://github.com/imartinez/privateGPT) and [GPT4All](https://github.com/nomic-ai/gpt4all) underscores the importance of running LLMs locally.\n",
+ "Using `GPT4All` is as simple as [downloading the binary](/docs/integrations/llms/gpt4all) and then:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "02d6c9dc",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found model file at /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "objc[61331]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x2e3384208) and /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x2e37b0208). One of the two will be used. Which one is undefined.\n",
+ "llama.cpp: using Metal\n",
+ "llama.cpp: loading model from /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
+ "llama_model_load_internal: format = ggjt v3 (latest)\n",
+ "llama_model_load_internal: n_vocab = 32001\n",
+ "llama_model_load_internal: n_ctx = 2048\n",
+ "llama_model_load_internal: n_embd = 5120\n",
+ "llama_model_load_internal: n_mult = 256\n",
+ "llama_model_load_internal: n_head = 40\n",
+ "llama_model_load_internal: n_layer = 40\n",
+ "llama_model_load_internal: n_rot = 128\n",
+ "llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
+ "llama_model_load_internal: n_ff = 13824\n",
+ "llama_model_load_internal: n_parts = 1\n",
+ "llama_model_load_internal: model size = 13B\n",
+ "llama_model_load_internal: ggml ctx size = 0.09 MB\n",
+ "llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
+ "llama_new_context_with_model: kv self size = 1600.00 MB\n",
+ "ggml_metal_init: allocating\n",
+ "ggml_metal_init: using MPS\n",
+ "ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/ggml-metal.metal'\n",
+ "ggml_metal_init: loaded kernel_add 0x2bbbbc2f0\n",
+ "ggml_metal_init: loaded kernel_mul 0x2bbbba840\n",
+ "ggml_metal_init: loaded kernel_mul_row 0x2bb917dd0\n",
+ "ggml_metal_init: loaded kernel_scale 0x2bb918150\n",
+ "ggml_metal_init: loaded kernel_silu 0x2bb9184d0\n",
+ "ggml_metal_init: loaded kernel_relu 0x2bb918850\n",
+ "ggml_metal_init: loaded kernel_gelu 0x2bbbc3f10\n",
+ "ggml_metal_init: loaded kernel_soft_max 0x2bbbc5840\n",
+ "ggml_metal_init: loaded kernel_diag_mask_inf 0x2bbbc4c70\n",
+ "ggml_metal_init: loaded kernel_get_rows_f16 0x2bbbc5fc0\n",
+ "ggml_metal_init: loaded kernel_get_rows_q4_0 0x2bbbc6720\n",
+ "ggml_metal_init: loaded kernel_get_rows_q4_1 0x2bb918c10\n",
+ "ggml_metal_init: loaded kernel_get_rows_q2_k 0x2bbbc51b0\n",
+ "ggml_metal_init: loaded kernel_get_rows_q3_k 0x2bbbc7630\n",
+ "ggml_metal_init: loaded kernel_get_rows_q4_k 0x2d4394e30\n",
+ "ggml_metal_init: loaded kernel_get_rows_q5_k 0x2bbbc7890\n",
+ "ggml_metal_init: loaded kernel_get_rows_q6_k 0x2d4395210\n",
+ "ggml_metal_init: loaded kernel_rms_norm 0x2bbbc8740\n",
+ "ggml_metal_init: loaded kernel_norm 0x2bbbc8b30\n",
+ "ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x2d4395470\n",
+ "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x2d4395a70\n",
+ "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x1242b1a00\n",
+ "ggml_metal_init: loaded kernel_mul_mat_q2_k_f32 0x29f17d1c0\n",
+ "ggml_metal_init: loaded kernel_mul_mat_q3_k_f32 0x2d4396050\n",
+ "ggml_metal_init: loaded kernel_mul_mat_q4_k_f32 0x2bbbc98a0\n",
+ "ggml_metal_init: loaded kernel_mul_mat_q5_k_f32 0x2bbbca4a0\n",
+ "ggml_metal_init: loaded kernel_mul_mat_q6_k_f32 0x2bbbcae90\n",
+ "ggml_metal_init: loaded kernel_rope 0x2bbbca700\n",
+ "ggml_metal_init: loaded kernel_alibi_f32 0x2bbbcc6e0\n",
+ "ggml_metal_init: loaded kernel_cpy_f32_f16 0x2bbbccf90\n",
+ "ggml_metal_init: loaded kernel_cpy_f32_f32 0x2bbbcd900\n",
+ "ggml_metal_init: loaded kernel_cpy_f16_f16 0x2bbbce1f0\n",
+ "ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
+ "ggml_metal_init: hasUnifiedMemory = true\n",
+ "ggml_metal_init: maxTransferRate = built-in GPU\n",
+ "ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, ( 6984.45 / 21845.34)\n",
+ "ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1024.00 MB, ( 8008.45 / 21845.34)\n",
+ "ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, ( 9610.45 / 21845.34)\n",
+ "ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 512.00 MB, (10122.45 / 21845.34)\n",
+ "ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (10634.45 / 21845.34)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain.llms import GPT4All\n",
+ "from langchain.chains import RetrievalQA\n",
+ "\n",
+ "llm = GPT4All(model=\"/Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\",max_tokens=2048)\n",
+ "qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa82f437",
+ "metadata": {},
+ "source": [
+ "#### Customizing the prompt\n",
+ "\n",
+ "The prompt in `RetrievalQA` chain can be easily customized."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "e4fee704",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "ggml_metal_free: deallocating\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'The approaches to task decomposition include using LLM with simple prompting, task-specific instructions, or human inputs. Thanks for asking!'"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from langchain.chains import RetrievalQA\n",
+ "from langchain.prompts import PromptTemplate\n",
+ "\n",
+ "template = \"\"\"Use the following pieces of context to answer the question at the end. \n",
+ "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n",
+ "Use three sentences maximum and keep the answer as concise as possible. \n",
+ "Always say \"thanks for asking!\" at the end of the answer. \n",
+ "{context}\n",
+ "Question: {question}\n",
+ "Helpful Answer:\"\"\"\n",
+ "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n",
+ "\n",
+ "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n",
+ "qa_chain = RetrievalQA.from_chain_type(\n",
+ " llm,\n",
+ " retriever=vectorstore.as_retriever(),\n",
+ " chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n",
+ ")\n",
+ "result = qa_chain({\"query\": question})\n",
+ "result[\"result\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff40e8db",
+ "metadata": {},
+ "source": [
+ "#### Return source documents\n",
+ "\n",
+ "The full set of retrieved documents used for answer distillation can be returned using `return_source_documents=True`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "60004293",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Document(page_content='Task decomposition can be done (1) by LLM with simple prompting like \"Steps for XYZ.\\\\n1.\", \"What are the subgoals for achieving XYZ?\", (2) by using task-specific instructions; e.g. \"Write a story outline.\" for writing a novel, or (3) with human inputs.', metadata={'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en', 'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': \"LLM Powered Autonomous Agents | Lil'Log\"})"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from langchain.chains import RetrievalQA\n",
+ "\n",
+ "qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever(),\n",
+ " return_source_documents=True)\n",
+ "result = qa_chain({\"query\": question})\n",
+ "print(len(result['source_documents']))\n",
+ "result['source_documents'][0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1b600236",
+ "metadata": {},
+ "source": [
+ "#### Return citations\n",
+ "\n",
+ "Answer citations can be returned using `RetrievalQAWithSourcesChain`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "948f6d19",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'question': 'What are the approaches to Task Decomposition?',\n",
+ " 'answer': 'The approaches to Task Decomposition include:\\n1. Using LLM with simple prompting, such as providing steps or subgoals for achieving a task.\\n2. Using task-specific instructions, such as providing a specific instruction like \"Write a story outline\" for writing a novel.\\n3. Using human inputs to decompose the task.\\nAnother approach is the Tree of Thoughts, which extends the Chain of Thought (CoT) technique by exploring multiple reasoning possibilities at each step and generating multiple thoughts per step, creating a tree structure. The search process can be BFS or DFS, and each state can be evaluated by a classifier or majority vote.\\nSources: https://lilianweng.github.io/posts/2023-06-23-agent/',\n",
+ " 'sources': ''}"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from langchain.chains import RetrievalQAWithSourcesChain\n",
+ "\n",
+ "qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm,retriever=vectorstore.as_retriever())\n",
+ "\n",
+ "result = qa_chain({\"question\": question})\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73d0b138",
+ "metadata": {},
+ "source": [
+ "#### Customizing retrieved document processing\n",
+ "\n",
+ "Retrieved documents can be fed to an LLM for answer distillation in a few different ways.\n",
+ "\n",
+ "`stuff`, `refine`, `map-reduce`, and `map-rerank` chains for passing documents to an LLM prompt are well summarized [here](/docs/modules/chains/document/).\n",
+ " \n",
+ "`stuff` is commonly used because it simply \"stuffs\" all retrieved documents into the prompt.\n",
+ "\n",
+ "The [load_qa_chain](/docs/use_cases/question_answering/how_to/question_answering.html) is an easy way to pass documents to an LLM using these various approaches (e.g., see `chain_type`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "29aa139f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'output_text': 'The approaches to task decomposition mentioned in the provided context are:\\n\\n1. Chain of thought (CoT): This approach involves instructing the language model to \"think step by step\" and decompose complex tasks into smaller and simpler steps. It enhances model performance on complex tasks by utilizing more test-time computation.\\n\\n2. Tree of Thoughts: This approach extends CoT by exploring multiple reasoning possibilities at each step. It decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS or DFS, and each state is evaluated by a classifier or majority vote.\\n\\n3. LLM with simple prompting: This approach involves using a language model with simple prompts like \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to perform task decomposition.\\n\\n4. Task-specific instructions: This approach involves providing task-specific instructions to guide the language model in decomposing the task. For example, providing the instruction \"Write a story outline\" for the task of writing a novel.\\n\\n5. Human inputs: Task decomposition can also be done with human inputs, where humans provide guidance and input to break down the task into smaller subtasks.'}"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from langchain.chains.question_answering import load_qa_chain\n",
+ "\n",
+ "chain = load_qa_chain(llm, chain_type=\"stuff\")\n",
+ "chain({\"input_documents\": unique_docs, \"question\": question},return_only_outputs=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8cb8cd1",
+ "metadata": {},
+ "source": [
+ "We can also pass the `chain_type` to `RetrievalQA`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "f68574bd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever(),\n",
+ " chain_type=\"stuff\")\n",
+ "result = qa_chain({\"query\": question})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b33aeb5f",
+ "metadata": {},
+ "source": [
+ "In summary, the user can choose the desired level of abstraction for QA:\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Step 6. Chat\n",
+ "\n",
+ "See our [use-case on chat](/docs/use_cases/chatbots) for detail on this!"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From 2ab04a4e32d857d221f012a96b40b5549bdeadf1 Mon Sep 17 00:00:00 2001
From: Lance Martin <122662504+rlancemartin@users.noreply.github.com>
Date: Fri, 25 Aug 2023 11:28:55 -0700
Subject: [PATCH 131/143] Update agent docs, move to use-case sub-directory
(#9344)
Re-structure and add new agent page
---
docs/docs_skeleton/static/img/ReAct.png | Bin 0 -> 42674 bytes
.../static/img/agents_use_case_1.png | Bin 0 -> 241979 bytes
.../static/img/agents_use_case_trace_1.png | Bin 0 -> 75923 bytes
.../static/img/agents_use_case_trace_2.png | Bin 0 -> 169721 bytes
.../static/img/agents_vs_chains.png | Bin 0 -> 42731 bytes
.../static/img/oai_function_agent.png | Bin 0 -> 181119 bytes
docs/extras/use_cases/agents/baby_agi.ipynb | 565 --------------
.../agents/baby_agi_with_agent.ipynb | 647 ----------------
.../camel_role_playing.ipynb | 0
.../agent_simulations/characters.ipynb | 0
.../agents}/agent_simulations/gymnasium.ipynb | 0
.../agents}/agent_simulations/index.mdx | 0
.../agent_simulations/multi_player_dnd.ipynb | 0
.../multiagent_authoritarian.ipynb | 0
.../multiagent_bidding.ipynb | 0
.../agent_simulations/petting_zoo.ipynb | 0
.../two_agent_debate_tools.ipynb | 0
.../agent_simulations/two_player_dnd.ipynb | 0
.../extras/use_cases/more/agents/agents.ipynb | 718 ++++++++++++++++++
.../agents}/agents/camel_role_playing.ipynb | 0
.../custom_agent_with_plugin_retrieval.ipynb | 0
...ith_plugin_retrieval_using_plugnplai.ipynb | 0
.../{ => more/agents}/agents/index.mdx | 0
.../agents/multi_modal_output_agent.ipynb | 298 ++++++++
.../agents/sales_agent_with_context.ipynb | 0
.../agents}/agents/wikibase_agent.ipynb | 0
.../agents}/autonomous_agents/autogpt.ipynb | 0
.../agents}/autonomous_agents/baby_agi.ipynb | 2 +-
.../baby_agi_with_agent.ipynb | 0
.../autonomous_agents/hugginggpt.ipynb | 0
.../agents}/autonomous_agents/index.mdx | 0
.../autonomous_agents/marathon_times.ipynb | 0
.../autonomous_agents/meta_prompt.ipynb | 0
.../agents}/multi_modal/_category_.yml | 0
.../_image_agent_files}/output_10_1.png | Bin
.../output_10_1.png | Bin
.../output_15_1.png | Bin
.../agents}/multi_modal/image_agent.ipynb | 2 +-
.../multi_modal_output_agent.ipynb | 0
39 files changed, 1018 insertions(+), 1214 deletions(-)
create mode 100644 docs/docs_skeleton/static/img/ReAct.png
create mode 100644 docs/docs_skeleton/static/img/agents_use_case_1.png
create mode 100644 docs/docs_skeleton/static/img/agents_use_case_trace_1.png
create mode 100644 docs/docs_skeleton/static/img/agents_use_case_trace_2.png
create mode 100644 docs/docs_skeleton/static/img/agents_vs_chains.png
create mode 100644 docs/docs_skeleton/static/img/oai_function_agent.png
delete mode 100644 docs/extras/use_cases/agents/baby_agi.ipynb
delete mode 100644 docs/extras/use_cases/agents/baby_agi_with_agent.ipynb
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/camel_role_playing.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/characters.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/gymnasium.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/index.mdx (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/multi_player_dnd.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/multiagent_authoritarian.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/multiagent_bidding.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/petting_zoo.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/two_agent_debate_tools.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agent_simulations/two_player_dnd.ipynb (100%)
create mode 100644 docs/extras/use_cases/more/agents/agents.ipynb
rename docs/extras/use_cases/{ => more/agents}/agents/camel_role_playing.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agents/custom_agent_with_plugin_retrieval.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agents/custom_agent_with_plugin_retrieval_using_plugnplai.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agents/index.mdx (100%)
create mode 100644 docs/extras/use_cases/more/agents/agents/multi_modal_output_agent.ipynb
rename docs/extras/use_cases/{ => more/agents}/agents/sales_agent_with_context.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/agents/wikibase_agent.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/autonomous_agents/autogpt.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/autonomous_agents/baby_agi.ipynb (99%)
rename docs/extras/use_cases/{ => more/agents}/autonomous_agents/baby_agi_with_agent.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/autonomous_agents/hugginggpt.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/autonomous_agents/index.mdx (100%)
rename docs/extras/use_cases/{ => more/agents}/autonomous_agents/marathon_times.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/autonomous_agents/meta_prompt.ipynb (100%)
rename docs/extras/use_cases/{ => more/agents}/multi_modal/_category_.yml (100%)
rename docs/extras/use_cases/{agents/_multi_modal_output_agent_files => more/agents/multi_modal/_image_agent_files}/output_10_1.png (100%)
rename docs/extras/use_cases/{multi_modal/_image_agent_files => more/agents/multi_modal/_multi_modal_output_agent_files}/output_10_1.png (100%)
rename docs/extras/use_cases/{agents => more/agents/multi_modal}/_multi_modal_output_agent_files/output_15_1.png (100%)
rename docs/extras/use_cases/{ => more/agents}/multi_modal/image_agent.ipynb (99%)
rename docs/extras/use_cases/{agents => more/agents/multi_modal}/multi_modal_output_agent.ipynb (100%)
diff --git a/docs/docs_skeleton/static/img/ReAct.png b/docs/docs_skeleton/static/img/ReAct.png
new file mode 100644
index 0000000000000000000000000000000000000000..f4e4260a8435bfd4ea771250f97e62ff7c9d17d2
GIT binary patch
literal 42674
zcmeFZWn7e7+dd3PBMPD*AR#GIN-C{{G>STOcMRP<#$^c$Ny(>jl9KdK9c)c4tW7X5WZ%T7;i*4wB~Q_c3=b!v_w$SH
zNu*c3BukKn6RwL*eVc$p{v9mU@Cn5Y?W$luVx1e$R{e%ag+mNCLU3-F=x_|j2(qQF
z)Na)r)K+Xy`F0&IbX4%}O=9}_B(X-!eoDZYeM6LCp9bsbWy@CVdoF?PB!L5w2x;z)
zD=;!-z~s2JHq~CWf)P=sc56=aZ1pT(T7kY};gVmIL=soa!os6!I*jTB?g(-WiRf+e
zC?Y0@%j`W7-!N|L(l^J;YjZX)$yaL`M3FSZ%5Pxs-wk8!#FqGVZ8rY(vErAsUZTQc
zb^2?UB+MSK#>ELf!?34nD(#WMD%AcMa~hyLKgHLN$M0U<@hCSbmr1%WXM%)}v4-IVidrd>_=4?jO1XH*V*8>mkG^C|=E8?9k lGxk4$at&LP4e3Ab|5=;GQ-_z7Kz-en^*=ucGYdB0ZxWV
zZ4{&7x91989&4n$m{D6_spVjefcN<(GtYkAx&QuC?knS9HWDrJw~?Rdk$%H$;;dqi
zO?Zp$gpl#;Gsb*x{Z;nrf%2Uz#4nbOUQ_mkKZxtOWaPkF^X
z!53YLlB=BsIR{
zL3D%Bp`I*wTLQb~^&ZdnJDaIxOuujPeDZZ$XlM)Cod0%r<81F3<0#ocl124<3$6&y
z(xuoYeM6G_?`}RWX(YB6AIA%OHH4wAV@JNue=ls~&nwnfSH4MpYvT_2AZ~F6#T6GJ
zj-eTee-{}PeyOEYJZzv~$d2ws#KK|o@f{;2x(-3Jf(5g)B)TGQKWI~T>@b!!oeQoo
zEV=QD(R;r)G>r)b0;|EduIA-6za8YC=2RwCxiUa{(l}Kg6=TLTRnb*3b`VmOFY!S7
zG!!@KO=Py}GgAm=6mcE?V-~dzSE?UQ_Ad4~5~tqJYHKH(?$%NV>yEkFtkYra)2?i6
zi0>p5iR0GOs+6Bx^;xji_}!m&rG6y9E577u>&zArW-argT^1zEM>)md8lWQ-i-lJ5+bCe*sb4j7$?o$F}fRaA-Fu@;x8M~WF7+DQCb_Cj&
zT&I_pu(&0--SEU1@gF}n$G>!6LXnic3Df?OdNAF!cTx=YvV@U=#tep1fr?M&8BAIn
zGOmpT4={L!2ec5(-s*#mKc=OhXtA0ldz<#+1s=Z)ym#gfkx4qKIhN3sPAS$j=|vHz
z`qoO`yvs?jEtr`pX|^O#>eqa=Z(WHp*7VW%+Jun5q5{5-@1g`TL<>
zgHR#uDx-1}`{4Nc?fsm>=kq6RfK4k+Uy~u@
zA&4Sie&z6u{2S&sw(_`2{Ec9#Pqe*(iv)Xyg!dC4ye|(fk1Y>zOB#OUCAG>JA&bj|
z7s<_*(8PEnTwc~nR#>)KHcn0_Sxap_aV-cQ3%z1Rq%t=n$N<7B2dymg4%lF`wb(4J5bh3I66
zY1KUTE<4oN#apJ!9)00?H%KVZc?5Ws?^ikn3ke8m
zCtD};Bqs~43Ds1NR@zj`RX%shaSB^>SvK%MdT7^Z*D!l{?&hs{QyGf95|w&jJQIF+
zJ1(eti0&(0D)#{Q;8dos(peN{Km9O$vwr-e^afGM@t(ZYV>B4>Q~ecQ5FvtOqLNAcBRk+r$ipb6GiFI
z>`GzrVrI)cBOThNIK?p|R;I?}rYyW;=17}ABjZ0rdaH-M;toWK-r}?~G#X|~he+pV
zX3yPeUy5~(ErYi}IC4001|ep-lUifytXkb=HYGl$>9&mswxaCf3M+-;-Lhn3EhEpt
zQ1j-Z_ChTy*H&7DYguWk_N`}K0&
zY<=0o*}Y41cI`9Sjd8@{yvK>fRgQVvrVGN1B~M<3-J$Ij`H2XrA)3l;oBPV%$=~E0
zYZbM^E{G?XHYQY-xPE!>+4}Z`)_9$Vn}@9EvS_6!r)a-Oj;L*clluK!>D*#n?0kuQ
z4c(v`F%M5S3b&2j)Ar_G)@HM+?&1gm#@CF3VP3OV>+RbUi(^a8I||!l(?d(P^WEF|
zo4z;oN$K&2ul*_cE#{u#IqXCEjvHJQT-t%lG
z9rCV{+T;dvsO3A{-tom7y?ch%hAiK2y=DIKqY3g|>;Rn~8bdhNXHrBJ>6T#8L8J7XYYfHT|M$iF$4s-GX}
zZQVM&^2_>%5~Zt6_00bG<`b44*553v3XB=dSx}XuZz1!8zYqH=mIjwRDYjO)r8*(`;yYh?~T3Zpv_6JHhLcgEXDw(n$N&iXh
z$uIc&yKy$7_2{#?5~+$sqNTX=%s`g(ZRt9t#;naOgrZ=gnAu?fS!qYn
z2Yy?~&}z@f*!Sp}Ppz4$sG%vy<(
zRPVHvZ~R1cF@*BnaS!8S;=DDQ^qfwINb4ZSqje9A?2oR6F|Wj4i+dZVt2bX~J>~$P
zlWwPj-$Z;eV0JWFYAefO$rVytht&G;n;@*6&7_T``>ERb6^6XBJHA#!Zm1)2M!$F%
z>>eTf5phE}xjA~a)wR|qS;U`+1qB0btiKfIu}w*hzbId7ozLhr^D#PsRDHKo7>Ui-
z`mQBh_8n>k-_3D!@y=-7pjuG$XO3sUoMo^Xy-XCLzXUH?j10&r;5=<2*~zxPgDnnk&M1
z_bndA?(Z|vQoPfNaSZ)EQ@3OP!<|BmrXCF0-&kLAR`h8eLg
zf-Eqw|M`u9!2mtb4GPg>o4f+Zvj1
zxk2qv7hs6E34xzb6DI?DH>kCZqmY{@!#`IDf#0aZ+zj;pT;gOU%Al_FlwQ)-!GvCb
z>mJuV1~EcoDAGJZ5$c@9pvvgQYMZ@4iP=^oxuBW{bgzWreqiWz1bv3
zTujnu5+qXoZ{HfK-_W`AX}y;Jla=Pq!N5yQX|-pH?7{fBJa->1Br#ukxUkbz(F5&P
z9O?-F9I}3PQX8L_r={b)qBb#@Wi#Q)KEbB+_%Q}1Hi^VPf7CZ)EqQATybZ*_!l(E9
z=MO(Jdh@w}e}AJ6OnYMbL#8$u2!qeBk?3hNyD-Kc7B;+gO8Pp<`LKSeG2Q1^`}u9)
z;A?u