From 64ed6b40e252fd06f8c2b76ce93e5255123afb31 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Fri, 22 Dec 2023 15:24:56 -0500 Subject: [PATCH] Improve tutorial quality (#298) --- .gitignore | 1 + tutorials/1.introduction.ipynb | 180 +++------ tutorials/2.tips.ipynb | 354 ++++++---------- tutorials/3.0.applications-rag.ipynb | 578 ++++++++++++++++++++------- tutorials/helpers.py | 32 ++ wandb/settings | 5 + 6 files changed, 639 insertions(+), 511 deletions(-) create mode 100644 tutorials/helpers.py create mode 100644 wandb/settings diff --git a/.gitignore b/.gitignore index db21ed7..ffd26e8 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,4 @@ tutorials/results.csv tutorials/results.jsonl tutorials/results.jsonlines tutorials/schema.json +wandb/settings \ No newline at end of file diff --git a/tutorials/1.introduction.ipynb b/tutorials/1.introduction.ipynb index 0d23fca..e702ed6 100644 --- a/tutorials/1.introduction.ipynb +++ b/tutorials/1.introduction.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -40,30 +40,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Jason is 10\n", - "None is 10\n", - "Next year Jason will be 11 years old\n" - ] - }, - { - "ename": "TypeError", - "evalue": "can only concatenate str (not \"int\") to str", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/1.introduction.ipynb Cell 5\u001b[0m line \u001b[0;36m9\n\u001b[1;32m 7\u001b[0m name \u001b[39m=\u001b[39m obj\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mfirst_name\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m age 
\u001b[39m=\u001b[39m obj\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mage\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 9\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNext year \u001b[39m\u001b[39m{\u001b[39;00mname\u001b[39m}\u001b[39;00m\u001b[39m will be \u001b[39m\u001b[39m{\u001b[39;00mage\u001b[39m+\u001b[39;49m\u001b[39m1\u001b[39;49m\u001b[39m}\u001b[39;00m\u001b[39m years old\u001b[39m\u001b[39m\"\u001b[39m)\n", - "\u001b[0;31mTypeError\u001b[0m: can only concatenate str (not \"int\") to str" - ] - } - ], + "outputs": [], "source": [ "for obj in data:\n", " name = obj.get(\"first_name\")\n", @@ -94,20 +73,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Person(name='Sam', age=30)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from pydantic import BaseModel, Field\n", "\n", @@ -123,20 +91,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Person(name='Sam', age=30)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Data is correctly casted to the right type\n", "person = Person.model_validate({\"name\": \"Sam\", \"age\": \"30\"})\n", @@ -145,21 +102,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/1.introduction.ipynb Cell 10\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 
1\u001b[0m \u001b[39massert\u001b[39;00m person\u001b[39m.\u001b[39mname \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mSam\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m \u001b[39massert\u001b[39;00m person\u001b[39m.\u001b[39mage \u001b[39m==\u001b[39m \u001b[39m20\u001b[39m\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "assert person.name == \"Sam\"\n", "assert person.age == 20" @@ -167,25 +112,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ValidationError", - "evalue": "1 validation error for Person\nage\n Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='30.2', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/int_parsing", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/1.introduction.ipynb Cell 11\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 1\u001b[0m \u001b[39m# Data is validated to get better error messages\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m person \u001b[39m=\u001b[39m Person\u001b[39m.\u001b[39;49mmodel_validate({\u001b[39m\"\u001b[39;49m\u001b[39mname\u001b[39;49m\u001b[39m\"\u001b[39;49m: \u001b[39m\"\u001b[39;49m\u001b[39mSam\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mage\u001b[39;49m\u001b[39m\"\u001b[39;49m: \u001b[39m\"\u001b[39;49m\u001b[39m30.2\u001b[39;49m\u001b[39m\"\u001b[39;49m})\n\u001b[1;32m 3\u001b[0m person\n", - "File \u001b[0;32m~/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py:503\u001b[0m, in \u001b[0;36mBaseModel.model_validate\u001b[0;34m(cls, obj, strict, from_attributes, context)\u001b[0m\n\u001b[1;32m 501\u001b[0m 
\u001b[39m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 502\u001b[0m __tracebackhide__ \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 503\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49m__pydantic_validator__\u001b[39m.\u001b[39;49mvalidate_python(\n\u001b[1;32m 504\u001b[0m obj, strict\u001b[39m=\u001b[39;49mstrict, from_attributes\u001b[39m=\u001b[39;49mfrom_attributes, context\u001b[39m=\u001b[39;49mcontext\n\u001b[1;32m 505\u001b[0m )\n", - "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Person\nage\n Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='30.2', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/int_parsing" - ] - } - ], + "outputs": [], "source": [ "# Data is validated to get better error messages\n", - "person = Person.model_validate({\"name\": \"Sam\", \"age\": \"30.2\"})\n", + "person = Person.model_validate({\"first_name\": \"Sam\", \"age\": \"30.2\"})\n", "person" ] }, @@ -202,41 +134,38 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Asking for JSON from OpenAI\n" + "## Fundamental problem with asking for JSON from OpenAI\n" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{\n", - " \"Jason\": {\n", - " \"age\": 10\n", - " }\n", - "}\n", - "Here is the JSON representation of `jason is 10` as a JSON object:\n", - "\n", - "```\n", - "{\n", - " \"name\": \"Jason\",\n", - " \"age\": 10\n", - "}\n", - "```\n", - "Here is the JSON object representation of \"Jason is 10\":\n", - "\n", - "```json\n", - "{\n", - " \"name\": \"Jason\",\n", - " \"age\": 10\n", - "}\n", - "```\n", - "\n", - "In this JSON object, the key \"name\" corresponds to the value \"Jason\" and the key \"age\" corresponds to 
the value 10.\n" + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n", + "correctly parsed person=Person(name='Jason', age=10)\n" ] } ], @@ -248,7 +177,7 @@ "resp = client.chat.completions.create(\n", " model=\"gpt-3.5-turbo\",\n", " messages=[\n", - " {\"role\": \"user\", \"content\": \"Please give me jason is 10 as a json object\"},\n", + " {\"role\": \"user\", \"content\": \"Please give me jason is 10 as a json object ```json\\n\"},\n", " ],\n", " n=20,\n", " temperature=1,\n", @@ -257,8 +186,10 @@ "for choice in resp.choices:\n", " json = choice.message.content\n", " try:\n", - " Person.model_validate_json(json)\n", + " person = Person.model_validate_json(json)\n", + " print(f\"correctly parsed {person=}\")\n", " except Exception as e:\n", + " print(\"error!!\")\n", " print(json)" ] }, @@ -277,16 +208,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 26, "metadata": 
{}, "outputs": [ { "data": { "text/plain": [ - "PersonBirthday(name='Jason Liu', age=30, birthday=datetime.date(2023, 11, 30))" + "PersonBirthday(name='Jason Liu', age=30, birthday=datetime.date(2023, 12, 19))" ] }, - "execution_count": 13, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -335,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -349,7 +280,7 @@ " 'type': 'object'}" ] }, - "execution_count": 14, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -367,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -390,7 +321,7 @@ " 'type': 'object'}" ] }, - "execution_count": 15, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -425,14 +356,14 @@ "source": [ "# The core idea around Instructor\n", "\n", - "1. Using function calling allows us to specify the schema we want\n", - "2. Pydantic can be used to define the schema and documentation AND validate the response at runtime\n", + "1. Using function calling allows us use a llm that is finetuned to use json_schema and output json.\n", + "2. Pydantic can be used to define the object, schema, and validation in one single class, allow us to encapsulate everything neatly\n", "3. 
As a library with 100M downloads, we can leverage pydantic to do all the heavy lifting for us and fit nicely with the python ecosystem\n" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -441,7 +372,7 @@ "PersonAddress(name='Jason Liu', age=30, address=Address(address='123 Main St', city='San Francisco', state='CA'))" ] }, - "execution_count": 16, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -451,10 +382,10 @@ "import datetime\n", "\n", "# patch the client to add `response_model` to the `create` method\n", - "client = instructor.patch(client)\n", + "client = instructor.patch(OpenAI(), mode=instructor.Mode.MD_JSON)\n", "\n", "resp = client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", + " model=\"gpt-3.5-turbo-1106\",\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", @@ -484,7 +415,7 @@ "source": [ "## Is instructor the only way to do this?\n", "\n", - "No. Libraries like Marvin, Langchain, and Llamaindex all now leverage the pydantic object in similar ways however they all have different approaches to how they do it. With instructor the goal is to be as light weight as possible, get you as close as possible to the openai api, and then get out of your way.\n", + "No. Libraries like Marvin, Langchain, and Llamaindex all now leverage the Pydantic object in similar ways. 
The goal is to be as light weight as possible, get you as close as possible to the openai api, and then get out of your way.\n", "\n", "More importantly, we've also added straight forward validation and reasking to the mix.\n", "\n", @@ -496,6 +427,13 @@ "- [Langchain](https://python.langchain.com/docs/modules/model_io/output_parsers/pydantic)\n", "- [LlamaIndex](https://gpt-index.readthedocs.io/en/latest/examples/output_parsing/openai_pydantic_program.html)\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/tutorials/2.tips.ipynb b/tutorials/2.tips.ipynb index be5d94f..69bc771 100644 --- a/tutorials/2.tips.ipynb +++ b/tutorials/2.tips.ipynb @@ -32,21 +32,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "fdf5e1d9-31ad-4e8a-a55e-e2e70fff598d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'age': 17, 'name': 'Harry Potter', 'house': }" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import instructor\n", "from openai import OpenAI\n", @@ -88,39 +77,20 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "c609eb44", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello, I'm Harry Potter, I'm 17 years old and I'm from Gryffindor\n" - ] - } - ], + "outputs": [], "source": [ "resp.say_hello()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "03db160c-81e9-4373-bfec-7a107224b6dd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'age': 17, 'name': 'Harry Potter', 'house': 'Gryffindor'}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "class Character(BaseModel):\n", " age: int\n", @@ -148,7 +118,7 @@ }, { "cell_type": "code", - 
"execution_count": 13, + "execution_count": null, "id": "0e7938b8-4666-4df4-bd80-f53e8baf7550", "metadata": {}, "outputs": [], @@ -193,33 +163,7 @@ "execution_count": null, "id": "69a58d01-ab6f-41b6-bc0c-b0e55fdb6fe4", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'age': 38,\n", - " 'name': 'Severus Snape',\n", - " 'house': 'Slytherin',\n", - " 'properties': [{'index': '1', 'key': 'patronus', 'value': 'Doe'},\n", - " {'index': '2',\n", - " 'key': 'position',\n", - " 'value': 'Potions Master, Defense Against the Dark Arts teacher, Headmaster'},\n", - " {'index': '3',\n", - " 'key': 'loyalty',\n", - " 'value': 'Hogwarts, Albus Dumbledore, Order of the Phoenix, Lily Evans'},\n", - " {'index': '4',\n", - " 'key': 'skills',\n", - " 'value': 'Potions expertise, Occlumency, Legilimency'},\n", - " {'index': '5',\n", - " 'key': 'disguised_loyalty',\n", - " 'value': 'Death Eater (formerly, as a double agent)'}]}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "class Property(BaseModel):\n", " index: str = Field(..., description=\"Monotonically increasing ID\")\n", @@ -260,16 +204,7 @@ "execution_count": null, "id": "1f2a2b14-a956-4f96-90c9-e11ca04ab7d1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "age=38 name='Severus Snape' house='Slytherin'\n", - "age=115 name='Albus Dumbledore' house='Gryffindor'\n" - ] - } - ], + "outputs": [], "source": [ "from typing import Iterable\n", "\n", @@ -282,7 +217,7 @@ "\n", "resp = client.chat.completions.create(\n", " model=\"gpt-4-1106-preview\",\n", - " messages=[{\"role\": \"user\", \"content\": \"Snape and Dumbledore from Harry Potter\"}],\n", + " messages=[{\"role\": \"user\", \"content\": \"Five characters from Harry Potter\"}],\n", " response_model=Iterable[Character],\n", ")\n", "\n", @@ -295,16 +230,7 @@ "execution_count": null, "id": "a3091aba", "metadata": {}, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "age=38 name='Severus Snape' house='Slytherin'\n", - "age=115 name='Albus Dumbledore' house='Gryffindor'\n" - ] - } - ], + "outputs": [], "source": [ "from typing import Iterable\n", "\n", @@ -317,7 +243,7 @@ "\n", "resp = client.chat.completions.create(\n", " model=\"gpt-4-1106-preview\",\n", - " messages=[{\"role\": \"user\", \"content\": \"Snape and Dumbledore from Harry Potter\"}],\n", + " messages=[{\"role\": \"user\", \"content\": \"Five characters from Harry Potter\"}],\n", " stream=True,\n", " response_model=Iterable[Character],\n", ")\n", @@ -341,29 +267,17 @@ "execution_count": null, "id": "6de8768e-b36a-4a51-9cf9-940d178552f6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id=1 name='Harry Potter' friends=[2, 3, 4, 5]\n", - "id=2 name='Hermione Granger' friends=[1, 3, 4, 5]\n", - "id=3 name='Ron Weasley' friends=[1, 2, 4, 5]\n", - "id=4 name='Draco Malfoy' friends=[5]\n", - "id=5 name='Neville Longbottom' friends=[1, 2, 3, 4]\n" - ] - } - ], + "outputs": [], "source": [ "class Character(BaseModel):\n", " id: int\n", " name: str\n", - " friends: List[int]\n", + " friends_array: List[int] = Field(description=\"Relationships to their friends using the id\")\n", "\n", "\n", "resp = client.chat.completions.create(\n", " model=\"gpt-4-1106-preview\",\n", - " messages=[{\"role\": \"user\", \"content\": \"The 5 kids from Harry Potter\"}],\n", + " messages=[{\"role\": \"user\", \"content\": \"5 kids from Harry Potter\"}],\n", " stream=True,\n", " response_model=Iterable[Character],\n", ")\n", @@ -372,142 +286,6 @@ " print(character)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "b31e10d7-ebd2-49b4-b2c4-20dd67ca135d", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "1\n", - "\n", - "Harry Potter\n", - "\n", - "\n", - "\n", - "2\n", - "\n", 
- "Hermione Granger\n", - "\n", - "\n", - "\n", - "1->2\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "3\n", - "\n", - "Ron Weasley\n", - "\n", - "\n", - "\n", - "1->3\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4\n", - "\n", - "Draco Malfoy\n", - "\n", - "\n", - "\n", - "1->4\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "5\n", - "\n", - "Neville Longbottom\n", - "\n", - "\n", - "\n", - "1->5\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "2->3\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "2->4\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "3->4\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "3->5\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from graphviz import Digraph\n", - "from IPython.display import display\n", - "\n", - "dot = Digraph()\n", - "\n", - "resp = client.chat.completions.create(\n", - " model=\"gpt-4-1106-preview\",\n", - " messages=[{\"role\": \"user\", \"content\": \"The 5 kids from Harry Potter\"}],\n", - " response_model=Iterable[Character],\n", - ")\n", - "\n", - "# Create nodes for each user\n", - "for user in resp:\n", - " dot.node(str(user.id), user.name)\n", - "\n", - "# Create edges for friends\n", - "for user in resp:\n", - " for friend_id in user.friends:\n", - " # To avoid duplicating edges, only add an edge if the friend ID is greater than the user ID\n", - " if friend_id > user.id:\n", - " dot.edge(str(user.id), str(friend_id))\n", - "\n", - "\n", - "# Render the graph to a file\n", - "display(dot)" - ] - }, { "cell_type": "markdown", "id": "523b5797-71a5-4a96-a4b7-21280fb73015", @@ -515,6 +293,108 @@ "source": [ "With the tools we've discussed, we can find numerous real-world applications in production settings. These include extracting action items from transcripts, generating fake data, filling out forms, and creating objects that correspond to generative UI. 
These simple tricks will be highly useful.\n" ] + }, + { + "cell_type": "markdown", + "id": "a9d20fd9-0cd0-4300-a8c1-d16388969e8e", + "metadata": {}, + "source": [ + "# Missing Data\n", + "\n", + "The Maybe pattern is a concept in functional programming used for error handling. Instead of raising exceptions or returning None, you can use a Maybe type to encapsulate both the result and potential errors.\n", + "\n", + "This pattern is particularly useful when making LLM calls, as providing language models with an escape hatch can effectively reduce hallucinations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c04f44aa-dc4b-4499-a151-e812512e77e6", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "\n", + "class Character(BaseModel):\n", + " age: int\n", + " name: str\n", + "\n", + "class MaybeCharacter(BaseModel):\n", + " result: Optional[Character] = Field(default=None)\n", + " error: bool = Field(default=False)\n", + " message: Optional[str]" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "a2155190-e104-4ed6-a17f-e0732499dd51", + "metadata": {}, + "outputs": [], + "source": [ + "def extract(content: str) -> MaybeCharacter:\n", + " return client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " response_model=MaybeCharacter,\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": f\"Extract `{content}`\"},\n", + " ],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "a7b59afa-9bf0-4dc0-a5ca-de584514f33b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MaybeCharacter(result=Character(age=17, name='Harry Potter'), error=False, message=None)" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract(\"Harry Potter\")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "b5ddd5c1-ca75-49a9-95ad-181170435291", + "metadata": {}, + 
"outputs": [ + { + "ename": "ValueError", + "evalue": "404 Error", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[66], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m user \u001b[38;5;241m=\u001b[39m extract(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m404 Error\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m user\u001b[38;5;241m.\u001b[39merror:\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(user\u001b[38;5;241m.\u001b[39mmessage)\n", + "\u001b[0;31mValueError\u001b[0m: 404 Error" + ] + } + ], + "source": [ + "user = extract(\"404 Error\")\n", + "\n", + "if user.error:\n", + " raise ValueError(user.message)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e14f7cb-d99c-4696-a1fa-e08319bf5d68", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/tutorials/3.0.applications-rag.ipynb b/tutorials/3.0.applications-rag.ipynb index 95dc574..2307db3 100644 --- a/tutorials/3.0.applications-rag.ipynb +++ b/tutorials/3.0.applications-rag.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Applying Structured Output to RAG applications " + "# Applying Structured Output to RAG applications\n" ] }, { @@ -23,7 +23,7 @@ "\n", "**Why is there a need for them?**\n", "\n", - "Pre-trained large language models do not learn over time. If you ask them a question they have not been trained on, they will often hallucinate. Therefore, we need to embed our own data to achieve a better output." + "Pre-trained large language models do not learn over time. If you ask them a question they have not been trained on, they will often hallucinate. 
Therefore, we need to embed our own data to achieve a better output.\n" ] }, { @@ -38,7 +38,7 @@ "\n", "- **Query-Document Mismatch:** It assumes that the query and document embeddings will align in the vector space, which is often not the case.\n", "- **Text Search Limitations:** The model is restricted to simple text queries without the nuances of advanced search features.\n", - "- **Limited Planning Ability:** It fails to consider additional contextual information that could refine the search results." + "- **Limited Planning Ability:** It fails to consider additional contextual information that could refine the search results.\n" ] }, { @@ -62,7 +62,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Practical Examples" + "## Practical Examples\n" ] }, { @@ -74,11 +74,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "import instructor \n", + "import instructor\n", "\n", "from openai import OpenAI\n", "from typing import List\n", @@ -93,8 +93,8 @@ "source": [ "### Example 1) Improving Extractions\n", "\n", - "One of the big limitations is that often times the query we embed and the text \n", - "A common method of using structured output is to extract information from a document and use it to answer a question. Directly, we can be creative in how we extract, summarize and generate potential questions in order for our embeddings to do better. \n", + "One of the big limitations is that often times the query we embed and the text\n", + "A common method of using structured output is to extract information from a document and use it to answer a question. Directly, we can be creative in how we extract, summarize and generate potential questions in order for our embeddings to do better.\n", "\n", "For example, instead of using just a text chunk we could try to:\n", "\n", @@ -102,7 +102,7 @@ "2. extract hypothetical questions\n", "3. 
generate a summary of the text\n", "\n", - "In the example below, we use the `instructor` library to extract the key words and themes from a text chunk and use them to answer a question." + "In the example below, we use the `instructor` library to extract the key words and themes from a text chunk and use them to answer a question.\n" ] }, { @@ -113,53 +113,54 @@ "source": [ "class Extraction(BaseModel):\n", " topic: str\n", - " summary: str \n", - " hypothetical_questions: List[str] = Field(default_factory=list, description=\"Hypothetical questions that this document could answer\")\n", - " keywords: List[str] = Field(default_factory=list, description=\"Keywords that this document is about\")" + " summary: str\n", + " hypothetical_questions: List[str] = Field(\n", + " default_factory=list,\n", + " description=\"Hypothetical questions that this document could answer\",\n", + " )\n", + " keywords: List[str] = Field(\n", + " default_factory=list, description=\"Keywords that this document is about\"\n", + " )" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'hypothetical_questions': ['What is the basic concept behind Simple RAG?',\n", - " 'Why might Simple RAG not perform well with '\n", - " 'complex queries?',\n", - " \"In what ways does Simple RAG's embedding search \"\n", - " 'fall short?'],\n", - " 'keywords': ['Simple RAG',\n", - " 'Retriever-Augmented Generation',\n", - " 'user queries',\n", - " 'embedding search',\n", + "{'hypothetical_questions': ['What is the most basic implementation of '\n", + " 'Retrieval-Augmented Generation?',\n", + " 'Why might simple RAG not be adequate for complex '\n", + " 'user queries?'],\n", + " 'keywords': ['Retrieval-Augmented Generation',\n", + " 'RAG',\n", " 'vector database',\n", - " 'query-document mismatch'],\n", - " 'summary': 'Simple RAG is an implementation that embeds user queries for a '\n", - " 'single 
embedding search in a vector database. Although '\n", - " 'straightforward, it struggles with complex queries and varied '\n", - " 'data sources because of its basic framework and query-document '\n", - " 'mismatch issues.',\n", - " 'topic': 'Simple Retriever-Augmented Generation (RAG)'}\n", - "{'hypothetical_questions': ['What kind of limitations does Simple RAG face?',\n", - " 'How does a monolithic search backend affect '\n", - " \"Simple RAG's performance?\",\n", - " 'Can Simple RAG handle complex, context-specific '\n", - " 'queries effectively?'],\n", + " 'simple implementation'],\n", + " 'summary': 'The simplest form of RAG involves embedding a user query and '\n", + " 'performing a single search in a vector database, such as a '\n", + " 'Wikipedia article store. This method, however, often fails with '\n", + " 'complex queries and diverse data sources.',\n", + " 'topic': 'Simple Retrieval-Augmented Generation (RAG)'}\n", + "{'hypothetical_questions': ['What are the main drawbacks of the simple RAG '\n", + " 'model?',\n", + " 'How does Query-Document Mismatch affect search '\n", + " 'results in simple RAG?',\n", + " 'Why is relying on a monolithic search backend '\n", + " 'problematic for RAG?'],\n", " 'keywords': ['limitations',\n", - " 'Simple RAG',\n", " 'query-document mismatch',\n", " 'monolithic search backend',\n", " 'text search limitations',\n", " 'limited planning ability'],\n", - " 'summary': 'The limitations of Simple RAG include query-document mismatch, '\n", - " 'reliance on a monolithic search backend, restrictions to simple '\n", - " 'text searches, and limited planning ability, which results in '\n", - " 'suboptimal outcomes when handling nuanced or context-specific '\n", - " 'queries.',\n", + " 'summary': 'The limitations of simple RAG include the Query-Document Mismatch '\n", + " 'which assumes a perfect alignment of embeddings, the reliance on '\n", + " 'a Monolithic Search Backend which limits flexibility, Text Search '\n", + " 
'Limitations that impede nuanced search, and a Limited Planning '\n", + " 'Ability that overlooks additional context for refining results.',\n", " 'topic': 'Limitations of Simple RAG'}\n" ] } @@ -196,13 +197,14 @@ " model=\"gpt-4-1106-preview\",\n", " stream=True,\n", " response_model=Iterable[Extraction],\n", - " messages=[{\n", - " \"role\": \"system\", \n", - " \"content\": \"Your role is to extract chunks from the following and create a set of topics.\"\n", - " }, {\n", - " \"role\": \"user\", \n", - " \"content\": text_chunk\n", - " }])\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"Your role is to extract chunks from the following and create a set of topics.\",\n", + " },\n", + " {\"role\": \"user\", \"content\": text_chunk},\n", + " ],\n", + ")\n", "\n", "\n", "for extraction in extractions:\n", @@ -213,7 +215,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now you can imagine if you were to embed the summaries, hypothetical questions, and keywords in a vector database, you can then use a vector search to find the best matching document for a given query. What you'll find is that the results are much better than if you were to just embed the text chunk! " + "Now you can imagine if you were to embed the summaries, hypothetical questions, and keywords in a vector database, you can then use a vector search to find the best matching document for a given query. 
What you'll find is that the results are much better than if you were to just embed the text chunk!\n" ] }, { @@ -227,16 +229,18 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from datetime import date\n", "\n", + "\n", "class DateRange(BaseModel):\n", " start: date\n", " end: date\n", "\n", + "\n", "class Query(BaseModel):\n", " rewritten_query: str\n", " published_daterange: DateRange" @@ -248,79 +252,72 @@ "source": [ "In this example, `DateRange` and `Query` are Pydantic models that structure the user's query with a date range and a list of domains to search within.\n", "\n", - "These models **restructure** the user's query by including a rewritten query, a range of published dates, and a list of domains to search in." + "These models **restructure** the user's query by including a rewritten query, a range of published dates, and a list of domains to search in.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Using the new restructured query, we can apply this pattern to our function calls to obtain results that are optimized for our backend." 
+ "Using the new restructured query, we can apply this pattern to our function calls to obtain results that are optimized for our backend.\n" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"rewritten_query\": \"recent developments in AI\",\n", - " \"published_daterange\": {\n", - " \"start\": \"2023-01-01\",\n", - " \"end\": \"2023-11-30\"\n", - " }\n", - "}\n" - ] + "data": { + "text/plain": [ + "Query(rewritten_query='recent developments in AI', published_daterange=DateRange(start=datetime.date(2023, 1, 1), end=datetime.date(2023, 12, 22)))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "query = client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " response_model=Query,\n", - " messages=[\n", - " {\n", - " \"role\": \"system\", \n", - " \"content\": f\"You're a query understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...\"\n", - " },\n", - " {\n", - " \"role\": \"user\", \n", - " \"content\": \"query: What are some recent developments in AI?\"\n", - " }\n", - " ],\n", - ")\n", + "def expand_query(q) -> Query:\n", + " return client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " response_model=Query,\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"You're a query understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...\",\n", + " },\n", + " {\"role\": \"user\", \"content\": f\"query: {q}\"},\n", + " ],\n", + " )\n", "\n", - "print(query.model_dump_json(indent=4)) # Printing the Json dump of the model" + "\n", + "query = expand_query(\"What are some recent developments in AI?\")\n", + "query" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This isn't just about adding some date ranges. 
We can even use some chain of thought prompting to generate tailored searches that are deeply integrated with our backend. " + "This isn't just about adding some date ranges. We can even use some chain of thought prompting to generate tailored searches that are deeply integrated with our backend.\n" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 16, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"rewritten_query\": \"recent developments in artificial intelligence\",\n", - " \"published_daterange\": {\n", - " \"chain_of_thought\": \"Given that it's currently late 2023, a recent timeframe would ideally be within the last year to ensure the developments are current. Therefore, a suitable date range for recent AI developments could start from late 2022 to the present date in 2023.\",\n", - " \"start\": \"2022-11-18\",\n", - " \"end\": \"2023-11-18\"\n", - " }\n", - "}\n" - ] + "data": { + "text/plain": [ + "Query(rewritten_query='Latest advancements in Artificial Intelligence as of December 2023', published_daterange=DateRange(chain_of_thought=\"To get the most recent developments in AI, the date range should be quite recent. 
Considering today's date is 2023-12-21, a suitable range might be from the past three months to now.\", start=datetime.date(2023, 9, 21), end=datetime.date(2023, 12, 21)))" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -331,27 +328,302 @@ " start: date\n", " end: date\n", "\n", + "\n", "class Query(BaseModel):\n", - " rewritten_query: str\n", - " published_daterange: DateRange\n", + " rewritten_query: str = Field(\n", + " description=\"Rewrite the query to make it more specific\"\n", + " )\n", + " published_daterange: DateRange = Field(\n", + " description=\"Effective date range to search in\"\n", + " )\n", "\n", "\n", - "query = client.chat.completions.create(\n", - " model=\"gpt-4-1106-preview\",\n", - " response_model=Query,\n", - " messages=[\n", - " {\n", - " \"role\": \"system\", \n", - " \"content\": f\"You're a query understanding system for a search engine. Today is {date.today()}.\"\n", - " },\n", - " {\n", - " \"role\": \"user\", \n", - " \"content\": \"What are some recent developments in AI?\"\n", - " }\n", - " ],\n", + "def expand_query(q) -> Query:\n", + " return client.chat.completions.create(\n", + " model=\"gpt-4-1106-preview\",\n", + " response_model=Query,\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"You're a query understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...\",\n", + " },\n", + " {\"role\": \"user\", \"content\": f\"query: {q}\"},\n", + " ],\n", + " )\n", + "\n", + "\n", + "expand_query(\"What are some recent developments in AI?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Weights and Biases to track experiments\n", + "\n", + "While running a function like this in production is quite simple, a lot of time will be spent on iterating and improving the model.
To do this, we can use Weights and Biases to track our experiments.\n", + "\n", + "In order to do so we want to manage a few things\n", + "\n", + "1. Save input and output pairs for later\n", + "2. Save the JSON schema for the response_model\n", + "3. Having snapshots of the model and data allows us to compare results over time, and as we make changes to the model we can see how the results change.\n", + "\n", + "This is particularly useful when we might want to blend a mix of synthetic and real data to evaluate our model. We can use the `wandb` library to track our experiments and save the results to a dashboard.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import json\n", + "import wandb\n", + "from helpers import dicts_to_df\n", + "\n", + "\n", + "class DateRange(BaseModel):\n", + " chain_of_thought: str = Field(\n", + " description=\"Think step by step to plan what is the best time range to search in\"\n", + " )\n", + " start: date\n", + " end: date\n", + "\n", + "\n", + "class Query(BaseModel):\n", + " rewritten_query: str = Field(\n", + " description=\"Rewrite the query to make it more specific\"\n", + " )\n", + " published_daterange: DateRange = Field(\n", + " description=\"Effective date range to search in\"\n", + " )\n", + "\n", + " def report(self):\n", + " dct = self.model_dump()\n", + " dct[\"usage\"] = self._raw_response.usage.model_dump()\n", + " return dct\n", + "\n", + "\n", + "from openai import AsyncOpenAI\n", + "\n", + "# We'll use a different client for async calls\n", + "# To highlight the difference and how we can use both\n", + "aclient = instructor.patch(AsyncOpenAI())\n", + "\n", + "\n", + "async def expand_query(q) -> Query:\n", + " return await aclient.chat.completions.create(\n", + " model=\"gpt-4-1106-preview\",\n", + " response_model=Query,\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"You're a query
understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...\",\n", + " },\n", + " {\"role\": \"user\", \"content\": f\"query: {q}\"},\n", + " ],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "wandb version 0.16.1 is available! To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /Users/jasonliu/dev/instructor/tutorials/wandb/run-20231222_152028-opuq58lr" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run major-firebrand-21 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/instructor/query-understanding" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/instructor/query-understanding/runs/opuq58lr" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Retrying, exception: 1 validation error for Query\n", + "rewritten_query\n", + " Field required [type=missing, input_value={'rewitten_query': 'recen...', 'end': '2023-12-22'}}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.5/v/missing\n", + "Traceback (most recent call last):\n", + " File \"/Users/jasonliu/dev/instructor/instructor/patch.py\", line 231, in retry_async\n", + " return await process_response_async(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/jasonliu/dev/instructor/instructor/patch.py\", line 201, in process_response_async\n", + " model = await response_model.from_response_async(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/jasonliu/dev/instructor/instructor/function_calls.py\", line 198, in from_response_async\n", + " return cls.model_validate_json(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/jasonliu/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py\", line 532, in model_validate_json\n", + " return cls.__pydantic_validator__.validate_json(json_data, strict=strict, context=context)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "pydantic_core._pydantic_core.ValidationError: 1 validation error for Query\n", + "rewritten_query\n", + " Field required [type=missing, input_value={'rewitten_query': 'recen...', 'end': '2023-12-22'}}, 
input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.5/v/missing\n", + "wandb: WARNING Source type is set to 'repo' but some required information is missing from the environment. A job will not be created from this run. See https://docs.wandb.ai/guides/launch/create-job\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "96b112129c944465a35156a6ffbdfe54", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.001 MB deduped)\\r'), FloatProgress(value=1.0, max…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "W&B sync reduced upload amount by 7.9% " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run major-firebrand-21 at: https://wandb.ai/instructor/query-understanding/runs/opuq58lr
Synced 5 W&B file(s), 1 media file(s), 4 artifact file(s) and 0 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20231222_152028-opuq58lr/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import asyncio\n", + "\n", + "\n", + "run = wandb.init(\n", + " project=\"query-understanding\",\n", ")\n", "\n", - "print(query.model_dump_json(indent=4)) # Printing the Json dump of the model" + "test_queries = [\n", + " \"latest developments in artificial intelligence last 3 weeks\",\n", + " \"renewable energy trends past month\",\n", + " \"quantum computing advancements last 2 months\",\n", + " \"biotechnology updates last 10 days\",\n", + "]\n", + "\n", + "queries = await asyncio.gather(*[expand_query(q) for q in test_queries])\n", + "\n", + "with open(\"schema.json\", \"w+\") as f:\n", + " schema = Query.model_json_schema()\n", + " json.dump(schema, f, indent=2)\n", + "\n", + "with open(\"results.jsonlines\", \"w+\") as f:\n", + " for query in queries:\n", + " f.write(query.model_dump_json() + \"\\n\")\n", + "\n", + "df = dicts_to_df([q.report() for q in queries])\n", + "df[\"input\"] = test_queries\n", + "df.to_csv(\"results.csv\")\n", + "\n", + "run.log({\"results\": wandb.Table(dataframe=df)})\n", + "\n", + "files = wandb.Artifact(\"data\", type=\"dataset\")\n", + "\n", + "files.add_file(\"schema.json\")\n", + "files.add_file(\"results.jsonlines\")\n", + "files.add_file(\"results.csv\")\n", + "\n", + "run.log_artifact(files)\n", + "run.finish()" ] }, { @@ -369,19 +641,21 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "from typing import Literal\n", "\n", + "\n", "class SearchClient(BaseModel):\n", - " query: str\n", + " query: str = Field(description=\"The search query that will go into the search bar\")\n", " 
keywords: List[str]\n", " email: str\n", - " source: Literal[\"gmail\", \"calendar\"] \n", + " source: Literal[\"gmail\", \"calendar\"]\n", " date_range: DateRange\n", "\n", + "\n", "class Retrival(BaseModel):\n", " queries: List[SearchClient]" ] @@ -399,42 +673,21 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"queries\": [\n", - " {\n", - " \"query\": \"schedule\",\n", - " \"keywords\": [\n", - " \"appointments\",\n", - " \"meetings\",\n", - " \"schedule\",\n", - " \"events\"\n", - " ],\n", - " \"email\": \"jason.assistant@busybot.com\",\n", - " \"source\": \"calendar\",\n", - " \"date_range\": {\n", - " \"start\": \"2023-11-18\",\n", - " \"end\": \"2023-11-18\"\n", - " }\n", - " }\n", - " ]\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "retrival = client.chat.completions.create(\n", - " model=\"gpt-4-1106-preview\",\n", + " model=\"gpt-3.5-turbo\",\n", " response_model=Retrival,\n", " messages=[\n", - " {\"role\": \"system\", \"content\":f\"You are Jason's personal assistant. Today is {date.today()}\"},\n", - " {\"role\": \"user\", \"content\": \"What do I have today?\"}\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"\"\"You are Jason's personal assistant.\n", + " He has two emails jason@work.com jason@personal.com \n", + " Today is {date.today()}\"\"\",\n", + " },\n", + " {\"role\": \"user\", \"content\": \"What do I have today?\"},\n", " ],\n", ")\n", "print(retrival.model_dump_json(indent=4))" @@ -444,7 +697,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To make it more challenging, we will assign it multiple tasks, followed by a list of queries that are routed to various search backends, such as email and calendar. Not only do we dispatch to different backends, over which we have no control, but we are also likely to render them to the user in different ways." 
+ "To make it more challenging, we will assign it multiple tasks, followed by a list of queries that are routed to various search backends, such as email and calendar. Not only do we dispatch to different backends, over which we have no control, but we are also likely to render them to the user in different ways.\n" ] }, { @@ -498,8 +751,16 @@ " model=\"gpt-4-1106-preview\",\n", " response_model=Retrival,\n", " messages=[\n", - " {\"role\": \"system\", \"content\": f\"You are Jason's personal assistant. Today is {date.today()}\"},\n", - " {\"role\": \"user\", \"content\": \"What meetings do I have today and are there any important emails I should be aware of?\"}\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"\"\"You are Jason's personal assistant.\n", + " He has two emails jason@work.com jason@personal.com \n", + " Today is {date.today()}\"\"\",\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"What meetings do I have today and are there any important emails I should be aware of\",\n", + " },\n", " ],\n", ")\n", "print(retrival.model_dump_json(indent=4))" @@ -509,9 +770,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Example 4) Decomposing questions \n", + "### Example 4) Decomposing questions\n", "\n", - "Lastly, a lightly more complex example of a problem that can be solved with structured output is decomposing questions. Where you ultimately want to decompose a question into a series of sub-questions that can be answered by a search backend. For example \n", + "Lastly, a slightly more complex example of a problem that can be solved with structured output is decomposing questions, where you ultimately want to decompose a question into a series of sub-questions that can be answered by a search backend. For example\n", "\n", "\"Whats the difference in populations of jason's home country and canada?\"\n", "\n", @@ -522,7 +783,7 @@ "3. The population of Canada\n", "4.
The difference between the two\n", "\n", - "This would not be done correctly as a single query, nor would it be done in parallel, however there are some opportunities try to be parallel since not all of the sub-questions are dependent on each other." + "This would not be done correctly as a single query, nor would it be done in parallel, however there are some opportunities to parallelize since not all of the sub-questions are dependent on each other.\n" ] }, { @@ -571,20 +832,31 @@ "class Question(BaseModel):\n", " id: int = Field(..., description=\"A unique identifier for the question\")\n", " query: str = Field(..., description=\"The question decomposited as much as possible\")\n", - " subquestions: List[int] = Field(default_factory=list, description=\"The subquestions that this question is composed of\")\n", + " subquestions: List[int] = Field(\n", + " default_factory=list,\n", + " description=\"The subquestions that this question is composed of\",\n", + " )\n", "\n", "\n", "class QueryPlan(BaseModel):\n", " root_question: str = Field(..., description=\"The root question that the user asked\")\n", - " plan: List[Question] = Field(..., description=\"The plan to answer the root question and its subquestions\")\n", + " plan: List[Question] = Field(\n", + " ..., description=\"The plan to answer the root question and its subquestions\"\n", + " )\n", "\n", "\n", "retrival = client.chat.completions.create(\n", " model=\"gpt-4-1106-preview\",\n", " response_model=QueryPlan,\n", " messages=[\n", - " {\"role\": \"system\", \"content\":\"You are a query understanding system capable of decomposing a question into subquestions.\"},\n", - " {\"role\": \"user\", \"content\": \"What is the difference between the population of jason's home country and canada?\"}\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are a query understanding system capable of decomposing a question into subquestions.\",\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + "
\"content\": \"What is the difference between the population of jason's home country and canada?\",\n", + " },\n", " ],\n", ")\n", "\n", @@ -595,7 +867,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "I hope in this section I've exposed you to some ways we can be creative in modeling structured outputs to leverage LLMS in building some lightweight components for our systems." + "I hope in this section I've exposed you to some ways we can be creative in modeling structured outputs to leverage LLMS in building some lightweight components for our systems.\n" ] } ], diff --git a/tutorials/helpers.py b/tutorials/helpers.py new file mode 100644 index 0000000..3d7d5fe --- /dev/null +++ b/tutorials/helpers.py @@ -0,0 +1,32 @@ +import pandas as pd + + +def flatten_dict(d, parent_key="", sep="_"): + """ + Flatten a nested dictionary. + + :param d: The nested dictionary to flatten. + :param parent_key: The base key to use for the flattened keys. + :param sep: Separator to use between keys. + :return: A flattened dictionary. + """ + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + +def dicts_to_df(list_of_dicts): + """ + Convert a list of dictionaries to a pandas DataFrame. + + :param list_of_dicts: List of dictionaries, potentially nested. + :return: A pandas DataFrame representing the flattened data. + """ + # Flatten each dictionary and create a DataFrame + flattened_data = [flatten_dict(d) for d in list_of_dicts] + return pd.DataFrame(flattened_data) diff --git a/wandb/settings b/wandb/settings new file mode 100644 index 0000000..fe5e2d8 --- /dev/null +++ b/wandb/settings @@ -0,0 +1,5 @@ +[default] +entity = instructor +project = query-understanding +base_url = https://api.wandb.ai +