clean up notebooks

This commit is contained in:
Jason Liu
2023-12-22 15:23:52 -05:00
parent c2fc90265f
commit feb2a532ca
4 changed files with 180 additions and 193 deletions
+5
View File
@@ -162,3 +162,8 @@ cython_debug/
examples/citation_with_extraction/fly.toml
my_cache_directory/
tutorials/wandb/*
tutorials/results.csv
tutorials/results.jsonl
tutorials/results.jsonlines
tutorials/schema.json
wandb/settings
+138 -193
View File
@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -107,7 +107,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -125,75 +125,43 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'hypothetical_questions': ['How does a simple RAG Model work?',\n",
" 'What are the key challenges faced by simple RAG '\n",
" 'systems?',\n",
" 'Can a simple RAG model handle complex queries '\n",
" 'effectively?'],\n",
"{'hypothetical_questions': ['What is the most basic implementation of '\n",
" 'Retrieval-Augmented Generation?',\n",
" 'Why might simple RAG not be adequate for complex '\n",
" 'user queries?'],\n",
" 'keywords': ['Retrieval-Augmented Generation',\n",
" 'RAG',\n",
" 'user query',\n",
" 'vector database',\n",
" 'embeddings',\n",
" 'search limitations'],\n",
" 'summary': 'The simple Retrieval-Augmented Generation (RAG) model is an '\n",
" 'approach that uses a vector database to embed and search for user '\n",
" 'queries, such as Wikipedia articles. It provides answers by '\n",
" 'aligning query and document embeddings. However, it has several '\n",
" 'limitations, including query-document mismatch, reliance on a '\n",
" 'monolithic search backend, text search limitations, and a limited '\n",
" 'planning ability.',\n",
" 'topic': 'Simple RAG'}\n",
"{'hypothetical_questions': ['What is a query-document mismatch in the context '\n",
" 'of RAG models?',\n",
" 'Why might a simple RAG model struggle with '\n",
" 'specific user queries?'],\n",
" 'keywords': ['query-document mismatch',\n",
" 'RAG limitation',\n",
" 'embedding alignment'],\n",
" 'summary': \"A limitation where the simple RAG system's query and document \"\n",
" 'embeddings may not align properly, leading to ineffective '\n",
" \"retrieval of information specific to a user's query about, for \"\n",
" \"example, 'climate change effects on marine life'.\",\n",
" 'topic': 'Query-Document Mismatch'}\n",
"{'hypothetical_questions': ['Why is depending on a monolithic search backend a '\n",
" 'limitation for simple RAG?',\n",
" 'How can a monolithic search backend affect the '\n",
" 'quality of search results?'],\n",
" 'keywords': ['monolithic search backend', 'RAG system', 'data sources'],\n",
" 'summary': 'In simple RAG, the reliance on a single search method and backend '\n",
" \"can limit the system's ability to access diverse or specialized \"\n",
" \"data sources, such as when searching for 'latest research in \"\n",
" \"quantum computing'.\",\n",
" 'topic': 'Monolithic Search Backend'}\n",
"{'hypothetical_questions': ['How do text search limitations impact the '\n",
" 'effectiveness of simple RAG?',\n",
" 'Can simple RAG models understand the context of '\n",
" 'search terms?'],\n",
" 'keywords': ['text search limitations', 'RAG', 'advanced search'],\n",
" 'summary': 'The simple RAG model is limited to straightforward text queries '\n",
" 'without advanced search capabilities, failing to resolve nuanced '\n",
" \"queries like 'what problems did we fix last week' due to the \"\n",
" \"presence of generic terms such as 'problem' and 'last week' \"\n",
" 'throughout documents.',\n",
" 'topic': 'Text Search Limitations'}\n",
"{'hypothetical_questions': ['What does limited planning ability imply for a '\n",
" \"RAG model's search results?\",\n",
" 'Can simple RAG models provide context-specific '\n",
" 'information effectively?'],\n",
" 'keywords': ['limited planning ability', 'contextual information', 'RAG'],\n",
" 'summary': 'Simple RAG models struggle to incorporate additional context in '\n",
" 'their searches, which may result in less relevant or overly '\n",
" 'general responses to queries that require specific insights, like '\n",
" \"'Tips for first-time Europe travelers'.\",\n",
" 'topic': 'Limited Planning Ability'}\n"
" 'simple implementation'],\n",
" 'summary': 'The simplest form of RAG involves embedding a user query and '\n",
" 'performing a single search in a vector database, such as a '\n",
" 'Wikipedia article store. This method, however, often fails with '\n",
" 'complex queries and diverse data sources.',\n",
" 'topic': 'Simple Retrieval-Augmented Generation (RAG)'}\n",
"{'hypothetical_questions': ['What are the main drawbacks of the simple RAG '\n",
" 'model?',\n",
" 'How does Query-Document Mismatch affect search '\n",
" 'results in simple RAG?',\n",
" 'Why is relying on a monolithic search backend '\n",
" 'problematic for RAG?'],\n",
" 'keywords': ['limitations',\n",
" 'query-document mismatch',\n",
" 'monolithic search backend',\n",
" 'text search limitations',\n",
" 'limited planning ability'],\n",
" 'summary': 'The limitations of simple RAG include the Query-Document Mismatch '\n",
" 'which assumes a perfect alignment of embeddings, the reliance on '\n",
" 'a Monolithic Search Backend which limits flexibility, Text Search '\n",
" 'Limitations that impede nuanced search, and a Limited Planning '\n",
" 'Ability that overlooks additional context for refining results.',\n",
" 'topic': 'Limitations of Simple RAG'}\n"
]
}
],
@@ -261,7 +229,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -296,16 +264,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Query(rewritten_query='recent developments in AI', published_daterange=DateRange(start=datetime.date(2023, 1, 1), end=datetime.date(2023, 12, 21)))"
"Query(rewritten_query='recent developments in AI', published_daterange=DateRange(start=datetime.date(2023, 1, 1), end=datetime.date(2023, 12, 22)))"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -325,7 +293,8 @@
" )\n",
"\n",
"\n",
"expand_query(\"What are some recent developments in AI?\")"
"query = expand_query(\"What are some recent developments in AI?\")\n",
"query"
]
},
{
@@ -398,55 +367,71 @@
"\n",
"1. Save input and output pairs for later\n",
"2. Save the JSON schema for the response_model\n",
"3. Having snapshots of the model and data allow us to compare results over time, and as we make changes to the model we can see how the results change.\n"
"3. Having snapshots of the model and data allows us to compare results over time, and as we make changes to the model we can see how the results change.\n",
"\n",
"This is particularly useful when we might want to blend a mix of synthetic and real data to evaluate our model. We can use the `wandb` library to track our experiments and save the results to a dashboard.\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"\n",
"def flatten_dict(d, parent_key=\"\", sep=\"_\"):\n",
" \"\"\"\n",
" Flatten a nested dictionary.\n",
"\n",
" :param d: The nested dictionary to flatten.\n",
" :param parent_key: The base key to use for the flattened keys.\n",
" :param sep: Separator to use between keys.\n",
" :return: A flattened dictionary.\n",
" \"\"\"\n",
" items = []\n",
" for k, v in d.items():\n",
" new_key = f\"{parent_key}{sep}{k}\" if parent_key else k\n",
" if isinstance(v, dict):\n",
" items.extend(flatten_dict(v, new_key, sep=sep).items())\n",
" else:\n",
" items.append((new_key, v))\n",
" return dict(items)\n",
"\n",
"\n",
"def dicts_to_df(list_of_dicts):\n",
" \"\"\"\n",
" Convert a list of dictionaries to a pandas DataFrame.\n",
"\n",
" :param list_of_dicts: List of dictionaries, potentially nested.\n",
" :return: A pandas DataFrame representing the flattened data.\n",
" \"\"\"\n",
" # Flatten each dictionary and create a DataFrame\n",
" flattened_data = [flatten_dict(d) for d in list_of_dicts]\n",
" return pd.DataFrame(flattened_data)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 18,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import json\n",
"import wandb\n",
"from helpers import dicts_to_df\n",
"\n",
"\n",
"class DateRange(BaseModel):\n",
" chain_of_thought: str = Field(\n",
" description=\"Think step by step to plan what is the best time range to search in\"\n",
" )\n",
" start: date\n",
" end: date\n",
"\n",
"\n",
"class Query(BaseModel):\n",
" rewritten_query: str = Field(\n",
" description=\"Rewrite the query to make it more specific\"\n",
" )\n",
" published_daterange: DateRange = Field(\n",
" description=\"Effective date range to search in\"\n",
" )\n",
"\n",
" def report(self):\n",
" dct = self.model_dump()\n",
" dct[\"usage\"] = self._raw_response.usage.model_dump()\n",
" return dct\n",
"\n",
"\n",
"from openai import AsyncOpenAI\n",
"\n",
"# We'll use a different client for async calls\n",
"# To highlight the difference and how we can use both\n",
"aclient = instructor.patch(AsyncOpenAI())\n",
"\n",
"\n",
"async def expand_query(q) -> Query:\n",
" return await aclient.chat.completions.create(\n",
" model=\"gpt-4-1106-preview\",\n",
" response_model=Query,\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": f\"You're a query understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...\",\n",
" },\n",
" {\"role\": \"user\", \"content\": f\"query: {q}\"},\n",
" ],\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
@@ -476,7 +461,7 @@
{
"data": {
"text/html": [
"Run data is saved locally in <code>/Users/jasonliu/dev/instructor/tutorials/wandb/run-20231221_153734-idscpy5k</code>"
"Run data is saved locally in <code>/Users/jasonliu/dev/instructor/tutorials/wandb/run-20231222_152028-opuq58lr</code>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -488,7 +473,7 @@
{
"data": {
"text/html": [
"Syncing run <strong><a href='https://wandb.ai/instructor/query-understanding/runs/idscpy5k' target=\"_blank\">easy-feather-16</a></strong> to <a href='https://wandb.ai/instructor/query-understanding' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
"Syncing run <strong><a href='https://wandb.ai/instructor/query-understanding/runs/opuq58lr' target=\"_blank\">major-firebrand-21</a></strong> to <a href='https://wandb.ai/instructor/query-understanding' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -512,7 +497,7 @@
{
"data": {
"text/html": [
" View run at <a href='https://wandb.ai/instructor/query-understanding/runs/idscpy5k' target=\"_blank\">https://wandb.ai/instructor/query-understanding/runs/idscpy5k</a>"
" View run at <a href='https://wandb.ai/instructor/query-understanding/runs/opuq58lr' target=\"_blank\">https://wandb.ai/instructor/query-understanding/runs/opuq58lr</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -525,18 +510,39 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Retrying, exception: 1 validation error for Query\n",
"rewritten_query\n",
" Field required [type=missing, input_value={'rewitten_query': 'recen...', 'end': '2023-12-22'}}, input_type=dict]\n",
" For further information visit https://errors.pydantic.dev/2.5/v/missing\n",
"Traceback (most recent call last):\n",
" File \"/Users/jasonliu/dev/instructor/instructor/patch.py\", line 231, in retry_async\n",
" return await process_response_async(\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/jasonliu/dev/instructor/instructor/patch.py\", line 201, in process_response_async\n",
" model = await response_model.from_response_async(\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/jasonliu/dev/instructor/instructor/function_calls.py\", line 198, in from_response_async\n",
" return cls.model_validate_json(\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/jasonliu/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py\", line 532, in model_validate_json\n",
" return cls.__pydantic_validator__.validate_json(json_data, strict=strict, context=context)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
"pydantic_core._pydantic_core.ValidationError: 1 validation error for Query\n",
"rewritten_query\n",
" Field required [type=missing, input_value={'rewitten_query': 'recen...', 'end': '2023-12-22'}}, input_type=dict]\n",
" For further information visit https://errors.pydantic.dev/2.5/v/missing\n",
"wandb: WARNING Source type is set to 'repo' but some required information is missing from the environment. A job will not be created from this run. See https://docs.wandb.ai/guides/launch/create-job\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6440cf236ba24c3b839d1256cfada604",
"model_id": "96b112129c944465a35156a6ffbdfe54",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded (0.001 MB deduped)\\r'), FloatProgress(value=1.0, max…"
"VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.001 MB deduped)\\r'), FloatProgress(value=1.0, max…"
]
},
"metadata": {},
@@ -545,7 +551,7 @@
{
"data": {
"text/html": [
"W&B sync reduced upload amount by 9.0% "
"W&B sync reduced upload amount by 7.9% "
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -557,7 +563,7 @@
{
"data": {
"text/html": [
" View run <strong style=\"color:#cdcd00\">easy-feather-16</strong> at: <a href='https://wandb.ai/instructor/query-understanding/runs/idscpy5k' target=\"_blank\">https://wandb.ai/instructor/query-understanding/runs/idscpy5k</a><br/>Synced 4 W&B file(s), 1 media file(s), 4 artifact file(s) and 0 other file(s)"
" View run <strong style=\"color:#cdcd00\">major-firebrand-21</strong> at: <a href='https://wandb.ai/instructor/query-understanding/runs/opuq58lr' target=\"_blank\">https://wandb.ai/instructor/query-understanding/runs/opuq58lr</a><br/>Synced 5 W&B file(s), 1 media file(s), 4 artifact file(s) and 0 other file(s)"
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -569,7 +575,7 @@
{
"data": {
"text/html": [
"Find logs at: <code>./wandb/run-20231221_153734-idscpy5k/logs</code>"
"Find logs at: <code>./wandb/run-20231222_152028-opuq58lr/logs</code>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -580,39 +586,7 @@
}
],
"source": [
"import json\n",
"import wandb\n",
"\n",
"\n",
"class DateRange(BaseModel):\n",
" chain_of_thought: str = Field(\n",
" description=\"Think step by step to plan what is the best time range to search in\"\n",
" )\n",
" start: date\n",
" end: date\n",
"\n",
"\n",
"class Query(BaseModel):\n",
" rewritten_query: str = Field(\n",
" description=\"Rewrite the query to make it more specific\"\n",
" )\n",
" published_daterange: DateRange = Field(\n",
" description=\"Effective date range to search in\"\n",
" )\n",
"\n",
"\n",
"def expand_query(q) -> Query:\n",
" return client.chat.completions.create(\n",
" model=\"gpt-4-1106-preview\",\n",
" response_model=Query,\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": f\"You're a query understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...\",\n",
" },\n",
" {\"role\": \"user\", \"content\": f\"query: {q}\"},\n",
" ],\n",
" )\n",
"import asyncio\n",
"\n",
"\n",
"run = wandb.init(\n",
@@ -626,7 +600,7 @@
" \"biotechnology updates last 10 days\",\n",
"]\n",
"\n",
"queries = [expand_query(q) for q in test_queries]\n",
"queries = await asyncio.gather(*[expand_query(q) for q in test_queries])\n",
"\n",
"with open(\"schema.json\", \"w+\") as f:\n",
" schema = Query.model_json_schema()\n",
@@ -636,7 +610,7 @@
" for query in queries:\n",
" f.write(query.model_dump_json() + \"\\n\")\n",
"\n",
"df = dicts_to_df([q.model_dump() for q in queries])\n",
"df = dicts_to_df([q.report() for q in queries])\n",
"df[\"input\"] = test_queries\n",
"df.to_csv(\"results.csv\")\n",
"\n",
@@ -652,13 +626,6 @@
"run.finish()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
@@ -674,7 +641,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
@@ -682,7 +649,7 @@
"\n",
"\n",
"class SearchClient(BaseModel):\n",
" query: str\n",
" query: str = Field(description=\"The search query that will go into the search bar\")\n",
" keywords: List[str]\n",
" email: str\n",
" source: Literal[\"gmail\", \"calendar\"]\n",
@@ -706,43 +673,19 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"queries\": [\n",
" {\n",
" \"query\": \"schedule\",\n",
" \"keywords\": [\n",
" \"appointments\",\n",
" \"meetings\",\n",
" \"schedule\",\n",
" \"events\"\n",
" ],\n",
" \"email\": \"jason.assistant@busybot.com\",\n",
" \"source\": \"calendar\",\n",
" \"date_range\": {\n",
" \"start\": \"2023-11-18\",\n",
" \"end\": \"2023-11-18\"\n",
" }\n",
" }\n",
" ]\n",
"}\n"
]
}
],
"outputs": [],
"source": [
"retrival = client.chat.completions.create(\n",
" model=\"gpt-4-1106-preview\",\n",
" model=\"gpt-3.5-turbo\",\n",
" response_model=Retrival,\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": f\"You are Jason's personal assistant. Today is {date.today()}\",\n",
" \"content\": f\"\"\"You are Jason's personal assistant.\n",
" He has two emails jason@work.com jason@personal.com \n",
" Today is {date.today()}\"\"\",\n",
" },\n",
" {\"role\": \"user\", \"content\": \"What do I have today?\"},\n",
" ],\n",
@@ -810,11 +753,13 @@
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": f\"You are Jason's personal assistant. Today is {date.today()}\",\n",
" \"content\": f\"\"\"You are Jason's personal assistant.\n",
" He has two emails jason@work.com jason@personal.com \n",
" Today is {date.today()}\"\"\",\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"What meetings do I have today and are there any important emails I should be aware of?\",\n",
" \"content\": \"What meetings do I have today and are there any important emails I should be aware of\",\n",
" },\n",
" ],\n",
")\n",
+32
View File
@@ -0,0 +1,32 @@
import pandas as pd
def flatten_dict(d, parent_key="", sep="_"):
    """
    Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their ancestors' keys with
    ``sep``; values that are not ``dict`` instances are copied unchanged.

    :param d: The (possibly nested) dictionary to collapse.
    :param parent_key: Prefix placed in front of every key of ``d``; when it
        is empty (falsy) the keys of ``d`` are used unchanged.
    :param sep: String inserted between a prefix and a key.
    :return: A new dictionary with no ``dict`` values.
    """
    flat = {}
    for key, value in d.items():
        # Only prefix when there is a non-empty parent path.
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if not isinstance(value, dict):
            flat[path] = value
            continue
        # Recurse into the nested dict, carrying the path built so far.
        flat.update(flatten_dict(value, path, sep=sep))
    return flat
def dicts_to_df(list_of_dicts):
    """
    Build a pandas DataFrame from dictionaries that may be nested.

    Each dictionary is passed through ``flatten_dict`` (default prefix and
    separator) and the flattened results are handed to ``pd.DataFrame``.

    :param list_of_dicts: Iterable of dictionaries, potentially nested.
    :return: A pandas DataFrame built from the flattened dictionaries.
    """
    rows = []
    for record in list_of_dicts:
        # Flatten first so nested values are no longer dict-valued.
        rows.append(flatten_dict(record))
    return pd.DataFrame(rows)
+5
View File
@@ -0,0 +1,5 @@
[default]
entity = instructor
project = query-understanding
base_url = https://api.wandb.ai