From 4d19a110c87359e3805da4ed2c8deb1910aec9f2 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Mon, 19 Feb 2024 21:33:49 -0500 Subject: [PATCH] docs(hub): add pandas_df.md and update mkdocs.yml (#451) --- docs/hub/pandas_df.md | 140 ++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 141 insertions(+) create mode 100644 docs/hub/pandas_df.md diff --git a/docs/hub/pandas_df.md b/docs/hub/pandas_df.md new file mode 100644 index 0000000..7cafbf5 --- /dev/null +++ b/docs/hub/pandas_df.md @@ -0,0 +1,140 @@ +# Extracting directly to a DataFrame + +In this example we'll show you how to extract directly to a `pandas.DataFrame` + +You can pull this example into your IDE by running the following command: + +```bash +instructor hub pull --slug pandas_df --py > pandas_df.py +``` + +```python +from io import StringIO +from typing import Annotated, Any +from pydantic import ( + BaseModel, + BeforeValidator, + PlainSerializer, + InstanceOf, + WithJsonSchema, +) +import pandas as pd +import instructor +import openai + + +def md_to_df(data: Any) -> Any: + # Convert markdown to DataFrame + if isinstance(data, str): + return ( + pd.read_csv( + StringIO(data), # Process data + sep="|", + index_col=1, + ) + .dropna(axis=1, how="all") + .iloc[1:] + .applymap(lambda x: x.strip()) + ) + return data + + +MarkdownDataFrame = Annotated[ + # Validates final type + InstanceOf[pd.DataFrame], + # Converts markdown to DataFrame + BeforeValidator(md_to_df), + # Converts DataFrame to markdown on model_dump_json + PlainSerializer(lambda df: df.to_markdown()), + # Adds a description to the type + WithJsonSchema( + { + "type": "string", + "description": """ + The markdown representation of the table, + each one should be tidy, do not try to join + tables that should be seperate""", + } + ), +] + +client = instructor.patch(openai.OpenAI()) + + +def extract_df(data: str) -> pd.DataFrame: + return client.chat.completions.create( + model="gpt-3.5-turbo", + response_model=MarkdownDataFrame, + messages=[ + { + "role": "system", + "content": "You are a data extraction system, table of writing perfectly formatted markdown tables.", + }, + { + "role": "user", + "content": f"Extract the data into a table: {data}", + }, + ], + ) + + +class Table(BaseModel): + title: str + data: MarkdownDataFrame + + +def extract_table(data: str) -> Table: + return client.chat.completions.create( + model="gpt-3.5-turbo", + response_model=Table, + messages=[ + { + "role": "system", + "content": "You are a data extraction system, table of writing perfectly formatted markdown tables.", + }, + { + "role": "user", + "content": f"Extract the data into a table: {data}", + }, + ], + ) + + +if __name__ == "__main__": + df = extract_df( + """Create a table of the last 5 presidents of the United States, + including their party and the years they served.""" + ) + assert isinstance(df, pd.DataFrame) + print(df) + """ + Party Years Served + President + Joe Biden Democratic 2021- + Donald Trump Republican 2017-2021 + Barack Obama Democratic 2009-2017 + George W. Bush Republican 2001-2009 + Bill Clinton Democratic 1993-2001 + """ + + table = extract_table( + """Create a table of the last 5 presidents of the United States, + including their party and the years they served.""" + ) + assert isinstance(table, Table) + assert isinstance(table.data, pd.DataFrame) + print(table.title) + #> Last 5 Presidents of the United States + print(table.data) + """ + Party Years Served + President + Joe Biden Democrat 2021 - Present + Donald Trump Republican 2017 - 2021 + Barack Obama Democrat 2009 - 2017 + George W. Bush Republican 2001 - 2009 + Bill Clinton Democrat 1993 - 2001 + """ +``` + +Notice that you can extract both the raw `MarkdownDataFrame` or a more complex structure like `Table` which includes a title and the data as a DataFrame. You can even request `Iterable[Table]` to get multiple tables in a single response! diff --git a/mkdocs.yml b/mkdocs.yml index ed1a333..ea6c6c8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -168,6 +168,7 @@ nav: - Introducing Instructor Hub: 'hub/index.md' - Single Classification Model: 'hub/single_classification.md' - Multiple Classification Model: 'hub/multiple_classification.md' + - Extract Pandas Dataframes: 'hub/pandas_df.md' - Ollama: 'hub/ollama.md' - Llama CPP: 'hub/llama-cpp-python.md' - Together: 'hub/together.md'