docs(hub): add pandas_df.md and update mkdocs.yml (#451)

This commit is contained in:
Jason Liu
2024-02-19 21:33:49 -05:00
committed by GitHub
parent 23897685f5
commit 4d19a110c8
2 changed files with 141 additions and 0 deletions
+140
View File
@@ -0,0 +1,140 @@
# Extracting directly to a DataFrame
In this example we'll show you how to extract directly to a `pandas.DataFrame`.
You can pull this example into your IDE by running the following command:
```bash
instructor hub pull --slug pandas_df --py > pandas_df.py
```
```python
from io import StringIO
from typing import Annotated, Any
from pydantic import (
BaseModel,
BeforeValidator,
PlainSerializer,
InstanceOf,
WithJsonSchema,
)
import pandas as pd
import instructor
import openai
def md_to_df(data: Any) -> Any:
    """Parse a markdown table string into a ``pandas.DataFrame``.

    The text is read as ``|``-separated values and the second parsed column
    becomes the index (with rows that start with ``|`` that is the table's
    first visible column, because the field before the leading ``|`` is
    empty).  Columns that are entirely empty are dropped, the first parsed
    row (the ``|---|---|`` separator line of a standard markdown table) is
    discarded, and surrounding whitespace is stripped from every string
    cell.  Header and index labels are returned exactly as parsed.

    Args:
        data: A markdown table as a string, or any other object.

    Returns:
        The parsed DataFrame when ``data`` is a string; otherwise ``data``
        itself, unchanged.
    """
    if not isinstance(data, str):
        return data

    frame = (
        pd.read_csv(
            StringIO(data),
            sep="|",
            index_col=1,
        )
        .dropna(axis=1, how="all")
        .iloc[1:]
    )

    # ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    # ``DataFrame.map``.  Look the method up on the class (not the instance)
    # so a column that happens to be named "map" cannot shadow it.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap

    # An empty cell (``||``) parses as NaN, which is a float and has no
    # ``strip``; only string cells are stripped, everything else is kept.
    return elementwise(
        frame,
        lambda cell: cell.strip() if isinstance(cell, str) else cell,
    )
MarkdownDataFrame = Annotated[
    # Annotated type for a DataFrame that is exchanged as markdown text.
    # The base type comes first; each metadata entry after it is described
    # inline and kept in its original order.
    #
    # Validates final type: the value must end up as a pandas DataFrame
    InstanceOf[pd.DataFrame],
    # Converts markdown to DataFrame before that check; md_to_df returns
    # non-string input unchanged
    BeforeValidator(md_to_df),
    # Converts DataFrame to markdown on model_dump_json
    PlainSerializer(lambda df: df.to_markdown()),
    # Adds a description to the type; the declared JSON schema is a plain
    # string carrying the instructions below.
    # NOTE(review): "seperate" in the description is a typo for "separate";
    # it is runtime text, so it is left untouched here.
    WithJsonSchema(
        {
            "type": "string",
            "description": """
The markdown representation of the table,
each one should be tidy, do not try to join
tables that should be seperate""",
        }
    ),
]
# Shared OpenAI client wrapped with `instructor.patch`; the extraction
# functions in this example pass `response_model=` to
# `client.chat.completions.create`.
client = instructor.patch(openai.OpenAI())
def extract_df(data: str) -> pd.DataFrame:
    """Request a table for ``data`` and return it as a DataFrame.

    Sends a system prompt plus a user prompt embedding ``data`` to
    ``gpt-3.5-turbo`` through the module-level ``client``, asking for a
    ``MarkdownDataFrame`` response.

    Args:
        data: Text describing, or containing, the data to tabulate.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``response_model=MarkdownDataFrame`` (annotated as a DataFrame).
    """
    # NOTE(review): "table of writing" reads like a typo for "capable of
    # writing"; the prompt is reproduced verbatim.
    system_message = {
        "role": "system",
        "content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
    }
    user_message = {
        "role": "user",
        "content": f"Extract the data into a table: {data}",
    }
    conversation = [system_message, user_message]

    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=MarkdownDataFrame,
        messages=conversation,
    )
class Table(BaseModel):
    # Response model pairing a table with a title.

    # Title of the table.
    title: str
    # Table contents, typed as MarkdownDataFrame (declared earlier in this
    # example).
    data: MarkdownDataFrame
def extract_table(data: str) -> Table:
    """Request a titled table for ``data`` and return it as a ``Table``.

    Sends a system prompt plus a user prompt embedding ``data`` to
    ``gpt-3.5-turbo`` through the module-level ``client``, asking for a
    ``Table`` response.

    Args:
        data: Text describing, or containing, the data to tabulate.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``response_model=Table`` (annotated as a ``Table``).
    """
    # NOTE(review): "table of writing" reads like a typo for "capable of
    # writing"; the prompt is reproduced verbatim.
    system_message = {
        "role": "system",
        "content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
    }
    user_message = {
        "role": "user",
        "content": f"Extract the data into a table: {data}",
    }
    conversation = [system_message, user_message]

    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=Table,
        messages=conversation,
    )
if __name__ == "__main__":
    # The same request text drives both extraction styles below.
    request = """Create a table of the last 5 presidents of the United States,
including their party and the years they served."""

    # 1) Extract straight to a DataFrame.
    frame = extract_df(request)
    assert isinstance(frame, pd.DataFrame)
    print(frame)
    """
Party Years Served
President
Joe Biden Democratic 2021-
Donald Trump Republican 2017-2021
Barack Obama Democratic 2009-2017
George W. Bush Republican 2001-2009
Bill Clinton Democratic 1993-2001
"""

    # 2) Extract to a model that carries a title alongside the DataFrame.
    result = extract_table(request)
    assert isinstance(result, Table)
    assert isinstance(result.data, pd.DataFrame)
    print(result.title)
    #> Last 5 Presidents of the United States
    print(result.data)
    """
Party Years Served
President
Joe Biden Democrat 2021 - Present
Donald Trump Republican 2017 - 2021
Barack Obama Democrat 2009 - 2017
George W. Bush Republican 2001 - 2009
Bill Clinton Democrat 1993 - 2001
"""
```
Notice that you can extract either the raw `MarkdownDataFrame` or a more complex structure like `Table`, which includes a title and the data as a DataFrame. You can even request `Iterable[Table]` to get multiple tables in a single response!
+1
View File
@@ -168,6 +168,7 @@ nav:
- Introducing Instructor Hub: 'hub/index.md'
- Single Classification Model: 'hub/single_classification.md'
- Multiple Classification Model: 'hub/multiple_classification.md'
- Extract Pandas Dataframes: 'hub/pandas_df.md'
- Ollama: 'hub/ollama.md'
- Llama CPP: 'hub/llama-cpp-python.md'
- Together: 'hub/together.md'