mirror of
https://github.com/kennethreitz/instructor.git
synced 2026-06-20 14:30:58 +00:00
141 lines
3.9 KiB
Markdown
141 lines
3.9 KiB
Markdown
# Extracting directly to a DataFrame
|
|
|
|
You can pull this example into your IDE by running the following command:
|
|
|
|
```bash
|
|
instructor hub pull --slug pandas_df --py > pandas_df.py
|
|
```
|
|
|
|
In this example we'll show you how to extract directly to a `pandas.DataFrame`
|
|
|
|
```python
|
|
from io import StringIO
|
|
from typing import Annotated, Any
|
|
from pydantic import (
|
|
BaseModel,
|
|
BeforeValidator,
|
|
PlainSerializer,
|
|
InstanceOf,
|
|
WithJsonSchema,
|
|
)
|
|
import pandas as pd
|
|
import instructor
|
|
import openai
|
|
|
|
|
|
def md_to_df(data: Any) -> Any:
|
|
# Convert markdown to DataFrame
|
|
if isinstance(data, str):
|
|
return (
|
|
pd.read_csv(
|
|
StringIO(data), # Process data
|
|
sep="|",
|
|
index_col=1,
|
|
)
|
|
.dropna(axis=1, how="all")
|
|
.iloc[1:]
|
|
.applymap(lambda x: x.strip())
|
|
)
|
|
return data
|
|
|
|
|
|
MarkdownDataFrame = Annotated[
|
|
# Validates final type
|
|
InstanceOf[pd.DataFrame],
|
|
# Converts markdown to DataFrame
|
|
BeforeValidator(md_to_df),
|
|
# Converts DataFrame to markdown on model_dump_json
|
|
PlainSerializer(lambda df: df.to_markdown()),
|
|
# Adds a description to the type
|
|
WithJsonSchema(
|
|
{
|
|
"type": "string",
|
|
"description": """
|
|
The markdown representation of the table,
|
|
each one should be tidy, do not try to join
|
|
tables that should be seperate""",
|
|
}
|
|
),
|
|
]
|
|
|
|
client = instructor.patch(openai.OpenAI())
|
|
|
|
|
|
def extract_df(data: str) -> pd.DataFrame:
|
|
return client.chat.completions.create(
|
|
model="gpt-3.5-turbo",
|
|
response_model=MarkdownDataFrame,
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"Extract the data into a table: {data}",
|
|
},
|
|
],
|
|
)
|
|
|
|
|
|
class Table(BaseModel):
|
|
title: str
|
|
data: MarkdownDataFrame
|
|
|
|
|
|
def extract_table(data: str) -> Table:
|
|
return client.chat.completions.create(
|
|
model="gpt-3.5-turbo",
|
|
response_model=Table,
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"Extract the data into a table: {data}",
|
|
},
|
|
],
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
df = extract_df(
|
|
"""Create a table of the last 5 presidents of the United States,
|
|
including their party and the years they served."""
|
|
)
|
|
assert isinstance(df, pd.DataFrame)
|
|
print(df)
|
|
"""
|
|
Party Years Served
|
|
President
|
|
Joe Biden Democratic 2021-Current
|
|
Donald Trump Republican 2017-2021
|
|
Barack Obama Democratic 2009-2017
|
|
George W. Bush Republican 2001-2009
|
|
Bill Clinton Democratic 1993-2001
|
|
"""
|
|
|
|
table = extract_table(
|
|
"""Create a table of the last 5 presidents of the United States,
|
|
including their party and the years they served."""
|
|
)
|
|
assert isinstance(table, Table)
|
|
assert isinstance(table.data, pd.DataFrame)
|
|
print(table.title)
|
|
#> Last 5 Presidents of the United States
|
|
print(table.data)
|
|
"""
|
|
Party Years Served
|
|
President
|
|
Joe Biden Democrat 2021 - Present
|
|
Donald Trump Republican 2017 - 2021
|
|
Barack Obama Democrat 2009 - 2017
|
|
George W. Bush Republican 2001 - 2009
|
|
Bill Clinton Democrat 1993 - 2001
|
|
"""
|
|
```
|
|
|
|
Notice that you can extract both the raw `MarkdownDataFrame` or a more complex structure like `Table` which includes a title and the data as a DataFrame. You can even request `Iterable[Table]` to get multiple tables in a single response!
|