mirror of
https://github.com/kennethreitz/instructor.git
synced 2026-06-05 22:50:18 +00:00
docs(hub): add pandas_df.md and update mkdocs.yml (#451)
This commit is contained in:
@@ -0,0 +1,140 @@
|
||||
# Extracting directly to a DataFrame
|
||||
|
||||
In this example we'll show you how to extract directly to a `pandas.DataFrame`
|
||||
|
||||
You can pull this example into your IDE by running the following command:
|
||||
|
||||
```bash
|
||||
instructor hub pull --slug pandas_df --py > pandas_df.py
|
||||
```
|
||||
|
||||
```python
|
||||
from io import StringIO
|
||||
from typing import Annotated, Any
|
||||
from pydantic import (
|
||||
BaseModel,
|
||||
BeforeValidator,
|
||||
PlainSerializer,
|
||||
InstanceOf,
|
||||
WithJsonSchema,
|
||||
)
|
||||
import pandas as pd
|
||||
import instructor
|
||||
import openai
|
||||
|
||||
|
||||
def md_to_df(data: Any) -> Any:
|
||||
# Convert markdown to DataFrame
|
||||
if isinstance(data, str):
|
||||
return (
|
||||
pd.read_csv(
|
||||
StringIO(data), # Process data
|
||||
sep="|",
|
||||
index_col=1,
|
||||
)
|
||||
.dropna(axis=1, how="all")
|
||||
.iloc[1:]
|
||||
.applymap(lambda x: x.strip())
|
||||
)
|
||||
return data
|
||||
|
||||
|
||||
MarkdownDataFrame = Annotated[
|
||||
# Validates final type
|
||||
InstanceOf[pd.DataFrame],
|
||||
# Converts markdown to DataFrame
|
||||
BeforeValidator(md_to_df),
|
||||
# Converts DataFrame to markdown on model_dump_json
|
||||
PlainSerializer(lambda df: df.to_markdown()),
|
||||
# Adds a description to the type
|
||||
WithJsonSchema(
|
||||
{
|
||||
"type": "string",
|
||||
"description": """
|
||||
The markdown representation of the table,
|
||||
each one should be tidy, do not try to join
|
||||
tables that should be seperate""",
|
||||
}
|
||||
),
|
||||
]
|
||||
|
||||
client = instructor.patch(openai.OpenAI())
|
||||
|
||||
|
||||
def extract_df(data: str) -> pd.DataFrame:
|
||||
return client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
response_model=MarkdownDataFrame,
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"Extract the data into a table: {data}",
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class Table(BaseModel):
|
||||
title: str
|
||||
data: MarkdownDataFrame
|
||||
|
||||
|
||||
def extract_table(data: str) -> Table:
|
||||
return client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
response_model=Table,
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"Extract the data into a table: {data}",
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
df = extract_df(
|
||||
"""Create a table of the last 5 presidents of the United States,
|
||||
including their party and the years they served."""
|
||||
)
|
||||
assert isinstance(df, pd.DataFrame)
|
||||
print(df)
|
||||
"""
|
||||
Party Years Served
|
||||
President
|
||||
Joe Biden Democratic 2021-
|
||||
Donald Trump Republican 2017-2021
|
||||
Barack Obama Democratic 2009-2017
|
||||
George W. Bush Republican 2001-2009
|
||||
Bill Clinton Democratic 1993-2001
|
||||
"""
|
||||
|
||||
table = extract_table(
|
||||
"""Create a table of the last 5 presidents of the United States,
|
||||
including their party and the years they served."""
|
||||
)
|
||||
assert isinstance(table, Table)
|
||||
assert isinstance(table.data, pd.DataFrame)
|
||||
print(table.title)
|
||||
#> Last 5 Presidents of the United States
|
||||
print(table.data)
|
||||
"""
|
||||
Party Years Served
|
||||
President
|
||||
Joe Biden Democrat 2021 - Present
|
||||
Donald Trump Republican 2017 - 2021
|
||||
Barack Obama Democrat 2009 - 2017
|
||||
George W. Bush Republican 2001 - 2009
|
||||
Bill Clinton Democrat 1993 - 2001
|
||||
"""
|
||||
```
|
||||
|
||||
Notice that you can extract both the raw `MarkdownDataFrame` or a more complex structure like `Table` which includes a title and the data as a DataFrame. You can even request `Iterable[Table]` to get multiple tables in a single response!
|
||||
@@ -168,6 +168,7 @@ nav:
|
||||
- Introducing Instructor Hub: 'hub/index.md'
|
||||
- Single Classification Model: 'hub/single_classification.md'
|
||||
- Multiple Classification Model: 'hub/multiple_classification.md'
|
||||
- Extract Pandas Dataframes: 'hub/pandas_df.md'
|
||||
- Ollama: 'hub/ollama.md'
|
||||
- Llama CPP: 'hub/llama-cpp-python.md'
|
||||
- Together: 'hub/together.md'
|
||||
|
||||
Reference in New Issue
Block a user