mirror of
https://github.com/kennethreitz/instructor.git
synced 2026-06-05 22:50:18 +00:00
122 lines
3.5 KiB
Python
122 lines
3.5 KiB
Python
from openai import OpenAI
|
|
from io import StringIO
|
|
from typing import Annotated, Any, List
|
|
from pydantic import (
|
|
BaseModel,
|
|
BeforeValidator,
|
|
PlainSerializer,
|
|
InstanceOf,
|
|
WithJsonSchema,
|
|
)
|
|
import instructor
|
|
import pandas as pd
|
|
|
|
|
|
# Build the OpenAI client and patch it with instructor in MD_JSON mode in a
# single expression; `extract` uses this patched client when it passes
# `response_model` to `chat.completions.create`.
client = instructor.patch(
    OpenAI(),
    mode=instructor.function_calls.Mode.MD_JSON,
)
|
|
|
|
|
|
def _strip_text(value: Any) -> Any:
    """Return ``value`` without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


def md_to_df(data: Any) -> Any:
    """Convert a pipe-delimited markdown table string into a DataFrame.

    Non-string input is returned untouched, so an existing DataFrame (or any
    other object) passes straight through.

    Args:
        data: Either the markdown text of a table or any other object.

    Returns:
        For string input, a DataFrame indexed by the table's first column with
        whitespace stripped from cell values, column headers, index labels and
        the index name. For any other input, ``data`` itself.

    Notes:
        Assumes every row starts and ends with ``|`` and that the second line
        is the ``---`` separator row -- TODO confirm against real model output.
    """
    if not isinstance(data, str):
        return data

    frame = (
        pd.read_csv(
            StringIO(data),
            sep="|",
            # Field 0 is the empty text before each row's leading pipe, so the
            # first real column is field 1.
            index_col=1,
        )
        # Drop columns that are entirely NaN (e.g. the empty fields produced
        # by the leading/trailing pipes).
        .dropna(axis=1, how="all")
        # Discard the first parsed row, expected to be the `---` separator.
        .iloc[1:]
    )

    # Per-column Series.map avoids depending on DataFrame.map (pandas >= 2.1)
    # and _strip_text leaves non-string cells such as NaN untouched instead of
    # raising AttributeError.
    cleaned = frame.apply(lambda column: column.map(_strip_text))

    # Headers and index labels carry the same markdown padding as the cells.
    cleaned.columns = [_strip_text(label) for label in cleaned.columns]
    cleaned.index = pd.Index(
        [_strip_text(label) for label in cleaned.index],
        name=_strip_text(cleaned.index.name),
    )
    return cleaned
|
|
|
|
|
|
# Text used as the "description" entry of the JSON schema below. It is runtime
# text, so the wording is kept exactly as written.
_MARKDOWN_TABLE_DESCRIPTION = """
The markdown representation of the table,
each one should be tidy, do not try to join tables
that should be seperate"""

# Annotated DataFrame type for pydantic models, combining:
#   * an InstanceOf[pd.DataFrame] check,
#   * `md_to_df` as a before-validator,
#   * a serializer that calls `to_markdown()` on the frame,
#   * a JSON schema that advertises the field as a string.
# The serializer stays an unannotated lambda so no return type is declared.
MarkdownDataFrame = Annotated[
    InstanceOf[pd.DataFrame],
    BeforeValidator(md_to_df),
    PlainSerializer(lambda frame: frame.to_markdown()),
    WithJsonSchema(
        {
            "type": "string",
            "description": _MARKDOWN_TABLE_DESCRIPTION,
        }
    ),
]
|
|
|
|
|
|
# One table: a caption plus its contents as a MarkdownDataFrame.
class Table(BaseModel):
    # Caption text for the table.
    caption: str
    # Table contents, typed with the MarkdownDataFrame annotation defined in
    # this file.
    dataframe: MarkdownDataFrame
|
|
|
|
|
|
# Container for a list of Table objects; `extract` passes this class as the
# `response_model` of its completion request.
class MultipleTables(BaseModel):
    # The tables, in list order.
    tables: List[Table]
|
|
|
|
|
|
# Sample payload: a single captioned table with three columns and two rows.
# `extract` embeds its JSON dump in the prompt text.
_example_frame = pd.DataFrame(
    {
        "Chart A": [10, 40],
        "Chart B": [20, 50],
        "Chart C": [30, 60],
    }
)

example = MultipleTables(
    tables=[
        Table(caption="This is a caption", dataframe=_example_frame),
    ]
)
|
|
|
|
|
|
# Follow-up instructions sent after the image. This is runtime prompt text, so
# the wording is kept exactly as written.
_TABLE_INSTRUCTIONS = """
First take a moment to reason about the best set of headers for the tables.
Write a good h1 for the image above. Then follow up with a short description of the what the data is about.
Then for each table you identified, write a h2 tag that is a descriptive title of the table.
Then follow up with a short description of the what the data is about.
Lastly, produce the markdown table for each table you identified.


Make sure to escape the markdown table properly, and make sure to include the caption and the dataframe.
including escaping all the newlines and quotes. Only return a markdown table in dataframe, nothing else.
"""


def extract(
    url: str,
    *,
    model: str = "gpt-4-vision-preview",
    max_tokens: int = 4000,
) -> dict[str, Any]:
    """Ask the model to transcribe the tables in an image and return them as a dict.

    A single user message is sent with three parts, in order: an introductory
    text that embeds the JSON dump of ``example``, the image URL, and the
    follow-up instructions.

    Args:
        url: URL of the image, passed as the ``image_url`` message part.
        model: Model name for the completion request. Defaults to the value
            that was previously hard-coded.
        max_tokens: Token limit for the completion request. Defaults to the
            value that was previously hard-coded.

    Returns:
        The result of calling ``model_dump()`` on the response, i.e. a plain
        dict rather than a ``MultipleTables`` instance. The return annotation
        previously claimed ``MultipleTables``; it now matches what is returned.
    """
    intro_text = f"Describe this data accurately as a table in markdown format. {example.model_dump_json(indent=2)}"

    tables = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        response_model=MultipleTables,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": intro_text},
                    {"type": "image_url", "image_url": {"url": url}},
                    {"type": "text", "text": _TABLE_INSTRUCTIONS},
                ],
            }
        ],
    )
    # The response is expected to be a MultipleTables instance (the requested
    # response_model); callers receive its dict form.
    return tables.model_dump()
|
|
|
|
|
|
# Image URLs (PNG files) to run the extraction on.
urls = [
    "https://a.storyblok.com/f/47007/2400x1260/f816b031cb/uk-ireland-in-three-charts_chart_a.png/m/2880x0",
    "https://a.storyblok.com/f/47007/2400x2000/bf383abc3c/231031_uk-ireland-in-three-charts_table_v01_b.png/m/2880x0",
]

# Call `extract` for each URL in order and print what it returns.
# NOTE(review): this runs at module top level, so importing the file triggers
# the requests -- there is no `__main__` guard.
for url in urls:
    tables = extract(url)
    print(tables)
|