feat(docs/GPT-4 Vision Model): Add document on extracting tables from images and related functionality (#443)

This commit is contained in:
Jason Liu
2024-02-19 21:46:53 -05:00
committed by GitHub
parent 13962830e3
commit ae9615e106
4 changed files with 178 additions and 0 deletions
+159
View File
@@ -0,0 +1,159 @@
# Extracting Tables from Images with OpenAI's GPT-4 Vision Model
First, we define a custom type, `MarkdownDataFrame`, to handle pandas DataFrames formatted in markdown. This type combines Python's `Annotated` with Pydantic's `InstanceOf`, and attaches the `BeforeValidator` and `PlainSerializer` annotations to parse markdown into a DataFrame on input and serialize the DataFrame back to markdown on output.
## Defining the Table Class
The `Table` class is essential for organizing the extracted data. It includes a caption and a dataframe, processed as a markdown table. Since most of the complexity is handled by the `MarkdownDataFrame` type, the `Table` class is straightforward!
This example requires two additional dependencies; install them with `pip install pandas tabulate`.
```python
from openai import OpenAI
from io import StringIO
from typing import Annotated, Any, List
from pydantic import (
BaseModel,
BeforeValidator,
PlainSerializer,
InstanceOf,
WithJsonSchema,
)
import instructor
import pandas as pd
# Wrap the OpenAI client with instructor so that `chat.completions.create`
# can be called with `response_model` (as `extract` does below).
# NOTE(review): MD_JSON presumably asks the model to answer with JSON inside a
# markdown code block - confirm against the instructor documentation.
client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)
def md_to_df(data: Any) -> Any:
if isinstance(data, str):
return (
pd.read_csv(
StringIO(data), # Get rid of whitespaces
sep="|",
index_col=1,
)
.dropna(axis=1, how="all")
.iloc[1:]
.map(lambda x: x.strip())
)
return data
# Pydantic field type for a DataFrame that is exchanged as markdown text:
# - InstanceOf[pd.DataFrame]: the validated value must be a pandas DataFrame.
# - BeforeValidator(md_to_df): input is first passed through `md_to_df`, which
#   parses strings and returns any other value unchanged.
# - PlainSerializer: the frame is serialized with `DataFrame.to_markdown()`.
# - WithJsonSchema: the JSON schema describes the field as a plain string.
MarkdownDataFrame = Annotated[
InstanceOf[pd.DataFrame],
BeforeValidator(md_to_df),
PlainSerializer(lambda x: x.to_markdown()),
WithJsonSchema(
{
"type": "string",
"description": """
The markdown representation of the table,
each one should be tidy, do not try to join tables
that should be seperate""",
}
),
]
class Table(BaseModel):
    # One extracted table: its caption plus its contents.
    # Documented with comments instead of a docstring on purpose: pydantic
    # copies a model's docstring into the generated JSON schema description,
    # which would change the schema of this model.
    caption: str  # caption / title text of the table
    dataframe: MarkdownDataFrame  # table body, parsed from markdown text
class MultipleTables(BaseModel):
    # Container for every table found in one image; `extract` passes this
    # class as `response_model`.
    # Kept docstring-free so the generated JSON schema stays unchanged.
    tables: list[Table]
# Small hand-built instance; `extract` embeds its JSON dump in the prompt as
# an illustration of the expected output.
example = MultipleTables(
    tables=[
        Table(
            caption="This is a caption",
            dataframe=pd.DataFrame(
                {"Chart A": [10, 40], "Chart B": [20, 50], "Chart C": [30, 60]}
            ),
        )
    ]
)
def extract(url: str) -> MultipleTables:
    """Ask the vision model to transcribe the tables in the image at *url*.

    A single user message is sent with three parts, in this order: a text
    request that embeds the JSON dump of the module-level `example`, the image
    URL, and a block of step-by-step formatting instructions.

    Args:
        url: Location of the image, passed through as the `image_url` part.

    Returns:
        The result of `client.chat.completions.create` called with
        `response_model=MultipleTables`.
    """
    # The instruction text stays flush-left so the string sent to the model
    # is exactly the text of the literal, with no added indentation.
    instructions = """
First take a moment to reason about the best set of headers for the tables.
Write a good h1 for the image above. Then follow up with a short description of the what the data is about.
Then for each table you identified, write a h2 tag that is a descriptive title of the table.
Then follow up with a short description of the what the data is about.
Lastly, produce the markdown table for each table you identified.
Make sure to escape the markdown table properly, and make sure to include the caption and the dataframe.
including escaping all the newlines and quotes. Only return a markdown table in dataframe, nothing else.
"""
    user_content = [
        {
            "type": "text",
            "text": f"Describe this data accurately as a table in markdown format. {example.model_dump_json(indent=2)}",
        },
        {"type": "image_url", "image_url": {"url": url}},
        {"type": "text", "text": instructions},
    ]
    return client.chat.completions.create(
        model="gpt-4-vision-preview",
        max_tokens=4000,
        response_model=MultipleTables,
        messages=[{"role": "user", "content": user_content}],
    )
if __name__ == "__main__":
    # Demo: run the extraction on each sample image and print every table.
    urls = [
        "https://a.storyblok.com/f/47007/2400x2000/bf383abc3c/231031_uk-ireland-in-three-charts_table_v01_b.png/m/2880x0",
    ]
    for url in urls:
        result = extract(url)
        for table in result.tables:
            print(table.caption)
            #> Top 10 Grossing Android Apps
            # The bare string literals below are sample output kept for the
            # reader; they are expression statements with no effect.
            """
App Name Category
Rank
1 Google One Productivity
2 Disney+ Entertainment
3 TikTok - Videos, Music & LIVE Entertainment
4 Candy Crush Saga Games
5 Tinder: Dating, Chat & Friends Social networking
6 Coin Master Games
7 Roblox Games
8 Bumble - Dating & Make Friends Dating
9 Royal Match Games
10 Spotify: Music and Podcasts Music & Audio
"""
            print(table.dataframe)
            """
App Name Category
Rank
1 Tinder: Dating, Chat & Friends Social networking
2 Disney+ Entertainment
3 YouTube: Watch, Listen, Stream Entertainment
4 Audible: Audio Entertainment Entertainment
5 Candy Crush Saga Games
6 TikTok - Videos, Music & LIVE Entertainment
7 Bumble - Dating & Make Friends Dating
8 Roblox Games
9 LinkedIn: Job Search & News Business
10 Duolingo - Language Lessons Education
"""
```
+2
View File
@@ -168,12 +168,14 @@ nav:
- Introducing Instructor Hub: 'hub/index.md'
- Single Classification Model: 'hub/single_classification.md'
- Multiple Classification Model: 'hub/multiple_classification.md'
- Extracting Tables using GPT-V: 'hub/tables_from_vision.md'
- Extract Pandas Dataframes: 'hub/pandas_df.md'
- Using Ollama: 'hub/ollama.md'
- Using Llama CPP: 'hub/llama-cpp-python.md'
- Using Together Compute: 'hub/together.md'
- Using Anyscale: 'hub/anyscale.md'
- Batch Async Classification w/ Langsmith: 'hub/batch_classification_langsmith.md'
- Tutorials:
- Tutorials (Notebooks): 'tutorials/1-introduction.ipynb'
- Tips and Tricks: 'tutorials/2-tips.ipynb'
Generated
+15
View File
@@ -3107,6 +3107,20 @@ anyio = ">=3.4.0,<5"
[package.extras]
full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"]
[[package]]
name = "tabulate"
version = "0.9.0"
description = "Pretty-print tabular data"
optional = false
python-versions = ">=3.7"
files = [
{file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
{file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
]
[package.extras]
widechars = ["wcwidth"]
[[package]]
name = "tenacity"
version = "8.2.3"
@@ -3445,3 +3459,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "d4eb75440ad5a7100adf34160d56559c168bd63dd424bad8076c5d4aedcc07ed"
+2
View File
@@ -43,6 +43,8 @@ fastapi = "^0.109.2"
redis = "^5.0.1"
diskcache = "^5.6.3"
pandas = "^2.2.0"
tabulate = "^0.9.0"
[build-system]
requires = ["poetry-core"]