remove bad examples

This commit is contained in:
Jason Liu
2024-01-05 15:13:49 -05:00
parent c16d622e88
commit 8893bdd83b
7 changed files with 169 additions and 326 deletions
@@ -1,77 +0,0 @@
from instructor import OpenAISchema
from pydantic import Field
from typing import List, Any
from openai import OpenAI
client = OpenAI()
# Schema for a single table row; used as the element type of Dataframe.data.
class RowData(OpenAISchema):
    # Cell values for this row, held in one list; elements may be of any type.
    row: List[Any] = Field(..., description="The values for each row")
class Dataframe(OpenAISchema):
    """
    Class representing a dataframe. This class is used to convert
    data into a frame that can be used by pandas.
    """

    data: List[RowData] = Field(
        ...,
        description="Correct rows of data aligned to column names, Nones are allowed",
    )
    columns: List[str] = Field(
        ...,
        description="Column names relevant from source data, should be in snake_case",
    )

    def to_pandas(self):
        # pandas is imported here, so it is only required when this method runs.
        import pandas as pd

        # Unwrap each RowData into its plain list of cell values.
        row_values = [entry.row for entry in self.data]
        return pd.DataFrame(columns=self.columns, data=row_values)
def dataframe(data: str) -> Dataframe:
    """Ask the chat model to structure free-form text as one ``Dataframe``.

    Args:
        data: Raw text to map into rows and columns; sent as the user message.

    Returns:
        The ``Dataframe`` built by ``Dataframe.from_response`` from the
        completion.
    """
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0613",
        temperature=0.1,
        # Offer only the Dataframe schema and name it in function_call.
        functions=[Dataframe.openai_schema],
        function_call={"name": Dataframe.openai_schema["name"]},
        messages=[
            {
                "role": "system",
                # The previous triple-quoted literal wrapped inside the word
                # "and", sending "a\nnd" to the model.
                "content": (
                    "Map this data into a dataframe "
                    "and correctly define the correct columns and rows"
                ),
            },
            {
                "role": "user",
                "content": f"{data}",
            },
        ],
        max_tokens=1000,
    )
    return Dataframe.from_response(completion)
# Demo entry point: extract a table from a sample paragraph and print it.
if __name__ == "__main__":
    df = dataframe(
        """My name is John and I am 25 years old. I live in
New York and I like to play basketball. His name is
Mike and he is 30 years old. He lives in San Francisco
and he likes to play baseball. Sarah is 20 years old
and she lives in Los Angeles. She likes to play tennis.
Her name is Mary and she is 35 years old.
She lives in Chicago."""
    )
    print(df.to_pandas())
    # Sample output, kept as a bare string literal that is never used.
    """
name age location hobby
0 John 25 New York basketball
1 Mike 30 San Francisco baseball
2 Sarah 20 Los Angeles tennis
3 Mary 35 Chicago None
"""
@@ -1,97 +0,0 @@
from instructor import OpenAISchema
from pydantic import Field
from typing import List, Any
from openai import OpenAI
client = OpenAI()
# Schema for a single table row; used as the element type of Dataframe.data.
class RowData(OpenAISchema):
    # Cell values for this row, held in one list; elements may be of any type.
    row: List[Any] = Field(..., description="Correct values for each row")
# A named table: a name, column names, and the rows that go under them.
class Dataframe(OpenAISchema):
    name: str = Field(..., description="The name of the dataframe")
    data: List[RowData] = Field(
        ...,
        description="Correct rows of data aligned to column names, Nones are allowed",
    )
    columns: List[str] = Field(
        ...,
        description="Column names relevant from source data, should be in snake_case",
    )

    def to_pandas(self):
        # pandas is imported here, so it is only required when this method runs.
        import pandas as pd

        # Unwrap each RowData into its plain list of cell values.
        row_values = [entry.row for entry in self.data]
        return pd.DataFrame(columns=self.columns, data=row_values)
class Database(OpenAISchema):
    """
    A set of correct named and defined tables as dataframes
    Each one should have the right number of columns and correct
    values for each.
    """

    # All tables in this database, each as its own named Dataframe.
    tables: List[Dataframe] = Field(
        ...,
        description="List of tables in the database",
    )
def dataframe(data: str) -> Database:
    """Ask the chat model to structure free-form text as a ``Database``.

    Args:
        data: Raw text to map into tables; sent as the user message.

    Returns:
        The ``Database`` built by ``Database.from_response`` from the
        completion; its ``tables`` field holds the extracted tables.
    """
    completion = client.chat.completions.create(
        model="gpt-4-0613",
        temperature=0.0,
        # Offer only the Database schema and name it in function_call.
        functions=[Database.openai_schema],
        function_call={"name": Database.openai_schema["name"]},
        messages=[
            {
                "role": "system",
                # The previous triple-quoted literal wrapped inside the word
                # "and", sending "a\nnd" to the model.
                "content": (
                    "Map this data into a dataframe "
                    "and correctly define the correct columns and rows"
                ),
            },
            {
                "role": "user",
                "content": f"{data}",
            },
        ],
        max_tokens=1000,
    )
    return Database.from_response(completion)
# Demo entry point: extract several tables from a sample paragraph and print
# each table's name followed by its pandas rendering.
if __name__ == "__main__":
    dfs = dataframe(
        """My name is John and I am 25 years old. I live in
New York and I like to play basketball. His name is
Mike and he is 30 years old. He lives in San Francisco
and he likes to play baseball. Sarah is 20 years old
and she lives in Los Angeles. She likes to play tennis.
Her name is Mary and she is 35 years old.
She lives in Chicago.
On one team 'Tigers' the captan is John and there are 12 players.
On the other team 'Lions' the captan is Mike and there are 10 players.
"""
    )
    for df in dfs.tables:
        print(df.name)
        print(df.to_pandas())
    # Sample output, kept as a bare string literal that is never used.
    """
People
ID Name Age City Favorite Sport
0 1 John 25 New York Basketball
1 2 Mike 30 San Francisco Baseball
2 3 Sarah 20 Los Angeles Tennis
3 4 Mary 35 Chicago None
Teams
ID Team Name Captain Number of Players
0 1 Tigers John 12
1 2 Lions Mike 10
"""
+94
View File
@@ -0,0 +1,94 @@
from openai import OpenAI
from io import StringIO
from typing import Annotated, Any, Iterable
from openai import OpenAI
from pydantic import (
BaseModel,
BeforeValidator,
PlainSerializer,
InstanceOf,
WithJsonSchema,
)
import pandas as pd
from tomlkit import table
import instructor
client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)
def md_to_df(data: Any) -> Any:
    """Parse a pipe-delimited markdown table into a ``pandas.DataFrame``.

    Non-string input is returned unchanged, so a value that is already a
    DataFrame passes straight through.

    Args:
        data: Markdown table text (header row, separator row, then data
            rows), or any other object.

    Returns:
        For ``str`` input, a DataFrame indexed by the table's first column,
        with surrounding whitespace removed from cells, column names, index
        labels and the index name. Empty cells stay missing (NaN).
        For any other input, ``data`` itself.
    """
    if not isinstance(data, str):
        return data

    def _strip(value: Any) -> Any:
        # Empty cells are parsed as NaN (a float), which has no ``.strip()``.
        return value.strip() if isinstance(value, str) else value

    frame = (
        pd.read_csv(
            StringIO(data),
            sep="|",
            # Column 0 is the empty text before each row's leading "|", so
            # the table's first real column is column 1.
            index_col=1,
        )
        # The text outside the outer pipes produces all-NaN columns.
        .dropna(axis=1, how="all")
        # The first parsed row is expected to be the markdown separator line.
        .iloc[1:]
    )
    # Per-column Series.map also works on pandas versions without DataFrame.map.
    cleaned = frame.apply(lambda column: column.map(_strip))
    # Header cells carry the same padding as data cells; strip them too so
    # columns can be addressed by their plain names.
    cleaned.columns = [_strip(label) for label in cleaned.columns]
    cleaned.index = pd.Index(
        [_strip(label) for label in cleaned.index],
        name=_strip(cleaned.index.name),
    )
    return cleaned
# Annotated pydantic type for a pandas DataFrame exchanged as markdown text:
# - InstanceOf[pd.DataFrame]: the validated value must be a DataFrame.
# - BeforeValidator(md_to_df): input is run through md_to_df first.
# - PlainSerializer: serialization calls to_markdown() on the value.
# - WithJsonSchema: the JSON schema describes the field as a string.
MarkdownDataFrame = Annotated[
    InstanceOf[pd.DataFrame],
    BeforeValidator(md_to_df),
    PlainSerializer(lambda x: x.to_markdown()),
    WithJsonSchema(
        {
            "type": "string",
            "description": """
The markdown representation of the table,
each one should be tidy, do not try to join tables
that should be seperate""",
        }
    ),
]
# One extracted table: a caption plus the table contents.
class Table(BaseModel):
    caption: str
    # Typed as MarkdownDataFrame, so string input goes through md_to_df.
    dataframe: MarkdownDataFrame
# NOTE(review): this rebinds `client`, replacing the MD_JSON-mode client
# patched earlier in this file with a default-mode one; confirm which is intended.
client = instructor.patch(OpenAI())
# Request the tables found in the text; the result is iterated below as a
# sequence of Table objects.
# NOTE(review): the system prompt says "merge as much as possible" while the
# MarkdownDataFrame schema description says not to join separate tables.
tables = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Iterable[Table],
    messages=[
        {
            "role": "system",
            "content": "Please extract the tables from the following text, merge as much as possible:",
        },
        {
            "role": "user",
            "content": """
My name is John and I am 25 years old. I live in
New York and I like to play basketball. His name is
Mike and he is 30 years old. He lives in San Francisco
and he likes to play baseball. Sarah is 20 years old
and she lives in Los Angeles. She likes to play tennis.
Her name is Mary and she is 35 years old.
She lives in Chicago.
""",
        },
    ],
)
# Print each table's caption, then its DataFrame, then a blank line.
# NOTE(review): the loop variable `table` shadows the `table` name imported
# from tomlkit at the top of this file.
for table in tables:
    print(table.caption)
    print(table.dataframe)
    print()
# Sample output, kept as a bare string literal that is never used.
"""
People
Age City Hobby
Name
John 25 New York Basketball
Mike 30 San Francisco Baseball
Sarah 20 Los Angeles Tennis
Mary 35 Chicago N/A
"""
@@ -1,11 +1,61 @@
from openai import OpenAI
client = OpenAI()
from io import StringIO
from typing import Annotated, Any, Iterable
from openai import OpenAI
from pydantic import (
BaseModel,
BeforeValidator,
PlainSerializer,
InstanceOf,
WithJsonSchema,
)
import pandas as pd
import instructor
response = client.chat.completions.create(
client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)
def md_to_df(data: Any) -> Any:
    """Parse a pipe-delimited markdown table into a ``pandas.DataFrame``.

    Non-string input is returned unchanged, so a value that is already a
    DataFrame passes straight through.

    Args:
        data: Markdown table text (header row, separator row, then data
            rows), or any other object.

    Returns:
        For ``str`` input, a DataFrame indexed by the table's first column,
        with surrounding whitespace removed from cells, column names, index
        labels and the index name. Empty cells stay missing (NaN).
        For any other input, ``data`` itself.
    """
    if not isinstance(data, str):
        return data

    def _strip(value: Any) -> Any:
        # Empty cells are parsed as NaN (a float), which has no ``.strip()``.
        return value.strip() if isinstance(value, str) else value

    frame = (
        pd.read_csv(
            StringIO(data),
            sep="|",
            # Column 0 is the empty text before each row's leading "|", so
            # the table's first real column is column 1.
            index_col=1,
        )
        # The text outside the outer pipes produces all-NaN columns.
        .dropna(axis=1, how="all")
        # The first parsed row is expected to be the markdown separator line.
        .iloc[1:]
    )
    # Per-column Series.map also works on pandas versions without DataFrame.map.
    cleaned = frame.apply(lambda column: column.map(_strip))
    # Header cells carry the same padding as data cells; strip them too so
    # columns can be addressed by their plain names.
    cleaned.columns = [_strip(label) for label in cleaned.columns]
    cleaned.index = pd.Index(
        [_strip(label) for label in cleaned.index],
        name=_strip(cleaned.index.name),
    )
    return cleaned
# Annotated pydantic type for a pandas DataFrame exchanged as markdown text:
# - InstanceOf[pd.DataFrame]: the validated value must be a DataFrame.
# - BeforeValidator(md_to_df): input is run through md_to_df first.
# - PlainSerializer: serialization calls to_markdown() on the value.
# - WithJsonSchema: the JSON schema describes the field as a string.
MarkdownDataFrame = Annotated[
    InstanceOf[pd.DataFrame],
    BeforeValidator(md_to_df),
    PlainSerializer(lambda x: x.to_markdown()),
    WithJsonSchema(
        {
            "type": "string",
            "description": """
The markdown representation of the table,
each one should be tidy, do not try to join tables
that should be seperate""",
        }
    ),
]
# One extracted table: a caption plus the table contents.
class Table(BaseModel):
    caption: str
    # Typed as MarkdownDataFrame, so string input goes through md_to_df.
    dataframe: MarkdownDataFrame
tables = client.chat.completions.create(
model="gpt-4-vision-preview",
max_tokens=1000,
response_model=Iterable[Table],
messages=[
{
"role": "user",
@@ -38,4 +88,17 @@ response = client.chat.completions.create(
],
)
print(response.choices[0].message.content)
# Print each table's caption, then its DataFrame, then a blank line.
for table in tables:
    print(table.caption)
    print(table.dataframe)
    print()
# Sample output, kept as a bare string literal that is never used.
"""
D1 App Retention Rates July 2023 (Ireland & U.K.)
Ireland UK
Category
Education 14% 12%
Entertainment 13% 11%
Games 26% 25%
Social 27% 18%
Utilities 11% 9%
"""