remove bad examples

2026-06-05 22:50:18 +00:00 · 2024-01-05 15:13:49 -05:00
parent c16d622e88
commit 8893bdd83b
7 changed files with 169 additions and 326 deletions
@@ -1,77 +0,0 @@
-from instructor import OpenAISchema
-from pydantic import Field
-from typing import List, Any
-from openai import OpenAI
-
-client = OpenAI()
-
-
-class RowData(OpenAISchema):
-    row: List[Any] = Field(..., description="The values for each row")
-
-
-class Dataframe(OpenAISchema):
-    """
-    Class representing a dataframe. This class is used to convert
-    data into a frame that can be used by pandas.
-    """
-
-    data: List[RowData] = Field(
-        ...,
-        description="Correct rows of data aligned to column names, Nones are allowed",
-    )
-    columns: List[str] = Field(
-        ...,
-        description="Column names relevant from source data, should be in snake_case",
-    )
-
-    def to_pandas(self):
-        import pandas as pd
-
-        columns = self.columns
-        data = [row.row for row in self.data]
-
-        return pd.DataFrame(data=data, columns=columns)
-
-
-def dataframe(data: str) -> Dataframe:
-    completion = client.chat.completions.create(
-        model="gpt-3.5-turbo-0613",
-        temperature=0.1,
-        functions=[Dataframe.openai_schema],
-        function_call={"name": Dataframe.openai_schema["name"]},
-        messages=[
-            {
-                "role": "system",
-                "content": """Map this data into a dataframe a
-            nd correctly define the correct columns and rows""",
-            },
-            {
-                "role": "user",
-                "content": f"{data}",
-            },
-        ],
-        max_tokens=1000,
-    )
-    return Dataframe.from_response(completion)
-
-
-if __name__ == "__main__":
-    df = dataframe(
-        """My name is John and I am 25 years old. I live in 
-        New York and I like to play basketball. His name is 
-        Mike and he is 30 years old. He lives in San Francisco 
-        and he likes to play baseball. Sarah is 20 years old 
-        and she lives in Los Angeles. She likes to play tennis.
-        Her name is Mary and she is 35 years old. 
-        She lives in Chicago."""
-    )
-
-    print(df.to_pandas())
-    """
-        name  age       location       hobby
-    0   John   25       New York  basketball
-    1   Mike   30  San Francisco    baseball
-    2  Sarah   20    Los Angeles      tennis
-    3   Mary   35        Chicago        None
-    """
@@ -1,97 +0,0 @@
-from instructor import OpenAISchema
-from pydantic import Field
-from typing import List, Any
-from openai import OpenAI
-
-client = OpenAI()
-
-
-class RowData(OpenAISchema):
-    row: List[Any] = Field(..., description="Correct values for each row")
-
-
-class Dataframe(OpenAISchema):
-    name: str = Field(..., description="The name of the dataframe")
-    data: List[RowData] = Field(
-        ...,
-        description="Correct rows of data aligned to column names, Nones are allowed",
-    )
-    columns: List[str] = Field(
-        ...,
-        description="Column names relevant from source data, should be in snake_case",
-    )
-
-    def to_pandas(self):
-        import pandas as pd
-
-        columns = self.columns
-        data = [row.row for row in self.data]
-
-        return pd.DataFrame(data=data, columns=columns)
-
-
-class Database(OpenAISchema):
-    """
-    A set of correct named and defined tables as dataframes
-    Each one should have the right number of columns and correct
-    values for each.
-    """
-
-    tables: List[Dataframe] = Field(
-        ...,
-        description="List of tables in the database",
-    )
-
-
-def dataframe(data: str) -> Database:
-    completion = client.chat.completions.create(
-        model="gpt-4-0613",
-        temperature=0.0,
-        functions=[Database.openai_schema],
-        function_call={"name": Database.openai_schema["name"]},
-        messages=[
-            {
-                "role": "system",
-                "content": """Map this data into a dataframe a
-            nd correctly define the correct columns and rows""",
-            },
-            {
-                "role": "user",
-                "content": f"{data}",
-            },
-        ],
-        max_tokens=1000,
-    )
-    return Database.from_response(completion)
-
-
-if __name__ == "__main__":
-    dfs = dataframe(
-        """My name is John and I am 25 years old. I live in 
-        New York and I like to play basketball. His name is 
-        Mike and he is 30 years old. He lives in San Francisco 
-        and he likes to play baseball. Sarah is 20 years old 
-        and she lives in Los Angeles. She likes to play tennis.
-        Her name is Mary and she is 35 years old. 
-        She lives in Chicago.
-
-        On one team 'Tigers' the captan is John and there are 12 players.
-        On the other team 'Lions' the captan is Mike and there are 10 players.
-        """
-    )
-
-    for df in dfs.tables:
-        print(df.name)
-        print(df.to_pandas())
-    """
-    People
-    ID   Name  Age           City Favorite Sport
-    0   1   John   25       New York     Basketball
-    1   2   Mike   30  San Francisco       Baseball
-    2   3  Sarah   20    Los Angeles         Tennis
-    3   4   Mary   35        Chicago           None
-    Teams
-    ID Team Name Captain  Number of Players
-    0   1    Tigers    John                 12
-    1   2     Lions    Mike                 10
-    """
@@ -0,0 +1,94 @@
+from openai import OpenAI
+from io import StringIO
+from typing import Annotated, Any, Iterable
+from openai import OpenAI
+from pydantic import (
+    BaseModel,
+    BeforeValidator,
+    PlainSerializer,
+    InstanceOf,
+    WithJsonSchema,
+)
+import pandas as pd
+from tomlkit import table
+import instructor
+
+
+client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)
+
+
+def md_to_df(data: Any) -> Any:
+    if isinstance(data, str):
+        return (
+            pd.read_csv(
+                StringIO(data),  # Wrap the markdown text as a file-like object for read_csv
+                sep="|",
+                index_col=1,
+            )
+            .dropna(axis=1, how="all")
+            .iloc[1:]
+            .map(lambda x: x.strip())
+        )
+    return data
+
+
+MarkdownDataFrame = Annotated[
+    InstanceOf[pd.DataFrame],
+    BeforeValidator(md_to_df),
+    PlainSerializer(lambda x: x.to_markdown()),
+    WithJsonSchema(
+        {
+            "type": "string",
+            "description": """
+                The markdown representation of the table, 
+                each one should be tidy, do not try to join tables
+                that should be separate""",
+        }
+    ),
+]
+
+
+class Table(BaseModel):
+    caption: str
+    dataframe: MarkdownDataFrame
+
+
+client = instructor.patch(OpenAI())
+
+
+tables = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    response_model=Iterable[Table],
+    messages=[
+        {
+            "role": "system",
+            "content": "Please extract the tables from the following text, merge as much as possible:",
+        },
+        {
+            "role": "user",
+            "content": """
+        My name is John and I am 25 years old. I live in 
+        New York and I like to play basketball. His name is 
+        Mike and he is 30 years old. He lives in San Francisco 
+        and he likes to play baseball. Sarah is 20 years old 
+        and she lives in Los Angeles. She likes to play tennis.
+        Her name is Mary and she is 35 years old. 
+        She lives in Chicago.
+        """,
+        },
+    ],
+)
+
+for table in tables:
+    print(table.caption)
+    print(table.dataframe)
+    print()
+    """
+    People
+            Age           City       Hobby 
+    Name                                   
+    John      25       New York  Basketball
+    Mike      30  San Francisco    Baseball
+    Sarah     20    Los Angeles      Tennis
+    Mary      35        Chicago         N/A
+    """
@@ -1,11 +1,61 @@
 from openai import OpenAI
-
-client = OpenAI()
+from io import StringIO
+from typing import Annotated, Any, Iterable
+from openai import OpenAI
+from pydantic import (
+    BaseModel,
+    BeforeValidator,
+    PlainSerializer,
+    InstanceOf,
+    WithJsonSchema,
+)
+import pandas as pd
+import instructor


-response = client.chat.completions.create(
+client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)
+
+
+def md_to_df(data: Any) -> Any:
+    if isinstance(data, str):
+        return (
+            pd.read_csv(
+                StringIO(data),  # Wrap the markdown text as a file-like object for read_csv
+                sep="|",
+                index_col=1,
+            )
+            .dropna(axis=1, how="all")
+            .iloc[1:]
+            .map(lambda x: x.strip())
+        )
+    return data
+
+
+MarkdownDataFrame = Annotated[
+    InstanceOf[pd.DataFrame],
+    BeforeValidator(md_to_df),
+    PlainSerializer(lambda x: x.to_markdown()),
+    WithJsonSchema(
+        {
+            "type": "string",
+            "description": """
+                The markdown representation of the table, 
+                each one should be tidy, do not try to join tables
+                that should be separate""",
+        }
+    ),
+]
+
+
+class Table(BaseModel):
+    caption: str
+    dataframe: MarkdownDataFrame
+
+
+tables = client.chat.completions.create(
    model="gpt-4-vision-preview",
    max_tokens=1000,
+    response_model=Iterable[Table],
    messages=[
        {
            "role": "user",
@@ -38,4 +88,17 @@ response = client.chat.completions.create(
    ],
 )

-print(response.choices[0].message.content)
+for table in tables:
+    print(table.caption)
+    print(table.dataframe)
+    print()
+    """
+    D1 App Retention Rates July 2023 (Ireland & U.K.)
+                    Ireland   UK  
+    Category                       
+    Education             14%   12%
+    Entertainment         13%   11%
+    Games                 26%   25%
+    Social                27%   18%
+    Utilities             11%    9%
+    """