From 4d19a110c87359e3805da4ed2c8deb1910aec9f2 Mon Sep 17 00:00:00 2001
From: Jason Liu <jxnl@users.noreply.github.com>
Date: Mon, 19 Feb 2024 21:33:49 -0500
Subject: [PATCH] docs(hub): add pandas_df.md and update mkdocs.yml (#451)

---
 docs/hub/pandas_df.md | 140 ++++++++++++++++++++++++++++++++++++++++++
 mkdocs.yml            |   1 +
 2 files changed, 141 insertions(+)
 create mode 100644 docs/hub/pandas_df.md

diff --git a/docs/hub/pandas_df.md b/docs/hub/pandas_df.md
new file mode 100644
index 0000000..7cafbf5
--- /dev/null
+++ b/docs/hub/pandas_df.md
@@ -0,0 +1,140 @@
+# Extracting directly to a DataFrame
+
+In this example we'll show you how to extract data directly into a `pandas.DataFrame`.
+
+You can pull this example into your IDE by running the following command:
+
+```bash
+instructor hub pull --slug pandas_df --py > pandas_df.py
+```
+
+```python
+from io import StringIO
+from typing import Annotated, Any
+from pydantic import (
+    BaseModel,
+    BeforeValidator,
+    PlainSerializer,
+    InstanceOf,
+    WithJsonSchema,
+)
+import pandas as pd
+import instructor
+import openai
+
+
+def md_to_df(data: Any) -> Any:
+    # Convert markdown to DataFrame
+    if isinstance(data, str):
+        return (
+            pd.read_csv(
+                StringIO(data),  # Process data
+                sep="|",
+                index_col=1,
+            )
+            .dropna(axis=1, how="all")
+            .iloc[1:]
+            .applymap(lambda x: x.strip())
+        )
+    return data
+
+
+MarkdownDataFrame = Annotated[
+    # Validates final type
+    InstanceOf[pd.DataFrame],
+    # Converts markdown to DataFrame
+    BeforeValidator(md_to_df),
+    # Converts DataFrame to markdown on model_dump_json
+    PlainSerializer(lambda df: df.to_markdown()),
+    # Adds a description to the type
+    WithJsonSchema(
+        {
+            "type": "string",
+            "description": """
+            The markdown representation of the table,
+            each one should be tidy, do not try to join
+            tables that should be separate""",
+        }
+    ),
+]
+
+client = instructor.patch(openai.OpenAI())
+
+
+def extract_df(data: str) -> pd.DataFrame:
+    return client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        response_model=MarkdownDataFrame,
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a data extraction system, capable of writing perfectly formatted markdown tables.",
+            },
+            {
+                "role": "user",
+                "content": f"Extract the data into a table: {data}",
+            },
+        ],
+    )
+
+
+class Table(BaseModel):
+    title: str
+    data: MarkdownDataFrame
+
+
+def extract_table(data: str) -> Table:
+    return client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        response_model=Table,
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a data extraction system, capable of writing perfectly formatted markdown tables.",
+            },
+            {
+                "role": "user",
+                "content": f"Extract the data into a table: {data}",
+            },
+        ],
+    )
+
+
+if __name__ == "__main__":
+    df = extract_df(
+        """Create a table of the last 5 presidents of the United States,
+        including their party and the years they served."""
+    )
+    assert isinstance(df, pd.DataFrame)
+    print(df)
+    """
+                        Party           Years Served
+     President
+    Joe Biden                Democratic          2021-
+    Donald Trump             Republican      2017-2021
+    Barack Obama             Democratic      2009-2017
+    George W. Bush           Republican      2001-2009
+    Bill Clinton             Democratic      1993-2001
+    """
+
+    table = extract_table(
+        """Create a table of the last 5 presidents of the United States,
+        including their party and the years they served."""
+    )
+    assert isinstance(table, Table)
+    assert isinstance(table.data, pd.DataFrame)
+    print(table.title)
+    #> Last 5 Presidents of the United States
+    print(table.data)
+    """
+                         Party    Years Served
+     President
+    Joe Biden          Democrat  2021 - Present
+    Donald Trump     Republican     2017 - 2021
+    Barack Obama       Democrat     2009 - 2017
+    George W. Bush   Republican     2001 - 2009
+    Bill Clinton       Democrat     1993 - 2001
+    """
+```
+
+Notice that you can extract either the raw `MarkdownDataFrame` or a more complex structure like `Table`, which includes a title and the data as a DataFrame. You can even request `Iterable[Table]` to get multiple tables in a single response!
diff --git a/mkdocs.yml b/mkdocs.yml
index ed1a333..ea6c6c8 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -168,6 +168,7 @@ nav:
     - Introducing Instructor Hub: 'hub/index.md'
     - Single Classification Model: 'hub/single_classification.md'
     - Multiple Classification Model: 'hub/multiple_classification.md'
+    - Extract Pandas DataFrames: 'hub/pandas_df.md'
     - Ollama: 'hub/ollama.md'
     - Llama CPP: 'hub/llama-cpp-python.md'
     - Together: 'hub/together.md'