fix: black and synthetic examples

2026-06-05 22:50:18 +00:00 · 2024-03-04 15:27:52 -05:00
parent ed2a70254a
commit edc5ef684b
7 changed files with 155 additions and 33 deletions
@@ -0,0 +1,63 @@
+# How should I include examples?
+
+To make your model and prompt clearer, we recommend adding examples directly to your Pydantic model through the `json_schema_extra` key of its `model_config`. The examples then become part of the model's JSON schema, so they stay next to the fields they illustrate and are visible wherever that schema is used.
+
+
+```python
+import openai
+import instructor
+from typing import Iterable
+from pydantic import BaseModel, Field, ConfigDict
+
+client = instructor.patch(openai.OpenAI())
+
+
+class SyntheticQA(BaseModel):
+    question: str
+    answer: str
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "examples": [
+                {"question": "What is the capital of France?", "answer": "Paris"},
+                {
+                    "question": "What is the largest planet in our solar system?",
+                    "answer": "Jupiter",
+                },
+                {
+                    "question": "Who wrote 'To Kill a Mockingbird'?",
+                    "answer": "Harper Lee",
+                },
+                {
+                    "question": "What element does 'O' represent on the periodic table?",
+                    "answer": "Oxygen",
+                },
+            ]
+        }
+    )
+
+
+def get_synthetic_data() -> Iterable[SyntheticQA]:
+    return client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "Generate synthetic examples"},
+            {
+                "role": "user",
+                "content": "Generate the exact examples you see in the examples of this prompt. ",
+            },
+        ],
+        response_model=Iterable[SyntheticQA],
+    )  # type: ignore
+
+
+if __name__ == "__main__":
+    for example in get_synthetic_data():
+        print(example)
+        """
+        question='What is the capital of France?' answer='Paris'
+        question='What is the largest planet in our solar system?' answer='Jupiter'
+        question="Who wrote 'To Kill a Mockingbird'?" answer='Harper Lee'
+        question="What element does 'O' represent on the periodic table?" answer='Oxygen'
+        """
+```
@@ -23,4 +23,4 @@ resp = patched_chat(
        },
    ],
 )
-print(resp)
+print(resp)
@@ -0,0 +1,56 @@
+import openai
+import instructor
+from typing import Iterable
+from pydantic import BaseModel, ConfigDict
+
+client = instructor.patch(openai.OpenAI())
+
+
+class SyntheticQA(BaseModel):
+    question: str
+    answer: str
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "examples": [
+                {"question": "What is the capital of France?", "answer": "Paris"},
+                {
+                    "question": "What is the largest planet in our solar system?",
+                    "answer": "Jupiter",
+                },
+                {
+                    "question": "Who wrote 'To Kill a Mockingbird'?",
+                    "answer": "Harper Lee",
+                },
+                {
+                    "question": "What element does 'O' represent on the periodic table?",
+                    "answer": "Oxygen",
+                },
+            ]
+        }
+    )
+
+
+def get_synthetic_data() -> Iterable[SyntheticQA]:
+    return client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "Generate synthetic examples"},
+            {
+                "role": "user",
+                "content": "Generate the exact examples you see in the examples of this prompt. ",
+            },
+        ],
+        response_model=Iterable[SyntheticQA],
+    )  # type: ignore
+
+
+if __name__ == "__main__":
+    for example in get_synthetic_data():
+        print(example)
+        """
+        question='What is the capital of France?' answer='Paris'
+        question='What is the largest planet in our solar system?' answer='Jupiter'
+        question="Who wrote 'To Kill a Mockingbird'?" answer='Harper Lee'
+        question="What element does 'O' represent on the periodic table?" answer='Oxygen'
+        """
@@ -12,13 +12,14 @@ import instructor

 load_dotenv(find_dotenv())

-IMAGE_FILE = "image-file.txt" # file with all the images to be processed
+IMAGE_FILE = "image-file.txt"  # file with all the images to be processed

 # Add logger
 logging.basicConfig()
 logger = logging.getLogger("app")
 logger.setLevel("INFO")

+
 class Competitor(BaseModel):
    name: str
    features: Optional[List[str]]
@@ -30,13 +31,12 @@ class Industry(BaseModel):
    Represents competitors from a specific industry extracted from an image using AI.
    """

-    name: str = Field(
-        description="The name of the industry"
-    )
+    name: str = Field(description="The name of the industry")
    competitor_list: List[Competitor] = Field(
        description="A list of competitors for this industry"
    )

+
 class Competition(BaseModel):
    """
    Represents competitors extracted from an image using AI.
@@ -49,10 +49,10 @@ class Competition(BaseModel):
        description="A list of industries and their competitors"
    )

+
 # Define clients
-client_image = instructor.patch(
-    OpenAI(), mode=instructor.Mode.MD_JSON
-)
+client_image = instructor.patch(OpenAI(), mode=instructor.Mode.MD_JSON)
+

 # Define functions
 def read_images(image_urls: List[str]) -> Competition:
@@ -85,7 +85,6 @@ def read_images(image_urls: List[str]) -> Competition:
    )


-
 def process_and_identify_competitors():
    """
    Main function to process the image list file and identify competitors.
@@ -121,6 +120,7 @@ def process_and_identify_competitors():
            indent=4,
        )

+
 if __name__ == "__main__":
    process_and_identify_competitors()

@@ -53,27 +53,27 @@ MODEL_COSTS: Dict[
    ModelNames,
    Union[Dict[str, float], float],
 ] = {
-      "gpt-4-0125-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000}, 
-      "gpt-4-turbo-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000}, 
-      "gpt-4-1106-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000}, 
-      "gpt-4-vision-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000}, 
-      "gpt-4": {"prompt": 0.03 / 1000, "completion": 0.06 / 1000}, 
-      "gpt-4-0314": {"prompt": 0.03 / 1000, "completion": 0.06 / 1000},
-      "gpt-4-0613": {"prompt": 0.03 / 1000, "completion": 0.06 / 1000}, 
-      "gpt-4-32k": {"prompt": 0.06 / 1000, "completion": 0.12 / 1000}, 
-      "gpt-4-32k-0314": {"prompt": 0.06 / 1000, "completion": 0.12 / 1000}, 
-      "gpt-4-32k-0613": {"prompt": 0.06 / 1000, "completion": 0.12 / 1000}, 
-      "gpt-3.5-turbo": {"prompt": 0.0005 / 1000, "completion": 0.0015 / 1000}, 
-      "gpt-3.5-turbo-16k": {"prompt": 0.0030 / 1000, "completion": 0.0040 / 1000}, 
-      "gpt-3.5-turbo-0301": {"prompt": 0.0015 / 1000, "completion": 0.0020 / 1000}, 
-      "gpt-3.5-turbo-0613": {"prompt": 0.0015 / 1000, "completion": 0.0020 / 1000}, 
-      "gpt-3.5-turbo-1106": {"prompt": 0.0010 / 1000, "completion": 0.0020 / 1000}, 
-      "gpt-3.5-turbo-0125": {"prompt": 0.0005 / 1000, "completion": 0.0015 / 1000}, 
-      "gpt-3.5-turbo-16k-0613": {"prompt": 0.0030 / 1000, "completion": 0.0040 / 1000}, 
-      "gpt-3.5-turbo-instruct": {"prompt": 0.0015 / 1000, "completion": 0.0020 / 1000}, 
-      "text-embedding-3-small": 0.00002 / 1000,
-      "text-embedding-3-large": 0.00013 / 1000,
-      "text-embedding-ada-002": 0.00010 / 1000,
+    "gpt-4-0125-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000},
+    "gpt-4-turbo-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000},
+    "gpt-4-1106-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000},
+    "gpt-4-vision-preview": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000},
+    "gpt-4": {"prompt": 0.03 / 1000, "completion": 0.06 / 1000},
+    "gpt-4-0314": {"prompt": 0.03 / 1000, "completion": 0.06 / 1000},
+    "gpt-4-0613": {"prompt": 0.03 / 1000, "completion": 0.06 / 1000},
+    "gpt-4-32k": {"prompt": 0.06 / 1000, "completion": 0.12 / 1000},
+    "gpt-4-32k-0314": {"prompt": 0.06 / 1000, "completion": 0.12 / 1000},
+    "gpt-4-32k-0613": {"prompt": 0.06 / 1000, "completion": 0.12 / 1000},
+    "gpt-3.5-turbo": {"prompt": 0.0005 / 1000, "completion": 0.0015 / 1000},
+    "gpt-3.5-turbo-16k": {"prompt": 0.0030 / 1000, "completion": 0.0040 / 1000},
+    "gpt-3.5-turbo-0301": {"prompt": 0.0015 / 1000, "completion": 0.0020 / 1000},
+    "gpt-3.5-turbo-0613": {"prompt": 0.0015 / 1000, "completion": 0.0020 / 1000},
+    "gpt-3.5-turbo-1106": {"prompt": 0.0010 / 1000, "completion": 0.0020 / 1000},
+    "gpt-3.5-turbo-0125": {"prompt": 0.0005 / 1000, "completion": 0.0015 / 1000},
+    "gpt-3.5-turbo-16k-0613": {"prompt": 0.0030 / 1000, "completion": 0.0040 / 1000},
+    "gpt-3.5-turbo-instruct": {"prompt": 0.0015 / 1000, "completion": 0.0020 / 1000},
+    "text-embedding-3-small": 0.00002 / 1000,
+    "text-embedding-3-large": 0.00013 / 1000,
+    "text-embedding-ada-002": 0.00010 / 1000,
 }


@@ -83,7 +83,7 @@ def get_model_cost(
    """Get the cost details for a given model."""
    if model in MODEL_COSTS:
        return MODEL_COSTS[model]
-    
+
    if model.startswith("gpt-3.5-turbo-16k"):
        return MODEL_COSTS["gpt-3.5-turbo-16k"]
    elif model.startswith("gpt-3.5-turbo"):
@@ -134,11 +134,13 @@ def handle_response_model(
        elif mode in {Mode.JSON, Mode.MD_JSON, Mode.JSON_SCHEMA}:
            # If its a JSON Mode we need to massage the prompt a bit
            # in order to get the response we want in a json format
-            message = dedent(f"""
+            message = dedent(
+                f"""
                As a genius expert, your task is to understand the content and provide
                the parsed objects in json that match the following json_schema:\n
                {response_model.model_json_schema()['properties']}
-                """)
+                """
+            )
            # Check for nested models
            if "$defs" in response_model.model_json_schema():
                message += f"\nHere are some more definitions to adhere too:\n{response_model.model_json_schema()['$defs']}"
@@ -163,6 +163,7 @@ nav:
    - Image to Ad Copy: 'examples/image_to_ad_copy.md'
    - Ollama: 'examples/ollama.md'
    - SQLModel Integration: 'examples/sqlmodel.md'
+    - Including Examples in Prompt: 'examples/examples.md'
  - Hub:
    - Introducing Instructor Hub: 'hub/index.md'
    - Single Classification Model: 'hub/single_classification.md'