diff --git a/tests/openai/evals/test_classification_enums.py b/tests/openai/evals/test_classification_enums.py new file mode 100644 index 0000000..30e9990 --- /dev/null +++ b/tests/openai/evals/test_classification_enums.py @@ -0,0 +1,107 @@ +import enum +from itertools import product +from typing import List + +import pytest +import instructor +from openai import OpenAI + +from pydantic import BaseModel + +class Labels(str, enum.Enum): + SPAM = "spam" + NOT_SPAM = "not_spam" + + +class SinglePrediction(BaseModel): + """ + Correct class label for the given text + """ + + class_label: Labels + +models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"] +modes = [instructor.Mode.FUNCTIONS, instructor.Mode.JSON, instructor.Mode.TOOLS] +data = [ + ( + "I am a spammer", + Labels.SPAM, + ), + ( + "I am not a spammer", + Labels.NOT_SPAM, + ), +] + +@pytest.mark.parametrize("model, data, mode", product(models, data, modes)) +def test_classification(model, data, mode): + client = instructor.patch(OpenAI(), mode=mode) + + if mode == instructor.Mode.JSON and model in {"gpt-3.5-turbo", "gpt-4"}: + pytest.skip( + "JSON mode is not supported for gpt-3.5-turbo and gpt-4, skipping test" + ) + + input, expected = data + resp = client.chat.completions.create( + model=model, + response_model=SinglePrediction, + messages=[ + { + "role": "user", + "content": f"Classify the following text: {input}", + }, + ], + ) + assert resp.class_label == expected + + +# Define new Enum class for multiple labels +class MultiLabels(str, enum.Enum): + BILLING = "billing" + GENERAL_QUERY = "general_query" + HARDWARE = "hardware" + + +# Adjust the prediction model to accommodate a list of labels +class MultiClassPrediction(BaseModel): + predicted_labels: List[MultiLabels] + +data = [ + ( + "I am having trouble with my billing", + [MultiLabels.BILLING], + ), + ( + "I am having trouble with my hardware", + [MultiLabels.HARDWARE], + ), + ( + "I have a general query and a billing issue", + [MultiLabels.GENERAL_QUERY, MultiLabels.BILLING], + ), +] + +@pytest.mark.parametrize("model, data, mode", product(models, data, modes)) +def test_multi_classify(model, data, mode): + + client = instructor.patch(OpenAI(), mode=mode) + + if mode == instructor.Mode.JSON and model in {"gpt-3.5-turbo", "gpt-4"}: + pytest.skip( + "JSON mode is not supported for gpt-3.5-turbo and gpt-4, skipping test" + ) + + input, expected = data + + resp = client.chat.completions.create( + model=model, + response_model=MultiClassPrediction, + messages=[ + { + "role": "user", + "content": f"Classify the following support ticket: {input}", + }, + ], + ) + assert set(resp.predicted_labels) == set(expected) \ No newline at end of file diff --git a/tests/openai/evals/test_classification_literals.py b/tests/openai/evals/test_classification_literals.py new file mode 100644 index 0000000..9382e43 --- /dev/null +++ b/tests/openai/evals/test_classification_literals.py @@ -0,0 +1,97 @@ +import enum +from itertools import product +from typing import List, Literal + +import pytest +import instructor +from openai import AsyncOpenAI, OpenAI + +from pydantic import BaseModel + +class SinglePrediction(BaseModel): + """ + Correct class label for the given text + """ + + class_label: Literal["spam", "not_spam"] + +models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"] +modes = [instructor.Mode.FUNCTIONS, instructor.Mode.JSON, instructor.Mode.TOOLS] +data = [ + ( + "I am a spammer", + "spam" + ), + ( + "I am not a spammer", + "not_spam" + ), +] + +@pytest.mark.parametrize("model, data, mode", product(models, data, modes)) +@pytest.mark.asyncio +async def test_classification(model, data, mode): + client = instructor.patch(AsyncOpenAI(), mode=mode) + + if mode == instructor.Mode.JSON and model in {"gpt-3.5-turbo", "gpt-4"}: + pytest.skip( + "JSON mode is not supported for gpt-3.5-turbo and gpt-4, skipping test" + ) + + input, expected = data + resp = await client.chat.completions.create( + model=model, + response_model=SinglePrediction, + messages=[ + { + "role": "user", + "content": f"Classify the following text: {input}", + }, + ], + ) + assert resp.class_label == expected + + +# Adjust the prediction model to accommodate a list of labels +class MultiClassPrediction(BaseModel): + predicted_labels: List[Literal["billing", "general_query", "hardware"]] + +data = [ + ( + "I am having trouble with my billing", + ["billing"], + ), + ( + "I am having trouble with my hardware", + ["hardware"], + ), + ( + "I have a general query and a billing issue", + ["general_query", "billing"], + ), +] + +@pytest.mark.parametrize("model, data, mode", product(models, data, modes)) +@pytest.mark.asyncio +async def test_multi_classify(model, data, mode): + + client = instructor.patch(AsyncOpenAI(), mode=mode) + + if mode == instructor.Mode.JSON and model in {"gpt-3.5-turbo", "gpt-4"}: + pytest.skip( + "JSON mode is not supported for gpt-3.5-turbo and gpt-4, skipping test" + ) + + input, expected = data + + resp = await client.chat.completions.create( + model=model, + response_model=MultiClassPrediction, + messages=[ + { + "role": "user", + "content": f"Classify the following support ticket: {input}", + }, + ], + ) + assert set(resp.predicted_labels) == set(expected) \ No newline at end of file diff --git a/tests/openai/evals/test_entities.py b/tests/openai/evals/test_entities.py new file mode 100644 index 0000000..7fcd21a --- /dev/null +++ b/tests/openai/evals/test_entities.py @@ -0,0 +1,101 @@ +from itertools import product +from typing import List +from pydantic import BaseModel, Field +import pytest + +import instructor +from openai import OpenAI + + +class Property(BaseModel): + key: str + value: str + resolved_absolute_value: str + + +class Entity(BaseModel): + id: int = Field( + ..., + description="Unique identifier for the entity, used for deduplication, design a scheme allows multiple entities", + ) + subquote_string: List[str] = Field( + ..., + description="Correctly resolved value of the entity, if the entity is a reference to another entity, this should be the id of the referenced entity, include a few more words before and after the value to allow for some context to be used in the resolution", + ) + entity_title: str + properties: List[Property] = Field( + ..., description="List of properties of the entity" + ) + dependencies: List[int] = Field( + ..., + description="List of entity ids that this entity depends or relies on to resolve it", + ) + + +class DocumentExtraction(BaseModel): + entities: List[Entity] = Field( + ..., + description="Body of the answer, each fact should be its seperate object with a body and a list of sources", + ) + + +def ask_ai(content, model, mode, client) -> DocumentExtraction: + resp: DocumentExtraction = client.chat.completions.create( + model="gpt-4", + response_model=DocumentExtraction, + messages=[ + { + "role": "system", + "content": "You are a perfect entity resolution system that extracts facts from the document. Extract and resolve a list of entities from the following document:", + }, + { + "role": "user", + "content": content, + }, + ], + ) # type: ignore + return resp + + +content = """ +Sample Legal Contract +Agreement Contract + +This Agreement is made and entered into on 2020-01-01 by and between Company A ("the Client") and Company B ("the Service Provider"). + +Article 1: Scope of Work + +The Service Provider will deliver the software product to the Client 30 days after the agreement date. + +Article 2: Payment Terms + +The total payment for the service is $50,000. +An initial payment of $10,000 will be made within 7 days of the the signed date. +The final payment will be due 45 days after [SignDate]. + +Article 3: Confidentiality + +The parties agree not to disclose any confidential information received from the other party for 3 months after the final payment date. + +Article 4: Termination + +The contract can be terminated with a 30-day notice, unless there are outstanding obligations that must be fulfilled after the [DeliveryDate]. +""" + + + +models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"] +modes = [instructor.Mode.FUNCTIONS, instructor.Mode.JSON, instructor.Mode.TOOLS] + + +@pytest.mark.parametrize("model, mode", product(models, modes)) +def test_extract(model, mode): + client = instructor.patch(OpenAI(), mode=mode) + if mode == instructor.Mode.JSON and model in {"gpt-3.5-turbo", "gpt-4"}: + pytest.skip( + "JSON mode is not supported for gpt-3.5-turbo and gpt-4, skipping test" + ) + + # Honestly, if there are no errors, then it's a pass + extract = ask_ai(content, model, mode, client) + assert len(extract.entities) > 0