instructor/examples/validated-multiclass/run.py

from typing import List
from pydantic import BaseModel, ValidationInfo, model_validator
import openai
import instructor
import asyncio

client = instructor.patch(
    openai.AsyncOpenAI(),
)


class Tag(BaseModel):
    id: int
    name: str

    @model_validator(mode="after")
    def validate_ids(self, info: ValidationInfo):
        context = info.context
        if context:
            tags: List[Tag] = context.get("tags")
            assert self.id in {
                tag.id for tag in tags
            }, f"Tag ID {self.id} not found in context"
            assert self.name in {
                tag.name for tag in tags
            }, f"Tag name {self.name} not found in context"
        return self


class TagWithInstructions(Tag):
    instructions: str


class TagRequest(BaseModel):
    texts: List[str]
    tags: List[TagWithInstructions]


class TagResponse(BaseModel):
    texts: List[str]
    predictions: List[Tag]


async def tag_single_request(text: str, tags: List[Tag]) -> Tag:
    allowed_tags = [(tag.id, tag.name) for tag in tags]
    allowed_tags_str = ", ".join([f"`{tag}`" for tag in allowed_tags])
    return await client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {
                "role": "system",
                "content": "You are a world-class text tagging system.",
            },
            {"role": "user", "content": f"Describe the following text: `{text}`"},
            {
                "role": "user",
                "content": f"Here are the allowed tags: {allowed_tags_str}",
            },
        ],
        response_model=Tag,
        # Minizises the hallucination of tags that are not in the allowed tags.
        validation_context={"tags": tags},
    )


async def tag_request(request: TagRequest) -> TagResponse:
    predictions = await asyncio.gather(
        *[tag_single_request(text, request.tags) for text in request.texts]
    )
    return TagResponse(
        texts=request.texts,
        predictions=predictions,
    )


if __name__ == "__main__":
    # Tags will be a range of different topics.
    # Such as personal, phone, email, etc.
    tags = [
        TagWithInstructions(id=0, name="personal", instructions="Personal information"),
        TagWithInstructions(id=1, name="phone", instructions="Phone number"),
        TagWithInstructions(id=2, name="email", instructions="Email address"),
        TagWithInstructions(id=3, name="address", instructions="Address"),
        TagWithInstructions(id=4, name="Other", instructions="Other information"),
    ]

    # Texts will be a range of different questions.
    # Such as "How much does it cost?", "What is your privacy policy?", etc.
    texts = [
        "What is your phone number?",
        "What is your email address?",
        "What is your address?",
        "What is your privacy policy?",
    ]

    # The request will contain the texts and the tags.
    request = TagRequest(texts=texts, tags=tags)

    # The response will contain the texts, the predicted tags, and the confidence.
    response = asyncio.run(tag_request(request))
    print(response.model_dump_json(indent=2))