from typing import Any, Dict, List, Optional

from openai import OpenAI
from pydantic import (
    BaseModel,
    Field,
    ValidationError,
    ValidationInfo,
    field_validator,
    model_validator,
)

import instructor

client = instructor.patch(OpenAI())


def _get_text_chunks(info: ValidationInfo) -> Dict[Any, str]:
    """Return the ``text_chunks`` mapping from the validation context.

    Every validator in this file compares a citation against the chunks
    supplied via ``model_validate(..., context={"text_chunks": {...}})``.

    Raises:
        ValueError: if no context was passed, or it has no ``text_chunks``
            entry. Raised as ``ValueError`` so that, inside a validator,
            pydantic reports it as a regular validation error rather than
            letting an ``AttributeError`` on ``None`` escape.
    """
    # `info.context` is None when `model_validate` is called without `context=`.
    context = info.context or {}
    text_chunks = context.get("text_chunks")
    if text_chunks is None:
        raise ValueError(
            "Validation context must provide `text_chunks`, e.g. "
            "model_validate(data, context={'text_chunks': {...}})"
        )
    return text_chunks


"""
Example 1) Simple Substring check that compares a citation to a text chunk
"""


class Statements(BaseModel):
    """A statement (`body`) backed by a verbatim quote from the context."""

    body: str
    substring_quote: str

    @field_validator("substring_quote")
    @classmethod
    def substring_quote_exists(cls, v: str, info: ValidationInfo):
        """Accept `v` only if it is a substring of at least one text chunk.

        Raises:
            ValueError: if the context is missing or no chunk contains `v`.
        """
        text_chunks = _get_text_chunks(info)
        if any(v in text_chunk for text_chunk in text_chunks.values()):
            return v
        raise ValueError(
            f"Could not find substring_quote `{v}` in contexts",
        )


class AnswerWithCitaton(BaseModel):
    """A question together with a list of cited statements answering it."""

    question: str
    answer: List[Statements]


# Chunk 2 says "is not", so the quoted citation is not a substring of any
# chunk and validation fails.
try:
    AnswerWithCitaton.model_validate(
        {
            "question": "What is the capital of France?",
            "answer": [
                {"body": "Paris", "substring_quote": "Paris is the capital of France"},
            ],
        },
        context={
            "text_chunks": {
                1: "Jason is a pirate",
                2: "Paris is not the capital of France",
                3: "Irrelevant data",
            }
        },
    )
except ValidationError as e:
    print(e)
    """
    answer.0.substring_quote
      Value error, Could not find substring_quote `Paris is the capital of France` in contexts [type=value_error, input_value='Paris is the capital of France', input_type=str]
        For further information visit https://errors.pydantic.dev/2.4/v/value_error
    """


"""
Example 2) Using an LLM to verify if a citation exists in the context
"""


# NOTE(review): this class is passed as `response_model=` to the patched
# client, so its docstring is presumably part of the schema the LLM sees.
# Its wording (typos included) is therefore left untouched - confirm before
# editing it.
class Validation(BaseModel):
    """
    Verfication response from the LLM,
    the error message should be detailed if the is_valid is False
    but keep it to less than 100 characters, reference specific
    attributes that you are comparing, use `...` is the string is too long
    """

    is_valid: bool
    error_messages: Optional[str] = Field(None, description="Error messages if any")


def _llm_check(prompt: str) -> None:
    """Send `prompt` to the LLM and raise if it judges the input invalid.

    The reply is parsed into a `Validation` object through the patched
    client's `response_model` argument.

    Raises:
        ValueError: carrying the LLM's `error_messages` when `is_valid` is
            False; a generic message is used if the LLM supplied none, so
            the reported error is never the literal string "None".
    """
    verdict: Validation = client.chat.completions.create(
        response_model=Validation,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    if not verdict.is_valid:
        raise ValueError(
            verdict.error_messages
            or "LLM judged the input invalid but returned no error message"
        )


class Statements(BaseModel):
    """A statement whose citation is checked against the context by an LLM."""

    body: str
    substring_quote: str

    @model_validator(mode="after")
    def substring_quote_exists(self, info: ValidationInfo):
        """Ask the LLM whether `substring_quote` appears in the text chunks.

        Raises:
            ValueError: if the context is missing or the LLM says the
                citation is not supported by it.
        """
        context = _get_text_chunks(info)
        _llm_check(
            "Does the following citation exist in the following context?"
            f"\n\nCitation: {self.substring_quote}\n\nContext: {context}"
        )
        return self


class AnswerWithCitaton(BaseModel):
    """A question together with a list of LLM-verified cited statements."""

    question: str
    answer: List[Statements]


resp = AnswerWithCitaton.model_validate(
    {
        "question": "What is the capital of France?",
        "answer": [
            {"body": "Paris", "substring_quote": "Paris is the capital of France"},
        ],
    },
    context={
        "text_chunks": {
            1: "Jason is a pirate",
            2: "Paris is the capital of France",
            3: "Irrelevant data",
        }
    },
)

# output: notice that there are no errors
print(resp.model_dump_json(indent=2))
"""
{
    "question": "What is the capital of France?",
    "answer": [{"body": "Paris", "substring_quote": "Paris is the capital of France"}],
}
"""

# Now we change the text chunk to something else, and we get an error
try:
    AnswerWithCitaton.model_validate(
        {
            "question": "What is the capital of France?",
            "answer": [
                {"body": "Paris", "substring_quote": "Paris is the capital of France"},
            ],
        },
        context={
            "text_chunks": {
                1: "Jason is a pirate",
                2: "Paris is not the capital of France",
                3: "Irrelevant data",
            }
        },
    )
except ValidationError as e:
    print(e)
    """
    1 validation error for AnswerWithCitaton
    answer.0
      Value error, Citation not found in context [type=value_error, input_value={'body': 'Paris', 'substr... the capital of France'}, input_type=dict]
        For further information visit https://errors.pydantic.dev/2.4/v/value_error
    """


# Example 3) Using an LLM to verify if the citations and the answers are all aligned


# we keep the same model as above for Statements, but we add a new model for the answer
# that also verifies that the citations are aligned with the answers
class AnswerWithCitaton(BaseModel):
    """A question and cited statements, cross-checked as a whole by an LLM."""

    question: str
    answer: List[Statements]

    @model_validator(mode="after")
    def validate_answer(self, info: ValidationInfo):
        """Ask the LLM whether the answers match the question and context.

        Raises:
            ValueError: if the context is missing or the LLM says the
                answers do not match.
        """
        context = _get_text_chunks(info)
        _llm_check(
            "Does the following answers match the question and the context?"
            f"\n\nQuestion: {self.question}\n\nAnswer: {self.answer}"
            f"\n\nContext: {context}"
        )
        return self


"""
Using LLMs for citation verification is inefficient during runtime.
However, we can utilize them to create a dataset consisting only of accurate responses
where citations must be valid (as determined by LLM, fuzzy text search, etc.).

This approach would require an initial investment during data generation to obtain
a finely-tuned model for improved citation.
"""

# The citation itself exists in the context, but the body ("Texas") does not
# answer the question, so the model-level check is expected to fail.
try:
    AnswerWithCitaton.model_validate(
        {
            "question": "What is the capital of France?",
            "answer": [
                {"body": "Texas", "substring_quote": "Paris is the capital of France"},
            ],
        },
        context={
            "text_chunks": {
                1: "Jason is a pirate",
                2: "Paris is the capital of France",
                3: "Irrelevant data",
            }
        },
    )
except ValidationError as e:
    print(e)
    """
    1 validation error for AnswerWithCitaton
      Value error, The answer does not match the question and context [type=value_error, input_value={'question': 'What is the...he capital of France'}]}, input_type=dict]
        For further information visit https://errors.pydantic.dev/2.4/v/value_error
    """