mirror of
https://github.com/kennethreitz/instructor.git
synced 2026-06-05 22:50:18 +00:00
226 lines
6.5 KiB
Python
226 lines
6.5 KiB
Python
from typing import List, Optional
|
|
from openai import OpenAI
|
|
from pydantic import (
|
|
BaseModel,
|
|
Field,
|
|
ValidationError,
|
|
ValidationInfo,
|
|
field_validator,
|
|
model_validator,
|
|
)
|
|
|
|
import instructor
|
|
|
|
# Patch the OpenAI client with instructor; the LLM-backed validators in this
# file call `client.chat.completions.create(...)` with a `response_model`
# argument, which relies on this patched client.
# NOTE(review): OpenAI() is constructed at import time — presumably it needs
# API credentials to be configured in the environment; confirm before importing
# this module from other code.
client = instructor.patch(OpenAI())
|
|
|
|
"""
|
|
Example 1) Simple Substring check that compares a citation to a text chunk
|
|
"""
|
|
|
|
|
|
# One statement of an answer together with the quote that is supposed to back it.
#
# Validation expects a context shaped like
# ``{"text_chunks": {chunk_id: chunk_text, ...}}`` and accepts the statement only
# when ``substring_quote`` occurs verbatim inside at least one chunk.
#
# (Documented with `#` comments on purpose: a class docstring on a pydantic
# model becomes the schema description, which would change the model's schema.)
class Statements(BaseModel):
    body: str  # the claim being made
    substring_quote: str  # text that must appear verbatim in one of the chunks

    @field_validator("substring_quote")
    @classmethod
    def substring_quote_exists(cls, v: str, info: ValidationInfo) -> str:
        """Return ``v`` if it is a substring of at least one supplied text chunk.

        Args:
            v: The candidate quote.
            info: Validation info; ``info.context["text_chunks"]`` must map
                chunk ids to chunk texts.

        Raises:
            ValueError: If no ``text_chunks`` were supplied through the
                validation context, or if no chunk contains ``v``.
        """
        # `info.context` is None when model_validate() is called without
        # `context=...`; treat that the same as a context without chunks
        # instead of crashing with AttributeError.
        text_chunks = (info.context or {}).get("text_chunks", None)
        if text_chunks is None:
            raise ValueError(
                "No `text_chunks` were provided in the validation context",
            )

        # Accept the quote as soon as any chunk contains it verbatim.
        if any(v in text_chunk for text_chunk in text_chunks.values()):
            return v

        raise ValueError(
            f"Could not find substring_quote `{v}` in contexts",
        )
|
|
|
|
|
|
# Top-level answer payload: the question plus the statements that answer it.
# Validating this model validates every entry of `answer`, so each statement's
# `substring_quote` check runs as part of `model_validate()`.
# (Kept as `#` comments: a class docstring would become the schema description.)
class AnswerWithCitaton(BaseModel):
    question: str  # the question being answered
    answer: List[Statements]  # statements, each carrying its supporting quote
|
|
|
|
|
|
# Demo for example 1: the quote "Paris is the capital of France" is not a
# verbatim substring of any chunk (chunk 2 says "... is not the capital ..."),
# so the substring validator rejects it and the ValidationError is printed
# instead of propagating.
try:
    AnswerWithCitaton.model_validate(
        {
            "question": "What is the capital of France?",
            "answer": [
                {"body": "Paris", "substring_quote": "Paris is the capital of France"},
            ],
        },
        # The validators read this mapping through `ValidationInfo.context`.
        context={
            "text_chunks": {
                1: "Jason is a pirate",
                2: "Paris is not the capital of France",
                3: "Irrelevant data",
            }
        },
    )
except ValidationError as e:
    print(e)
|
|
"""
|
|
1 validation error for AnswerWithCitaton
answer.0.substring_quote
|
|
Value error, Could not find substring_quote `Paris is the capital of France` in contexts [type=value_error, input_value='Paris is the capital of France', input_type=str]
|
|
For further information visit https://errors.pydantic.dev/2.4/v/value_error
|
|
"""
|
|
|
|
|
|
"""
|
|
Example 2) Using an LLM to verify if a citation exists in the context
|
|
"""
|
|
|
|
|
|
class Validation(BaseModel):
    """
    Verfication response from the LLM,
    the error message should be detailed if the is_valid is False
    but keep it to less than 100 characters, reference specific
    attributes that you are comparing, use `...` is the string is too long
    """

    # NOTE(review): the docstring above is kept verbatim (typos included).
    # pydantic exposes a model's docstring as its schema description, and this
    # class is passed as `response_model`, so the text presumably reaches the
    # LLM as part of the prompt — treat any rewording as a prompt change.

    # True when the LLM judged the checked content to be valid.
    is_valid: bool
    # Optional explanation; may be left unset by the LLM.
    error_messages: Optional[str] = Field(
        default=None,
        description="Error messages if any",
    )
|
|
|
|
|
|
# LLM-backed variant of the statement model: instead of a literal substring
# test, an LLM is asked whether `substring_quote` exists in the text chunks
# passed through the validation context (`context={"text_chunks": {...}}`).
#
# (Documented with `#` comments on purpose: a class docstring on a pydantic
# model becomes the schema description, which would change the model's schema.)
class Statements(BaseModel):
    body: str  # the claim being made
    substring_quote: str  # the citation the LLM is asked to look for

    @model_validator(mode="after")
    def substring_quote_exists(self, info: ValidationInfo):
        """Ask the LLM whether ``substring_quote`` exists in the context chunks.

        Performs one chat-completion request per validated instance.

        Returns:
            This instance, when the LLM reports the citation as valid.

        Raises:
            ValueError: When the LLM reports the citation as invalid; the
                message is the LLM's ``error_messages`` or a generic fallback
                if the LLM did not provide one.
        """
        # `info.context` is None when model_validate() is called without
        # `context=...`; fall back to an empty mapping so the lookup yields
        # None instead of raising AttributeError.
        context = (info.context or {}).get("text_chunks", None)

        resp: Validation = client.chat.completions.create(
            response_model=Validation,
            messages=[
                {
                    "role": "user",
                    "content": f"Does the following citation exist in the following context?\n\nCitation: {self.substring_quote}\n\nContext: {context}",
                }
            ],
            model="gpt-3.5-turbo",
        )

        if resp.is_valid:
            return self

        # `error_messages` is Optional; avoid surfacing a bare "None" message.
        raise ValueError(resp.error_messages or "Citation not found in context")
|
|
|
|
|
|
# Redefinition of the answer payload for example 2. It is declared again so
# that `answer` is built from the `Statements` class defined immediately
# above (the LLM-backed one) rather than the substring-based one from example 1.
# (Kept as `#` comments: a class docstring would become the schema description.)
class AnswerWithCitaton(BaseModel):
    question: str  # the question being answered
    answer: List[Statements]  # statements, each validated by the LLM check
|
|
|
|
|
|
# Demo for example 2, passing case: chunk 2 contains the quote verbatim, so
# the LLM-backed check is expected to accept it. Running this performs a real
# chat-completion request through `client`.
resp = AnswerWithCitaton.model_validate(
    {
        "question": "What is the capital of France?",
        "answer": [
            {"body": "Paris", "substring_quote": "Paris is the capital of France"},
        ],
    },
    # The validators read this mapping through `ValidationInfo.context`.
    context={
        "text_chunks": {
            1: "Jason is a pirate",
            2: "Paris is the capital of France",
            3: "Irrelevant data",
        }
    },
)
# output: notice that there are no errors
print(resp.model_dump_json(indent=2))
# The bare dict literal below is an expression statement that is evaluated and
# discarded; it only illustrates the data that the print above is expected to show.
{
    "question": "What is the capital of France?",
    "answer": [{"body": "Paris", "substring_quote": "Paris is the capital of France"}],
}
|
|
|
|
# Now we change the text chunk to something else, and we get an error
# (chunk 2 now reads "... is not the capital ...", so the LLM is expected to
# report the citation as missing; the ValidationError is printed, not raised).
try:
    AnswerWithCitaton.model_validate(
        {
            "question": "What is the capital of France?",
            "answer": [
                {"body": "Paris", "substring_quote": "Paris is the capital of France"},
            ],
        },
        # The validators read this mapping through `ValidationInfo.context`.
        context={
            "text_chunks": {
                1: "Jason is a pirate",
                2: "Paris is not the capital of France",
                3: "Irrelevant data",
            }
        },
    )
except ValidationError as e:
    print(e)
|
|
"""
|
|
1 validation error for AnswerWithCitaton
|
|
answer.0
|
|
Value error, Citation not found in context [type=value_error, input_value={'body': 'Paris', 'substr... the capital of France'}, input_type=dict]
|
|
For further information visit https://errors.pydantic.dev/2.4/v/value_error
|
|
"""
|
|
|
|
# Example 3) Using an LLM to verify if the citations and the answers are all aligned
|
|
|
|
|
|
# we keep the same model as above for Statements, but we add a new model for the answer
|
|
# that also verifies that the citations are aligned with the answers
|
|
# Answer payload for example 3. On top of the per-statement validation done by
# `Statements`, the whole answer is sent to an LLM to check that it matches
# the question and the text chunks passed through the validation context
# (`context={"text_chunks": {...}}`).
#
# (Documented with `#` comments on purpose: a class docstring on a pydantic
# model becomes the schema description, which would change the model's schema.)
class AnswerWithCitaton(BaseModel):
    question: str  # the question being answered
    answer: List[Statements]  # statements, each already validated on its own

    @model_validator(mode="after")
    def validate_answer(self, info: ValidationInfo):
        """Ask the LLM whether the answers match the question and the context.

        Performs one chat-completion request per validated instance.

        Returns:
            This instance, when the LLM reports the answer as valid.

        Raises:
            ValueError: When the LLM reports a mismatch; the message is the
                LLM's ``error_messages`` or a generic fallback if the LLM did
                not provide one.
        """
        # `info.context` is None when model_validate() is called without
        # `context=...`; fall back to an empty mapping so the lookup yields
        # None instead of raising AttributeError.
        context = (info.context or {}).get("text_chunks", None)

        resp: Validation = client.chat.completions.create(
            response_model=Validation,
            messages=[
                {
                    "role": "user",
                    "content": f"Does the following answers match the question and the context?\n\nQuestion: {self.question}\n\nAnswer: {self.answer}\n\nContext: {context}",
                }
            ],
            model="gpt-3.5-turbo",
        )

        if resp.is_valid:
            return self

        # `error_messages` is Optional; avoid surfacing a bare "None" message.
        raise ValueError(
            resp.error_messages
            or "The answer does not match the question and context"
        )
|
|
|
|
|
|
"""
|
|
Using LLMs for citation verification is inefficient during runtime.
|
|
However, we can utilize them to create a dataset consisting only of accurate responses
|
|
where citations must be valid (as determined by LLM, fuzzy text search, etc.).
|
|
|
|
This approach would require an initial investment during data generation to obtain
|
|
a fine-tuned model for improved citation.
|
|
"""
|
|
# Demo for example 3: the quote is present verbatim in chunk 2, but the
# statement body ("Texas") does not answer the question, so the answer-level
# LLM check is expected to reject the payload. The ValidationError is printed
# instead of propagating. Running this performs real chat-completion requests.
try:
    AnswerWithCitaton.model_validate(
        {
            "question": "What is the capital of France?",
            "answer": [
                {"body": "Texas", "substring_quote": "Paris is the capital of France"},
            ],
        },
        # The validators read this mapping through `ValidationInfo.context`.
        context={
            "text_chunks": {
                1: "Jason is a pirate",
                2: "Paris is the capital of France",
                3: "Irrelevant data",
            }
        },
    )
except ValidationError as e:
    print(e)
|
|
"""
|
|
1 validation error for AnswerWithCitaton
|
|
Value error, The answer does not match the question and context [type=value_error, input_value={'question': 'What is the...he capital of France'}]}, input_type=dict]
|
|
For further information visit https://errors.pydantic.dev/2.4/v/value_error
|
|
"""
|