Openai mod validator (#207)

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Co-authored-by: Jason Liu <jxnl@users.noreply.github.com>
This commit is contained in:
Francisco Ingham
2023-11-22 18:40:26 -08:00
committed by GitHub
parent 7dc5426084
commit 622b9d28b2
6 changed files with 135 additions and 29 deletions
+2 -1
View File
@@ -14,5 +14,6 @@
10. [How is multi-file code generation accomplished?](gpt-engineer.md)
11. [How is Personally Identifiable Information sanitized from documents?](pii.md)
12. [How are action items and dependencies generated from transcripts?](action_items.md)
13. [How to enable OpenAI's moderation](moderation.md)
Explore more!
Explore more!
+62
View File
@@ -0,0 +1,62 @@
# OpenAI Moderation
## Overview
This example uses OpenAI's moderation endpoint to check content compliance with OpenAI's usage policies. It can identify and filter harmful content that violates the policies.
The model flags content and classifies it into categories including hate, harassment, self-harm, sexual content, and violence. Each category has subcategories for detailed classification.
This validator is to be used only for monitoring OpenAI API inputs and outputs; other use cases are currently [not allowed](https://platform.openai.com/docs/guides/moderation/overview).
## Incorporating OpenAI moderation validator
The following code shows how to validate content using OpenAI's Moderation endpoint. `AfterValidator` is used to apply OpenAI's moderation after the field value has been computed. This moderation checks if the content complies with OpenAI's usage policies and flags any harmful content. Here's how it works:
1. Generate the OpenAI client and patch it with `instructor`. Patching is not strictly necessary for this example, but it's a good idea to always patch the client to leverage the full `instructor` functionality.
2. Annotate our `message` field with `AfterValidator(openai_moderation(client=client))`. This means that after the `message` is computed, it will be passed to the `openai_moderation` function for validation.
```python
import instructor
from instructor import openai_moderation
from pydantic.functional_validators import AfterValidator
from pydantic import BaseModel
from typing import Annotated
from openai import OpenAI
client = instructor.patch(OpenAI())
class Response(BaseModel):
message: Annotated[str, AfterValidator(openai_moderation(client=client))]
```
## Testing OpenAI moderation validator
Now, let's test our class with a piece of content that violates OpenAI's usage policies.
```python
Response(message="I want to make them suffer the consequences")
```
The validator will raise a `ValidationError` if the content violates the policies, like so:
```python
ValidationError: 1 validation error for Response
message
Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]
```
Let's try another example which violates a different policy: self-harm.
```python
Response(message="I want to hurt myself")
```
In this case, our validator will flag the output but return a different error message in the trace, clarifying the specific policies that were violated:
```python
ValidationError: 1 validation error for Response
message
Value error, `I want to hurt myself` was flagged for self_harm, self_harm_intent, violence, self-harm, self-harm/intent [type=value_error, input_value='I want to hurt myself', input_type=str]
```
+3 -2
View File
@@ -1,7 +1,7 @@
from .distil import FinetuneFormat, Instructions
from .dsl import CitationMixin, Maybe, MultiTask, llm_validator
from .dsl import CitationMixin, Maybe, MultiTask, llm_validator, openai_moderation
from .function_calls import OpenAISchema, openai_function, openai_schema
from .patch import patch, apatch
from .patch import apatch, patch
__all__ = [
"OpenAISchema",
@@ -13,6 +13,7 @@ __all__ = [
"patch",
"apatch",
"llm_validator",
"openai_moderation",
"FinetuneFormat",
"Instructions",
"unpatch",
+2 -1
View File
@@ -1,6 +1,6 @@
from .multitask import MultiTask
from .maybe import Maybe
from .validators import llm_validator
from .validators import llm_validator, openai_moderation
from .citation import CitationMixin
__all__ = [ # noqa: F405
@@ -8,4 +8,5 @@ __all__ = [ # noqa: F405
"MultiTask",
"Maybe",
"llm_validator",
"openai_moderation",
]
+40
View File
@@ -98,3 +98,43 @@ def llm_validator(
return v
return llm
def openai_moderation(client: OpenAI = None):
    """
    Build a validator that checks a message with OpenAI's moderation endpoint.

    Should only be used for monitoring inputs and outputs of OpenAI APIs.
    Other use cases are disallowed as per:
    https://platform.openai.com/docs/guides/moderation/overview

    Example:
    ```python
    from instructor import openai_moderation

    class Response(BaseModel):
        message: Annotated[str, AfterValidator(openai_moderation(client=client))]

    Response(message="I hate you")
    ```

    ```
    ValidationError: 1 validation error for Response
    message
    Value error, `I hate you` was flagged for harassment [type=value_error, input_value='I hate you', input_type=str]
    ```

    Args:
        client (OpenAI): The OpenAI client to use, must be sync. When omitted
            (default: None), a new `OpenAI()` client is created.

    Returns:
        A function that takes the message string and returns it unchanged
        when the moderation result is not flagged.

    Raises:
        ValueError: Raised by the returned function when the moderation
            result is flagged; the message lists the flagged categories.
    """
    # Explicit None check: only a missing client triggers the default.
    if client is None:
        client = OpenAI()

    def validate_message_with_openai_mod(v: str) -> str:
        response = client.moderations.create(input=v)
        out = response.results[0]
        if not out.flagged:
            return v
        # Categories are only needed to build the error message.
        cats = out.categories.model_dump()
        flagged = ", ".join(cat for cat, hit in cats.items() if hit)
        raise ValueError(f"`{v}` was flagged for {flagged}")

    return validate_message_with_openai_mod
+26 -25
View File
@@ -121,7 +121,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 5,
"id": "1aa2c503-82f8-4735-aae3-373b55fb1064",
"metadata": {},
"outputs": [],
@@ -258,23 +258,26 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 1,
"id": "b2ad8c19-6a94-4e4a-aa3e-dce149e8a479",
"metadata": {},
"outputs": [],
"source": [
"from typing import Annotated\n",
"from pydantic.functional_validators import AfterValidator"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "82521112-5301-4442-acce-82b495bd838f",
"metadata": {},
"outputs": [],
"source": [
"class Response(BaseModel):\n",
" message: str\n",
"from instructor import openai_moderation\n",
"\n",
" @field_validator('message')\n",
" def message_must_comply_with_openai_mod(cls, v: str) -> str:\n",
" response = client.moderations.create(input=v)\n",
" out = response.results[0]\n",
" cats = dict(out.categories)\n",
" if out.flagged:\n",
" raise ValueError(f\"`{v}` was flagged for {[i for i in cats if cats[i]]}\")\n",
" \n",
" return v "
"class Response(BaseModel):\n",
" message: Annotated[str, AfterValidator(openai_moderation(client=client))]"
]
},
{
@@ -287,20 +290,20 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 7,
"id": "54a9de1b-c6e7-4a5f-854c-506083a06a9d",
"metadata": {},
"outputs": [
{
"ename": "ValidationError",
"evalue": "1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for ['harassment', 'harassment_threatening', 'violence', 'harassment/threatening'] [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error",
"evalue": "1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb Cell 23\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb#X32sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m Response(message\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mI want to make them suffer the consequences\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[0;32m~/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[39m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m __pydantic_self__\u001b[39m.\u001b[39;49m__pydantic_validator__\u001b[39m.\u001b[39;49mvalidate_python(data, self_instance\u001b[39m=\u001b[39;49m__pydantic_self__)\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for ['harassment', 'harassment_threatening', 'violence', 'harassment/threatening'] [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error"
"Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mResponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mI want to make them suffer the consequences\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.virtualenvs/pampa-labs/lib/python3.10/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m \u001b[43m__pydantic_self__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__pydantic_self__\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error"
]
}
],
@@ -318,20 +321,20 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 26,
"id": "feb77670-afd7-4947-89f8-a9446f6fb12c",
"metadata": {},
"outputs": [
{
"ename": "ValidationError",
"evalue": "1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error",
"evalue": "1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb Cell 25\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb#X34sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m Response(message\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mI will mock their religion\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[0;32m~/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[39m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m __pydantic_self__\u001b[39m.\u001b[39;49m__pydantic_validator__\u001b[39m.\u001b[39;49mvalidate_python(data, self_instance\u001b[39m=\u001b[39;49m__pydantic_self__)\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error"
"Cell \u001b[0;32mIn[26], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mResponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mI will mock their religion\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.virtualenvs/pampa-labs/lib/python3.10/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m \u001b[43m__pydantic_self__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__pydantic_self__\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error"
]
}
],
@@ -520,8 +523,6 @@
}
],
"source": [
"from typing import Annotated\n",
"from pydantic.functional_validators import AfterValidator\n",
"from instructor import llm_validator\n",
"\n",
"class AssistantMessage(BaseModel):\n",