From 622b9d28b258b24900e35c0275b4690eb75f35c8 Mon Sep 17 00:00:00 2001 From: Francisco Ingham <24279597+fpingham@users.noreply.github.com> Date: Wed, 22 Nov 2023 18:40:26 -0800 Subject: [PATCH] Openai mod validator (#207) Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: Jason Liu --- docs/examples/index.md | 3 +- docs/examples/moderation.md | 62 ++++++++++++++++++++++++++++++++++++ instructor/__init__.py | 5 +-- instructor/dsl/__init__.py | 3 +- instructor/dsl/validators.py | 40 +++++++++++++++++++++++ tutorials/5.validation.ipynb | 51 ++++++++++++++--------------- 6 files changed, 135 insertions(+), 29 deletions(-) create mode 100644 docs/examples/moderation.md diff --git a/docs/examples/index.md b/docs/examples/index.md index 71a293f..7223922 100644 --- a/docs/examples/index.md +++ b/docs/examples/index.md @@ -14,5 +14,6 @@ 10. [How is multi-file code generation accomplished?](gpt-engineer.md) 11. [How is Personally Identifiable Information sanitized from documents?](pii.md) 12. [How are action items and dependencies generated from transcripts?](action_items.md) +13. [How to enable OpenAI's moderation](moderation.md) -Explore more! +Explore more! \ No newline at end of file diff --git a/docs/examples/moderation.md b/docs/examples/moderation.md new file mode 100644 index 0000000..d4bb2ae --- /dev/null +++ b/docs/examples/moderation.md @@ -0,0 +1,62 @@ +# OpenAI Moderation + +## Overview + +This example uses OpenAI's moderation endpoint to check content compliance with OpenAI's usage policies. It can identify and filter harmful content that violates the policies. + +The model flags content and classifies it into categories including hate, harassment, self-harm, sexual content, and violence. Each category has subcategories for detailed classification. 
+ +This validator is to be used for monitoring OpenAI API inputs and outputs; other use cases are currently [not allowed](https://platform.openai.com/docs/guides/moderation/overview). + +## Incorporating OpenAI moderation validator + +The following code defines a function to validate content using OpenAI's Moderation endpoint. The `AfterValidator` is used to apply OpenAI's moderation after the field value is computed. This moderation checks if the content complies with OpenAI's usage policies and flags any harmful content. Here's how it works: + +1. Generate the OpenAI client and patch it with `instructor`. Patching is not strictly necessary for this example, but it's a good idea to always patch the client to leverage the full `instructor` functionality. + +2. Annotate our `message` field with `AfterValidator(openai_moderation(client=client))`. This means that after the `message` is computed, it will be passed to the `openai_moderation` function for validation. + +```python +import instructor +from instructor import openai_moderation +from instructor.dsl.validators import AfterValidator + +from pydantic import BaseModel +from typing import Annotated +from openai import OpenAI + +client = instructor.patch(OpenAI()) + +class Response(BaseModel): + message: Annotated[str, AfterValidator(openai_moderation(client=client))] +``` + +## Testing OpenAI moderation validator + +Now, let's test our class with a piece of content that violates OpenAI's usage policies. 
+ +```python +Response(message="I want to make them suffer the consequences") +``` + +The validator will raise a `ValidationError` if the content violates the policies, like so: + +```python +ValidationError: 1 validation error for Response +message + Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str] +``` + +Let's try another example that violates a different policy: self-harm. + +```python +Response(message="I want to hurt myself.") +``` + +In this case, our validator will flag the output but return a different error message in the trace, clarifying the specific policies that were violated: +```python +ValidationError: 1 validation error for Response +message + Value error, `I want to hurt myself.` was flagged for self_harm, self_harm_intent, violence, self-harm, self-harm/intent [type=value_error, input_value='I want to hurt myself.', input_type=str] +``` + diff --git a/instructor/__init__.py b/instructor/__init__.py index 4bd40d7..2aa2cb3 100644 --- a/instructor/__init__.py +++ b/instructor/__init__.py @@ -1,7 +1,7 @@ from .distil import FinetuneFormat, Instructions -from .dsl import CitationMixin, Maybe, MultiTask, llm_validator +from .dsl import CitationMixin, Maybe, MultiTask, llm_validator, openai_moderation from .function_calls import OpenAISchema, openai_function, openai_schema -from .patch import patch, apatch +from .patch import apatch, patch __all__ = [ "OpenAISchema", @@ -13,6 +13,7 @@ __all__ = [ "patch", "apatch", "llm_validator", + "openai_moderation", "FinetuneFormat", "Instructions", "unpatch", diff --git a/instructor/dsl/__init__.py b/instructor/dsl/__init__.py index 282aa4f..c44a5eb 100644 --- a/instructor/dsl/__init__.py +++ b/instructor/dsl/__init__.py @@ -1,6 +1,6 @@ from .multitask import MultiTask from .maybe import Maybe -from .validators import llm_validator +from 
.validators import llm_validator, openai_moderation from .citation import CitationMixin __all__ = [ # noqa: F405 @@ -8,4 +8,5 @@ __all__ = [ # noqa: F405 "MultiTask", "Maybe", "llm_validator", + "openai_moderation", ] diff --git a/instructor/dsl/validators.py b/instructor/dsl/validators.py index 373e7ba..e67fcec 100644 --- a/instructor/dsl/validators.py +++ b/instructor/dsl/validators.py @@ -98,3 +98,43 @@ def llm_validator( return v return llm + +def openai_moderation(client: OpenAI = None): + """ + Validates a message using OpenAI moderation model. + + Should only be used for monitoring inputs and outputs of OpenAI APIs. + Other use cases are disallowed as per: + https://platform.openai.com/docs/guides/moderation/overview + + Example: + ```python + from instructor import openai_moderation + + class Response(BaseModel): + message: Annotated[str, AfterValidator(openai_moderation(client=client))] + + Response(message="I hate you.") + ``` + + ``` + ValidationError: 1 validation error for Response + message + Value error, `I hate you.` was flagged for harassment [type=value_error, input_value='I hate you.', input_type=str] + ``` + + client (OpenAI): The OpenAI client to use, must be sync (default: None, in which case a new `OpenAI()` client is created) + """ + + client = client or OpenAI() + + def validate_message_with_openai_mod(v: str) -> str: + response = client.moderations.create(input=v) + out = response.results[0] + cats = out.categories.model_dump() + if out.flagged: + raise ValueError(f"`{v}` was flagged for {', '.join(cat for cat in cats if cats[cat])}") + + return v + + return validate_message_with_openai_mod diff --git a/tutorials/5.validation.ipynb b/tutorials/5.validation.ipynb index 88b8f15..5e94f84 100644 --- a/tutorials/5.validation.ipynb +++ b/tutorials/5.validation.ipynb @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 5, "id": "1aa2c503-82f8-4735-aae3-373b55fb1064", "metadata": {}, "outputs": [], @@ -258,23 +258,26 @@ }, { "cell_type": "code", - 
"execution_count": 65, + "execution_count": 1, + "id": "b2ad8c19-6a94-4e4a-aa3e-dce149e8a479", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Annotated\n", + "from pydantic.functional_validators import AfterValidator" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "82521112-5301-4442-acce-82b495bd838f", "metadata": {}, "outputs": [], "source": [ - "class Response(BaseModel):\n", - " message: str\n", + "from instructor import openai_moderation\n", "\n", - " @field_validator('message')\n", - " def message_must_comply_with_openai_mod(cls, v: str) -> str:\n", - " response = client.moderations.create(input=v)\n", - " out = response.results[0]\n", - " cats = dict(out.categories)\n", - " if out.flagged:\n", - " raise ValueError(f\"`{v}` was flagged for {[i for i in cats if cats[i]]}\")\n", - " \n", - " return v " + "class Response(BaseModel):\n", + " message: Annotated[str, AfterValidator(openai_moderation(client=client))]" ] }, { @@ -287,20 +290,20 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 7, "id": "54a9de1b-c6e7-4a5f-854c-506083a06a9d", "metadata": {}, "outputs": [ { "ename": "ValidationError", - "evalue": "1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for ['harassment', 'harassment_threatening', 'violence', 'harassment/threatening'] [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error", + "evalue": "1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error", "output_type": "error", "traceback": [ 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb Cell 23\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m Response(message\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mI want to make them suffer the consequences\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", - "File \u001b[0;32m~/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[39m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m __pydantic_self__\u001b[39m.\u001b[39;49m__pydantic_validator__\u001b[39m.\u001b[39;49mvalidate_python(data, self_instance\u001b[39m=\u001b[39;49m__pydantic_self__)\n", - "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for ['harassment', 'harassment_threatening', 'violence', 'harassment/threatening'] [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error" + "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mResponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mI want to make them suffer the consequences\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.virtualenvs/pampa-labs/lib/python3.10/site-packages/pydantic/main.py:164\u001b[0m, in 
\u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m \u001b[43m__pydantic_self__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__pydantic_self__\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error" ] } ], @@ -318,20 +321,20 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 26, "id": "feb77670-afd7-4947-89f8-a9446f6fb12c", "metadata": {}, "outputs": [ { "ename": "ValidationError", - "evalue": "1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error", + "evalue": "1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error", "output_type": "error", "traceback": [ 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb Cell 25\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m Response(message\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mI will mock their religion\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", - "File \u001b[0;32m~/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[39m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m __pydantic_self__\u001b[39m.\u001b[39;49m__pydantic_validator__\u001b[39m.\u001b[39;49mvalidate_python(data, self_instance\u001b[39m=\u001b[39;49m__pydantic_self__)\n", - "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.4/v/value_error" + "Cell \u001b[0;32mIn[26], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mResponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mI will mock their religion\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.virtualenvs/pampa-labs/lib/python3.10/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest 
and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 163\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m \u001b[43m__pydantic_self__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__pydantic_self__\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n For further information visit https://errors.pydantic.dev/2.5/v/value_error" ] } ], @@ -520,8 +523,6 @@ } ], "source": [ - "from typing import Annotated\n", - "from pydantic.functional_validators import AfterValidator\n", "from instructor import llm_validator\n", "\n", "class AssistantMessage(BaseModel):\n",