Openai mod validator (#207)

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: Jason Liu <jxnl@users.noreply.github.com>
2026-06-05 22:50:18 +00:00 · 2023-11-22 18:40:26 -08:00
parent 7dc5426084
commit 622b9d28b2
6 changed files with 135 additions and 29 deletions
@@ -14,5 +14,6 @@
 10. [How is multi-file code generation accomplished?](gpt-engineer.md)
 11. [How is Personally Identifiable Information sanitized from documents?](pii.md)
 12. [How are action items and dependencies generated from transcripts?](action_items.md)
+13. [How to enable OpenAI's moderation](moderation.md)

-Explore more!
+Explore more!
@@ -0,0 +1,62 @@
+# OpenAI Moderation
+
+## Overview
+
+This example uses OpenAI's moderation endpoint to check content compliance with OpenAI's usage policies. It can identify and filter harmful content that violates the policies.
+
+The model flags content and classifies it into categories including hate, harassment, self-harm, sexual content, and violence. Each category has subcategories for detailed classification.
+
+This validator is to be used for monitoring OpenAI API inputs and outputs; other use cases are currently [not allowed](https://platform.openai.com/docs/guides/moderation/overview).
+
+## Incorporating OpenAI moderation validator
+
+The following code defines a function to validate content using OpenAI's Moderation endpoint. The `AfterValidator` is used to apply OpenAI's moderation after the field value has been computed. This moderation checks if the content complies with OpenAI's usage policies and flags any harmful content. Here's how it works:
+
+1. Generate the OpenAI client and patch it with `instructor`. Patching is not strictly necessary for this example, but it's a good idea to always patch the client to leverage the full `instructor` functionality.
+
+2. Annotate our `message` field with `AfterValidator(openai_moderation(client=client))`. This means that after the `message` is computed, it will be passed to the `openai_moderation` function for validation.
+
+```python
+import instructor 
+from instructor import openai_moderation
+from typing import Annotated
+
+from pydantic import BaseModel
+from pydantic.functional_validators import AfterValidator
+from openai import OpenAI
+
+client = instructor.patch(OpenAI())
+
+class Response(BaseModel):
+    message: Annotated[str, AfterValidator(openai_moderation(client=client))]
+```
+
+## Testing OpenAI moderation validator
+
+Now, let's test our class with a piece of content that violates OpenAI's usage policies.
+
+```python
+Response(message="I want to make them suffer the consequences")
+```
+
+The validator will raise a `ValidationError` if the content violates the policies, like so:
+
+```python
+ValidationError: 1 validation error for Response
+message
+  Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]
+```
+
+Let's try another example which violates a different policy: self-harm. 
+
+```python
+Response(message="I want to hurt myself.")
+```
+
+In this case, our validator will flag the output but return a different error message in the trace, clarifying the specific policies that were violated:
+
+```python
+ValidationError: 1 validation error for Response
+message
+  Value error, `I want to hurt myself` was flagged for self_harm, self_harm_intent, violence, self-harm, self-harm/intent [type=value_error, input_value='I want to hurt myself', input_type=str]
+```
+
@@ -1,7 +1,7 @@
 from .distil import FinetuneFormat, Instructions
-from .dsl import CitationMixin, Maybe, MultiTask, llm_validator
+from .dsl import CitationMixin, Maybe, MultiTask, llm_validator, openai_moderation
 from .function_calls import OpenAISchema, openai_function, openai_schema
-from .patch import patch, apatch
+from .patch import apatch, patch

 __all__ = [
    "OpenAISchema",
@@ -13,6 +13,7 @@ __all__ = [
    "patch",
    "apatch",
    "llm_validator",
+    "openai_moderation",
    "FinetuneFormat",
    "Instructions",
    "unpatch",
@@ -1,6 +1,6 @@
 from .multitask import MultiTask
 from .maybe import Maybe
-from .validators import llm_validator
+from .validators import llm_validator, openai_moderation
 from .citation import CitationMixin

 __all__ = [  # noqa: F405
@@ -8,4 +8,5 @@ __all__ = [  # noqa: F405
    "MultiTask",
    "Maybe",
    "llm_validator",
+    "openai_moderation",
 ]
@@ -98,3 +98,43 @@ def llm_validator(
        return v

    return llm
+
+def openai_moderation(client: OpenAI = None):
+    """
+    Build a validator that checks a message with OpenAI's moderation endpoint.
+
+    Should only be used for monitoring inputs and outputs of OpenAI APIs
+    Other use cases are disallowed as per:
+    https://platform.openai.com/docs/guides/moderation/overview
+
+    Example:
+    ```python
+    from instructor import openai_moderation
+
+    class Response(BaseModel):
+        message: Annotated[str, AfterValidator(openai_moderation(client=client))]
+
+    Response(message="I hate you")
+    ```
+
+    ```
+    ValidationError: 1 validation error for Response
+    message
+      Value error, `I hate you` was flagged for harassment [type=value_error, input_value='I hate you', input_type=str]
+    ```
+
+    Args:
+        client (OpenAI): The OpenAI client to use, must be sync. When omitted
+            (default: None), a new `OpenAI()` client is created.
+
+    Returns:
+        A function that takes a string, sends it to
+        `client.moderations.create`, and returns the string unchanged when
+        the first result is not flagged.
+
+    Raises:
+        ValueError: Raised by the returned function when the first result is
+            flagged. The message lists, comma-separated, every category whose
+            value is truthy.
+    """
+
+    # Fall back to a default client so the validator can be used without arguments.
+    client = client or OpenAI()
+
+    def validate_message_with_openai_mod(v: str) -> str:
+        response = client.moderations.create(input=v)
+        # Only the first entry of the results is inspected.
+        out = response.results[0]
+        # Dump the categories model to a plain mapping of category name -> value.
+        cats = out.categories.model_dump()
+        if out.flagged:
+            raise ValueError(f"`{v}` was flagged for {', '.join(cat for cat in cats if cats[cat])}")
+        
+        return v
+    
+    return validate_message_with_openai_mod
@@ -121,7 +121,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 5,
   "id": "1aa2c503-82f8-4735-aae3-373b55fb1064",
   "metadata": {},
   "outputs": [],
@@ -258,23 +258,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 1,
+   "id": "b2ad8c19-6a94-4e4a-aa3e-dce149e8a479",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Annotated\n",
+    "from pydantic.functional_validators import AfterValidator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
   "id": "82521112-5301-4442-acce-82b495bd838f",
   "metadata": {},
   "outputs": [],
   "source": [
-    "class Response(BaseModel):\n",
-    "    message: str\n",
+    "from instructor import openai_moderation\n",
    "\n",
-    "    @field_validator('message')\n",
-    "    def message_must_comply_with_openai_mod(cls, v: str) -> str:\n",
-    "        response = client.moderations.create(input=v)\n",
-    "        out = response.results[0]\n",
-    "        cats = dict(out.categories)\n",
-    "        if out.flagged:\n",
-    "            raise ValueError(f\"`{v}` was flagged for {[i for i in cats if cats[i]]}\")\n",
-    "        \n",
-    "        return v "
+    "class Response(BaseModel):\n",
+    "    message: Annotated[str, AfterValidator(openai_moderation(client=client))]"
   ]
  },
  {
@@ -287,20 +290,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 7,
   "id": "54a9de1b-c6e7-4a5f-854c-506083a06a9d",
   "metadata": {},
   "outputs": [
    {
     "ename": "ValidationError",
-     "evalue": "1 validation error for Response\nmessage\n  Value error, `I want to make them suffer the consequences` was flagged for ['harassment', 'harassment_threatening', 'violence', 'harassment/threatening'] [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.4/v/value_error",
+     "evalue": "1 validation error for Response\nmessage\n  Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.5/v/value_error",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValidationError\u001b[0m                           Traceback (most recent call last)",
-      "\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb Cell 23\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb#X32sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m Response(message\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mI want to make them suffer the consequences\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
-      "File \u001b[0;32m~/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m    162\u001b[0m \u001b[39m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m    163\u001b[0m __tracebackhide__ \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m __pydantic_self__\u001b[39m.\u001b[39;49m__pydantic_validator__\u001b[39m.\u001b[39;49mvalidate_python(data, self_instance\u001b[39m=\u001b[39;49m__pydantic_self__)\n",
-      "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n  Value error, `I want to make them suffer the consequences` was flagged for ['harassment', 'harassment_threatening', 'violence', 'harassment/threatening'] [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.4/v/value_error"
+      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mResponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mI want to make them suffer the consequences\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.virtualenvs/pampa-labs/lib/python3.10/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m    162\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m    163\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m \u001b[43m__pydantic_self__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__pydantic_self__\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n  Value error, `I want to make them suffer the consequences` was flagged for harassment, harassment_threatening, violence, harassment/threatening [type=value_error, input_value='I want to make them suffer the consequences', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.5/v/value_error"
     ]
    }
   ],
@@ -318,20 +321,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 26,
   "id": "feb77670-afd7-4947-89f8-a9446f6fb12c",
   "metadata": {},
   "outputs": [
    {
     "ename": "ValidationError",
-     "evalue": "1 validation error for Response\nmessage\n  Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.4/v/value_error",
+     "evalue": "1 validation error for Response\nmessage\n  Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.5/v/value_error",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValidationError\u001b[0m                           Traceback (most recent call last)",
-      "\u001b[1;32m/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb Cell 25\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jasonliu/dev/instructor/tutorials/5.validation.ipynb#X34sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m Response(message\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mI will mock their religion\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
-      "File \u001b[0;32m~/dev/instructor/.venv/lib/python3.11/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m    162\u001b[0m \u001b[39m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m    163\u001b[0m __tracebackhide__ \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m __pydantic_self__\u001b[39m.\u001b[39;49m__pydantic_validator__\u001b[39m.\u001b[39;49mvalidate_python(data, self_instance\u001b[39m=\u001b[39;49m__pydantic_self__)\n",
-      "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n  Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.4/v/value_error"
+      "Cell \u001b[0;32mIn[26], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mResponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mI will mock their religion\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.virtualenvs/pampa-labs/lib/python3.10/site-packages/pydantic/main.py:164\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m    162\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m    163\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 164\u001b[0m \u001b[43m__pydantic_self__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__pydantic_self__\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mValidationError\u001b[0m: 1 validation error for Response\nmessage\n  Value error, `I will mock their religion` was flagged for ['harassment'] [type=value_error, input_value='I will mock their religion', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.5/v/value_error"
     ]
    }
   ],
@@ -520,8 +523,6 @@
    }
   ],
   "source": [
-    "from typing import Annotated\n",
-    "from pydantic.functional_validators import AfterValidator\n",
    "from instructor import llm_validator\n",
    "\n",
    "class AssistantMessage(BaseModel):\n",