improve documentation

2026-06-05 22:50:18 +00:00 · 2023-09-13 13:39:09 -04:00
parent e025eaa64f
commit 61cff08909
17 changed files with 453 additions and 145 deletions
@@ -0,0 +1,137 @@
+# Entity Resolution and Visualization for Legal Documents
+
+In this guide, we demonstrate how to extract and resolve entities from a sample legal contract. Then, we visualize these entities and their dependencies as an entity graph. This approach can be invaluable for legal tech applications, aiding in the understanding of complex documents.
+
+!!! tip "Motivation"
+    Legal contracts are full of intricate details and interconnected clauses. Automatically extracting and visualizing these elements can make it easier to understand the document's overall structure and terms.
+
+## Defining the Data Structures
+
+The **`Entity`** and **`Property`** classes model extracted entities and their attributes. **`DocumentExtraction`** encapsulates a list of these entities.
+
+```python
+from pydantic import BaseModel, Field
+from typing import List
+
+
+class Property(BaseModel):  # One key/value attribute attached to an extracted entity
+    key: str  # attribute name
+    value: str  # attribute value as found for the entity
+    resolved_absolute_value: str  # value shown in the graph label; presumably `value` resolved to an absolute form
+
+
+class Entity(BaseModel):
+    id: int = Field(
+        ...,
+        description="Unique identifier for the entity, used for deduplication, design a scheme allows multiple entities",
+    )
+    subquote_string: List[str] = Field(
+        ...,
+        description="Correctly resolved value of the entity, if the entity is a reference to another entity, this should be the id of the referenced entity, include a few more words before and after the value to allow for some context to be used in the resolution",
+    )
+    entity_title: str
+    properties: List[Property] = Field(
+        ..., description="List of properties of the entity"
+    )
+    dependencies: List[int] = Field(
+        ...,
+        description="List of entity ids that this entity depends  or relies on to resolve it",
+    )
+
+
+class DocumentExtraction(BaseModel):
+    entities: List[Entity] = Field(
+        ...,
+        description="Body of the answer, each fact should be its seperate object with a body and a list of sources",
+    )
+```
+
+## Entity Extraction and Resolution
+
+The **`ask_ai`** function utilizes OpenAI's API to extract and resolve entities from the input content.
+
+```python
+import openai
+import instructor
+
+instructor.patch()
+
+def ask_ai(content) -> DocumentExtraction:
+    return openai.ChatCompletion.create(
+        model="gpt-4",
+        response_model=DocumentExtraction,
+        messages=[
+            {
+                "role": "system",
+                "content": "Extract and resolve a list of entities from the following document:",
+            },
+            {
+                "role": "user",
+                "content": content,
+            },
+        ],
+    )  # type: ignore
+```
+
+## Graph Visualization
+
+**`generate_graph`** takes the extracted entities and visualizes them using Graphviz. It creates nodes for each entity and edges for their dependencies.
+
+```python
+from graphviz import Digraph
+
+def generate_html_label(entity: Entity) -> str:
+    rows = [f"<tr><td>{prop.key}</td><td>{prop.resolved_absolute_value}</td></tr>" for prop in entity.properties]
+    table_rows = "".join(rows)
+    return f"<table border='0' cellborder='1' cellspacing='0'><tr><td colspan='2'><b>{entity.entity_title}</b></td></tr>{table_rows}</table>>"
+
+def generate_graph(data: DocumentExtraction):
+    dot = Digraph(comment="Entity Graph", node_attr={"shape": "plaintext"})
+    
+    for entity in data.entities:
+        label = generate_html_label(entity)
+        dot.node(str(entity.id), label)
+    
+    for entity in data.entities:
+        for dep_id in entity.dependencies:
+            dot.edge(str(entity.id), str(dep_id))
+    
+    dot.render("entity.gv", view=True)
+```
+
+## Execution
+
+Finally, execute the code to visualize the entity graph for the sample legal contract.
+
+```python
+content = """
+Sample Legal Contract
+Agreement Contract
+
+This Agreement is made and entered into on 2020-01-01 by and between Company A ("the Client") and Company B ("the Service Provider").
+
+Article 1: Scope of Work
+
+The Service Provider will deliver the software product to the Client 30 days after the agreement date.
+
+Article 2: Payment Terms
+
+The total payment for the service is $50,000.
+An initial payment of $10,000 will be made within 7 days of the the signed date.
+The final payment will be due 45 days after [SignDate].
+
+Article 3: Confidentiality
+
+The parties agree not to disclose any confidential information received from the other party for 3 months after the final payment date.
+
+Article 4: Termination
+
+The contract can be terminated with a 30-day notice, unless there are outstanding obligations that must be fulfilled after the [DeliveryDate].
+"""  # Your legal contract here
+model = ask_ai(content)  # extract and resolve the contract's entities
+generate_graph(model)  # render the entity dependency graph
+```
+
+This will produce a graphical representation of the entities and their dependencies, stored as "entity.gv". 
+
+![Entity Graph](entity_resolution.png)
@@ -6,13 +6,18 @@

 - [Self-Assessment via Validators](self_critique.md): Implement AI self-assessment with `llm_validator`.

+- [Citations via Regex](exact_citations.md): Retrieve exact citations using regular expressions and smart prompting.
+
 - [Extracting Search Queries](search.md): Segment search queries through function calling and multi-task definitions.

+- [Generating Knowledge Graphs](knowledge_graph.md): Generate knowledge graphs from a question.
+
 - [Query Decomposition](planning-tasks.md): Decompose complex queries into subqueries in a single request.

+- [Entity Extraction and Resolution](entity_resolution.md): Extract and resolve entities from a document.
+
 - [Working with Recursive Schemas](recursive.md): Implement and understand recursive schemas.

- [Citations via Regex](exact_citations.md): Retrieve exact citations using regular expressions and smart prompting.

 - [Table Extraction from Text](autodataframe.md): Extract tables, potentially multiple, automatically from textual data.

@@ -0,0 +1,87 @@
+# Visualizing Knowledge Graphs for Complex Topics
+
+In this guide, you'll discover how to visualize a detailed knowledge graph for understanding complex topics, in this case, quantum mechanics. We leverage OpenAI's API and the Graphviz library to bring structure to intricate subjects.
+
+!!! tip "Motivation"
+    Knowledge graphs offer a visually appealing and coherent way to understand complicated topics like quantum mechanics. By generating these graphs automatically, you can accelerate the learning process and make it easier to digest complex information.
+
+## Defining the Structures
+
+Let's model a knowledge graph with **`Node`** and **`Edge`** objects. **`Node`** objects represent key concepts or entities, while **`Edge`** objects indicate the relationships between them.
+
+```python
+from pydantic import BaseModel, Field
+from typing import List
+
+class Node(BaseModel):  # A concept or entity in the knowledge graph
+    id: int  # node identifier; stringified to name the Graphviz node
+    label: str  # text drawn for the node
+    color: str  # color passed to `dot.node`
+
+class Edge(BaseModel):  # A directed relationship between two nodes
+    source: int  # id of the node the edge starts from
+    target: int  # id of the node the edge points to
+    label: str  # text drawn along the edge
+    color: str = "black"  # color passed to `dot.edge`; black unless specified
+
+class KnowledgeGraph(BaseModel):
+    nodes: List[Node] = Field(..., default_factory=list)
+    edges: List[Edge] = Field(..., default_factory=list)
+```
+
+## Generating Knowledge Graphs
+
+The **`generate_graph`** function leverages OpenAI's API to generate a knowledge graph based on the input query.
+
+```python
+import openai
+
+def generate_graph(input) -> KnowledgeGraph:
+    return openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": f"Help me understand the following by describing it as a detailed knowledge graph: {input}",
+            }
+        ],
+        response_model=KnowledgeGraph,
+    )  # type: ignore
+```
+
+## Visualizing the Graph
+
+The **`visualize_knowledge_graph`** function uses the Graphviz library to render the generated knowledge graph.
+
+```python
+from graphviz import Digraph
+
+def visualize_knowledge_graph(kg: KnowledgeGraph):
+    dot = Digraph(comment="Knowledge Graph")
+
+    # Add nodes
+    for node in kg.nodes:
+        dot.node(str(node.id), node.label, color=node.color)
+
+    # Add edges
+    for edge in kg.edges:
+        dot.edge(str(edge.source), str(edge.target), label=edge.label, color=edge.color)
+
+    # Render the graph
+    dot.render("knowledge_graph.gv", view=True)
+```
+
+## Putting It All Together
+
+Execute the code to generate and visualize a knowledge graph for understanding quantum mechanics.
+
+```python
+graph: KnowledgeGraph = generate_graph("Teach me about quantum mechanics")  # build the graph from a natural-language request
+visualize_knowledge_graph(graph)  # render it with Graphviz
+```
+
+![Knowledge Graph](knowledge_graph.png)
+
+This will produce a visual representation of the knowledge graph, stored as "knowledge_graph.gv". You can open this file to explore the key concepts and their relationships in quantum mechanics.
+
+By leveraging automated knowledge graphs, you can dissect complex topics into digestible pieces, making the learning journey less daunting and more effective.
@@ -3,7 +3,7 @@ from stats_dict import stats_dict
 import json

 # Sample data
-query_data = {i: line for i, line in enumerate(open("test.jsonl", "r"))}
+query_data = {i: line.strip() for i, line in enumerate(open("test.jsonl", "r"))}

 # Initialize selected keys
 selected_keys = {}
@@ -27,15 +27,13 @@ def render_dropdown_and_button(stats_key):
    st.subheader("Histogram")
    st.bar_chart(stats_dict[stats_key]["counter"], use_container_width=True)

-    st.subheader("Select keys to view lines")
    options = list(stats_dict[stats_key]["counter"].keys())
    selected_keys[stats_key] = st.multiselect(
        f"View samples with {stats_key}",
        options,
        default=selected_keys.get(stats_key, []),
    )
-    if st.button(f"Show Selected for {stats_key}"):
-        st.code(get_lines(stats_key, selected_keys[stats_key]))
+    st.code(get_lines(stats_key, selected_keys[stats_key]))


 # Sidebar for navigation
@@ -46,7 +44,7 @@ page = st.sidebar.selectbox(
 )

 # Main Streamlit App
-st.title("Query Data Visualizer")
+st.title("Structured Output Evaluation")

 # Validation Stats
 if page == "Validation Stats":
@@ -0,0 +1,57 @@
+from graphviz import Digraph
+from pydantic import BaseModel, Field
+from typing import List
+import openai
+import instructor
+
+instructor.patch()
+
+
+class Node(BaseModel):  # A concept or entity in the knowledge graph
+    id: int  # node identifier; stringified to name the Graphviz node
+    label: str  # text drawn for the node
+    color: str  # color passed to `dot.node`
+
+
+class Edge(BaseModel):  # A directed relationship between two nodes
+    source: int  # id of the node the edge starts from
+    target: int  # id of the node the edge points to
+    label: str  # text drawn along the edge
+    color: str = "black"  # color passed to `dot.edge`; black unless specified
+
+
+class KnowledgeGraph(BaseModel):
+    nodes: List[Node] = Field(..., default_factory=list)
+    edges: List[Edge] = Field(..., default_factory=list)
+
+
+def generate_graph(input) -> KnowledgeGraph:
+    return openai.ChatCompletion.create(
+        model="gpt-3.5-turbo-16k",
+        messages=[
+            {
+                "role": "user",
+                "content": f"Help me understand following by describing as a detailed knowledge graph: {input}",
+            }
+        ],
+        response_model=KnowledgeGraph,
+    )  # type: ignore
+
+
+def visualize_knowledge_graph(kg: KnowledgeGraph):
+    dot = Digraph(comment="Knowledge Graph")
+
+    # Add nodes
+    for node in kg.nodes:
+        dot.node(str(node.id), node.label, color=node.color)
+
+    # Add edges
+    for edge in kg.edges:
+        dot.edge(str(edge.source), str(edge.target), label=edge.label, color=edge.color)
+
+    # Render the graph
+    dot.render("knowledge_graph.gv", view=True)
+
+
+graph: KnowledgeGraph = generate_graph("Teach me about quantum mechanics")  # build the graph from a natural-language request
+visualize_knowledge_graph(graph)  # render it with Graphviz
@@ -0,0 +1,119 @@
+from typing import List, Optional
+import instructor
+from pydantic import BaseModel, Field
+import openai
+import enum
+
+instructor.patch()
+
+
+class Action(enum.Enum):  # Operations that can be requested on a task
+    CREATE = "create_task"
+    DELETE = "close_task"  # NOTE(review): member is named DELETE but its value is "close_task"
+    UPDATE = "update_task"
+
+
+class Projects(enum.Enum):  # Closed set of projects a task may be assigned to
+    FRONTLINE_QA_AI = "frontline_qa_ai"
+    FUTURE_OF_PROGRAMMING = "future_of_programming"
+    PERSONAL_SITE = "personal_site"
+    NORDIC_HAMSTRING_CURLS = "nordic_hamstring_curls"
+
+
+class Buckets(enum.Enum):  # Closed set of buckets a task may be filed under
+    FINANCE = "finance"
+    PURVIEW_OPERATIONS = "purview_operations"
+    TASKBOT = "taskbot"
+    CHECKBOT = "checkbot"
+    NIGHT_HACKING = "night_hacking"
+    TICKLER = "tickler"
+
+
+class TaskAction(BaseModel):
+    id: int
+    method: Action = Field(
+        description="Method of creating, for closing a task the task, to close a task only a id is required"
+    )
+    waiting_on: Optional[List[int]] = Field(
+        None, description="IDs of tasks that this task is waiting on"
+    )
+    name: Optional[str] = Field(None, description="Name of the task")
+    notes: Optional[str] = Field(None, description="Notes about the task")
+    bucket: Optional[Buckets] = Field(
+        None, description="Bucket of the task, to set, or update"
+    )
+    project: Optional[Projects] = Field(
+        None, description="Project of the task, to set, or update"
+    )
+
+
+class Response(BaseModel):
+    text: str = Field(description="The text of the response")
+    task_action: Optional[List[TaskAction]] = Field(
+        description="The action to take on the task"
+    )
+
+
+initial_messages = [
+    {
+        "role": "system",
+        "content": "You are an AI assistant. have the ability to create, update, and close tasks.",
+    },
+    {
+        "role": "assistant",
+        "content": """
+        The task is below. When assisting the user, reference the details from this task.
+
+        [BEGIN TASK]
+            id: 23
+            Name: Create 10 new GIFs
+            Description: Create 10 new GIFs for the Taskbot page on the user's personal site. They should be similar to the existing GIFs, but with different use cases.
+            Projects: Personal site
+            Buckets: Taskbot
+            Updates:
+        [BEGIN UPDATE]
+            **User Update - September 01, 2023 03:58:00 PM EDT**
+            The user plans to create the GIFs in the background as they work through their daily tasks. They aim to produce about one to two GIFs per day. If this plan doesn't work, they will reconsider their strategy.
+        [END UPDATE]
+        [END TASK]
+    """,
+    },
+    {"role": "assistant", "content": "What's up with this task?"},
+    {
+        "role": "user",
+        "content": "Change it to 20, then make a new task for when its done make 20 more that moves.",
+    },
+]
+
+response: Response = openai.ChatCompletion.create(  # `response_model` is handled by the instructor patch applied above
+    messages=initial_messages, response_model=Response, model="gpt-4"
+)  # type: ignore
+
+print(response.model_dump_json(indent=2))  # sample output is reproduced in the string below
+"""
+{
+  "text": "Updating task to create 20 GIFs and creating a new task to create an additional 20 animated GIFs after the initial task is done.",
+  "task_action": [
+    {
+      "id": 23,
+      "method": "update_task",
+      "waiting_on": null,
+      "name": "Create 20 new GIFs",
+      "notes": "The user increased the number of GIFs from 10 to 20. They plan to create these as they work through their daily tasks, creating about one to two GIFs per day. If this plan doesn't work, they will reconsider their strategy.",
+      "bucket": "taskbot",
+      "project": "personal_site"
+    },
+    {
+      "id": 24,
+      "method": "create_task",
+      "waiting_on": [
+        23
+      ],
+      "name": "Create 20 new animated GIFs",
+      "notes": "The task will be initiated once the task with id 23 is completed.",
+      "bucket": "taskbot",
+      "project": "personal_site"
+    }
+  ]
+}
+"""
@@ -1,20 +0,0 @@
-# Legal Document Entity Resolution
-
-This example demonstrates how to use an entity resolution system to extract and resolve entities from a legal document. The system leverages OpenAI's GPT-4 language model to achieve this task. The primary purpose of this example is to showcase the capabilities of the entity resolution system in a simple and illustrative manner.
-
-## Overview
-The entity resolution system processes a given legal document and identifies key entities such as parties, dates, terms, and clauses. It then resolves relevant information to provide a structured output. This example uses a Python script to interact with the system and demonstrates the process with a sample legal contract.
-
-## How to Use
-
-* **Input Document:** Provide the legal document you want to analyze. The document should include relevant legal terms, dates, parties' names, and other pertinent information.
-
-* **Entity Extraction:** The system employs the GPT-4 model to extract entities from the input document.
-
-* **Entity Resolution:** Extracted entities are resolved to their absolute values when applicable. For instance, relative date phrases are converted to specific dates.
-
-* **Dependency Handling:** The system identifies dependencies between entities. If one entity's resolution depends on another's, it ensures proper order of resolution.
-
-## Limitations
-
-The context window is the biggest limitation of the size of document, but I imagine a system where you stream chunks of the document into a model, that acculimates the entities in some state and formats a simple version back into the prompt (id, name, absolute_resolved_value) and the output emits only 'new' entities, thinking of it as a acculilating the object.
@@ -1,10 +1,13 @@
 from typing import List
-from instructor import patch
+from graphviz import Digraph
 from pydantic import BaseModel, Field

+import instructor
 import openai

-patch()
+# Patch openai to use instructor
+# allows for response_model
+instructor.patch()


 class Property(BaseModel):
@@ -57,6 +60,36 @@ def ask_ai(content) -> DocumentExtraction:
    return resp


+def generate_html_label(entity: Entity) -> str:
+    rows = [
+        f"<tr><td>{prop.key}</td><td>{prop.resolved_absolute_value}</td></tr>"
+        for prop in entity.properties
+    ]
+    table_rows = "".join(rows)
+    return f"""<
+    <table border="0" cellborder="1" cellspacing="0">
+    <tr><td colspan="2"><b>{entity.entity_title}</b></td></tr>
+    {table_rows}
+    </table>>"""
+
+
+def generate_graph(data: DocumentExtraction):
+    dot = Digraph(comment="Entity Graph", node_attr={"shape": "plaintext"})
+
+    # Add nodes
+    for entity in data.entities:
+        label = generate_html_label(entity)
+        dot.node(str(entity.id), label)
+
+    # Add edges
+    for entity in data.entities:
+        for dep_id in entity.dependencies:
+            dot.edge(str(entity.id), str(dep_id))
+
+    # Render graph
+    dot.render("entity.gz", view=True)
+
+
 content = """
 Sample Legal Contract
 Agreement Contract
@@ -83,116 +116,4 @@ The contract can be terminated with a 30-day notice, unless there are outstandin
 """

 model = ask_ai(content)
-print(model.model_dump_json(indent=2))
-
-"""
-{
-  "entities": [
-    {
-      "id": 1,
-      "subquote_string": [
-        "This Agreement is made and entered into on 2020-01-01 by and between Company A (\"the Client\") and Company B (\"the Service Provider\")."
-      ],
-      "entity_title": "Agreement between Company A and Company B",
-      "properties": [
-        {
-          "key": "Date",
-          "value": "2020-01-01",
-          "resolved_absolute_value": "2020-01-01"
-        },
-        {
-          "key": "Party 1",
-          "value": "Company A",
-          "resolved_absolute_value": "Company A"
-        },
-        {
-          "key": "Party 2",
-          "value": "Company B",
-          "resolved_absolute_value": "Company B"
-        }
-      ],
-      "dependencies": []
-    },
-    {
-      "id": 2,
-      "subquote_string": [
-        "The Service Provider will deliver the software product to the Client 30 days after the agreement date."
-      ],
-      "entity_title": "Scope of Work",
-      "properties": [
-        {
-          "key": "Delivery Date",
-          "value": "30 days after the agreement date",
-          "resolved_absolute_value": "2020-01-31"
-        }
-      ],
-      "dependencies": [
-        1
-      ]
-    },
-    {
-      "id": 3,
-      "subquote_string": [
-        "The total payment for the service is $50,000.",
-        "An initial payment of $10,000 will be made within 7 days of the the signed date.",
-        "The final payment will be due 45 days after [SignDate]."
-      ],
-      "entity_title": "Payment Terms",
-      "properties": [
-        {
-          "key": "Total Payment",
-          "value": "$50,000",
-          "resolved_absolute_value": "50000"
-        },
-        {
-          "key": "Initial Payment",
-          "value": "$10,000",
-          "resolved_absolute_value": "10000"
-        },
-        {
-          "key": "Final Payment Due Date",
-          "value": "45 days after [SignDate]",
-          "resolved_absolute_value": "2020-02-15"
-        }
-      ],
-      "dependencies": [
-        1
-      ]
-    },
-    {
-      "id": 4,
-      "subquote_string": [
-        "The parties agree not to disclose any confidential information received from the other party for 3 months after the final payment date."
-      ],
-      "entity_title": "Confidentiality Terms",
-      "properties": [
-        {
-          "key": "Confidentiality Duration",
-          "value": "3 months after the final payment date",
-          "resolved_absolute_value": "2020-05-15"
-        }
-      ],
-      "dependencies": [
-        3
-      ]
-    },
-    {
-      "id": 5,
-      "subquote_string": [
-        "The contract can be terminated with a 30-day notice, unless there are outstanding obligations that must be fulfilled after the [DeliveryDate]."
-      ],
-      "entity_title": "Termination",
-      "properties": [
-        {
-          "key": "Termination Notice",
-          "value": "30-day",
-          "resolved_absolute_value": "30 days"
-        }
-      ],
-      "dependencies": [
-        2
-      ]
-    }
-  ]
-}
-"""
+generate_graph(model)
@@ -140,9 +140,11 @@ def wrap_chatcompletion(func: Callable) -> Callable:
    return wrapper_function


-def process_response(response, response_model):
+def process_response(response, response_model, validation_context=None):
    if response_model is not None:
-        model = response_model.from_response(response)
+        model = response_model.from_response(
+            response, validation_context=validation_context
+        )
        model._raw_response = response
        return model
    return response
@@ -61,18 +61,20 @@ nav:
        - Multiple Extractions: "multitask.md"
        - Handling Missing Content: "maybe.md"
      - Philosophy: 'philosophy.md'
-  - Use Cases:
+  - Cookbook:
    - Overview: 'examples/index.md'
    - Text Classification Techniques: 'examples/classification.md'
    - AI Self-Assessment: 'examples/self_critique.md'
    - Citation Retrieval via Regex: 'examples/exact_citations.md'
+    - Knowledge Graph Generation: 'examples/knowledge_graph.md'
+    - Entity Resolution: 'examples/entity_resolution.md'
    - Search Query Segmentation: 'examples/search.md'
    - Query Decomposition in One Go: 'examples/planning-tasks.md'
    - Working with Recursive Schemas: 'examples/recursive.md'
    - Table Extraction from Text: 'examples/autodataframe.md'
    - Action Item and Dependency Mapping: 'examples/action_items.md'
    - Multi-File Code Generation: 'examples/gpt-engineer.md'
-    - PII Data Sanitization: 'examples/pii.md'
+    - PII Data Sanitization: 'examples/pii.md'
  - CLI Reference:
      - "Introduction": "cli/index.md"
      - "Usage Tracking": "cli/usage.md"