# instructor/examples/partial_streaming/benchmark.py

# Part of this code is adapted from the following examples from OpenAI Cookbook:
# https://cookbook.openai.com/examples/how_to_stream_completions
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
import time
import tiktoken
import instructor
from openai import OpenAI
from pydantic import BaseModel

# Shared module-level OpenAI client, patched by instructor in MD_JSON mode.
# Both benchmarks below issue their requests through it; the patched client is
# what is given the `response_model` keyword in benchmark_partial_streaming.
client = instructor.patch(OpenAI(), mode=instructor.Mode.MD_JSON)


def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count the tokens of ``string`` under the tokenizer used by ``model_name``.

    The tokenizer is looked up with ``tiktoken.encoding_for_model``. A fixed
    overhead of 3 tokens is added to the encoded length, since every reply is
    primed with <|start|>assistant<|message|>.
    """
    reply_priming_tokens = 3  # <|start|>assistant<|message|>
    tokenizer = tiktoken.encoding_for_model(model_name)
    return len(tokenizer.encode(string)) + reply_priming_tokens


# Target schema for both benchmarks: the model is asked for a character with a
# name, a role and an age. benchmark_raw_stream embeds User.model_json_schema()
# in its system prompt and validates the streamed text against this class.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- a docstring would presumably be emitted into the JSON schema (and
# therefore into the prompt); confirm before adding one.
class User(BaseModel):
    name: str
    role: str
    age: int


# Partial variant of User; passed as `response_model` together with
# `stream=True` in benchmark_partial_streaming.
PartialUser = instructor.Partial[User]


def benchmark_raw_stream(model="gpt-4"):
    """Measure output throughput of a plain streamed completion.

    The model is instructed, via a system prompt that embeds the JSON schema
    of ``User``, to answer with JSON only. All streamed text deltas are
    concatenated, the result is validated against ``User``, and the token
    count of that text is divided by the elapsed time.

    Args:
        model: Name of the chat model to query. It is also passed to
            ``num_tokens_from_string`` to select the tokenizer.

    Returns:
        Output tokens per second. The timed interval starts just before the
        request is issued and ends after the streamed text has been validated.

    Raises:
        Whatever ``User.model_validate_json`` raises when the streamed text
        does not validate against ``User``; errors from the client call
        propagate unchanged.
    """
    content = f"""Respond only in JSON that would validate to this schema and include nothing extra.
    Otherwise something bad will happen:\n {User.model_json_schema()}"""

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    extraction_stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": content},
            {
                "role": "user",
                "content": "give me a harry pottery character in json, name, role, age",
            },
        ],
        stream=True,
    )

    # Some chunks carry no text (delta.content is None); skip those and join
    # the rest into the complete response in a single pass over the stream.
    deltas = (chunk.choices[0].delta.content for chunk in extraction_stream)
    collected_text = "".join(delta for delta in deltas if delta is not None)

    # Validation is deliberately inside the timed region: the measurement ends
    # once a validated User could be handed to the caller.
    User.model_validate_json(collected_text)
    elapsed = time.perf_counter() - start_time

    output_tokens = num_tokens_from_string(collected_text, model)
    tokens_per_sec = output_tokens / elapsed
    return tokens_per_sec


def benchmark_partial_streaming(model="gpt-4"):
    """Measure output throughput when streaming with ``response_model=PartialUser``.

    The stream is consumed to the end and only the last yielded object is
    kept. That object is serialized with ``model_dump_json()`` and the token
    count of the serialization is divided by the elapsed time.

    Note that the token count is taken from the re-serialized final object,
    not from the raw text the model emitted, so it is an approximation of the
    number of tokens actually generated.

    Args:
        model: Name of the chat model to query. It is also passed to
            ``num_tokens_from_string`` to select the tokenizer.

    Returns:
        Output tokens per second. The timed interval starts just before the
        request is issued and ends when the stream is exhausted.

    Raises:
        RuntimeError: If the stream yields no objects at all, in which case
            there is nothing to measure.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    extraction_stream = client.chat.completions.create(
        model=model,
        response_model=PartialUser,
        messages=[
            {
                "role": "user",
                "content": "give me a harry pottery character in json, name, role, age",
            }
        ],
        stream=True,
    )

    # Drain the stream; after the loop `final_user` holds the last object
    # yielded, or stays None if the stream was empty.
    final_user = None
    for final_user in extraction_stream:  # noqa: B007
        pass
    elapsed = time.perf_counter() - start_time

    if final_user is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise RuntimeError("partial stream yielded no objects; nothing to benchmark")

    output_tokens = num_tokens_from_string(final_user.model_dump_json(), model)
    tokens_per_sec = output_tokens / elapsed
    return tokens_per_sec


if __name__ == "__main__":
    # Each benchmark is repeated `runs` times and its tokens/sec results are
    # averaged. The partial-streaming runs are issued first, then the raw ones.
    # NOTE(review): the two benchmarks are run against different model names
    # ("gpt-3.5-turbo-1106" vs "gpt-3.5-turbo"); confirm this is intentional,
    # since it affects how comparable the two averages are.
    runs = 10

    partial_rates = [
        benchmark_partial_streaming(model="gpt-3.5-turbo-1106") for _ in range(runs)
    ]
    raw_rates = [benchmark_raw_stream(model="gpt-3.5-turbo") for _ in range(runs)]

    mean_partial_rate = sum(partial_rates) / len(partial_rates)
    mean_raw_rate = sum(raw_rates) / len(raw_rates)
    speedup = mean_partial_rate / mean_raw_rate

    print(f"Raw streaming: {mean_raw_rate:.2f} tokens/sec")

    print(f"Partial streaming: {mean_partial_rate:.2f} token/sec")
    print(f"Relative speedup: {speedup:.2f}x")

    # Sample output recorded from an earlier run (kept as a no-op string).
    """
    Raw streaming: 22.36 tokens/sec
    Partial streaming: 15.46 token/sec
    Relative speedup: 0.69x
    """