# Part of this code is adapted from the following examples from OpenAI Cookbook: # https://cookbook.openai.com/examples/how_to_stream_completions # https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb import time import tiktoken import instructor from openai import OpenAI from pydantic import BaseModel client = instructor.patch(OpenAI(), mode=instructor.Mode.MD_JSON) def num_tokens_from_string(string: str, model_name: str) -> int: """Returns the number of tokens in a text string.""" encoding = tiktoken.encoding_for_model(model_name) num_tokens = len(encoding.encode(string)) num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> return num_tokens class User(BaseModel): name: str role: str age: int PartialUser = instructor.Partial[User] def benchmark_raw_stream(model="gpt-4"): content = f"""Respond only in JSON that would validate to this schema and include nothing extra. Otherwise something bad will happen:\n {User.model_json_schema()}""" start_time = time.time() extraction_stream = client.chat.completions.create( model=model, messages=[ {"role": "system", "content": content}, { "role": "user", "content": "give me a harry pottery character in json, name, role, age", }, ], stream=True, ) collected_messages = [chunk.choices[0].delta.content for chunk in extraction_stream] collected_messages = [m for m in collected_messages if m is not None] collected_messages = "".join(collected_messages) User.model_validate_json(collected_messages) end_time = time.time() - start_time output_tokens = num_tokens_from_string(collected_messages, model) char_per_sec = output_tokens / end_time return char_per_sec def benchmark_partial_streaming(model="gpt-4"): start_time = time.time() extraction_stream = client.chat.completions.create( model=model, response_model=PartialUser, messages=[ { "role": "user", "content": "give me a harry pottery character in json, name, role, age", } ], stream=True, ) for chunk in extraction_stream: # noqa: B007 pass end_time = time.time() - start_time output_tokens = num_tokens_from_string(chunk.model_dump_json(), model) char_per_sec = output_tokens / end_time return char_per_sec if __name__ == "__main__": partial_times = [ benchmark_partial_streaming(model="gpt-3.5-turbo-1106") for _ in range(10) ] avg_partial_time = sum(partial_times) / len(partial_times) raw_times = [benchmark_raw_stream(model="gpt-3.5-turbo") for _ in range(10)] avg_raw_time = sum(raw_times) / len(raw_times) print(f"Raw streaming: {avg_raw_time:.2f} tokens/sec") print(f"Partial streaming: {avg_partial_time:.2f} token/sec") print(f"Relative speedup: {avg_partial_time / avg_raw_time:.2f}x") """ Raw streaming: 22.36 tokens/sec Partial streaming: 15.46 token/sec Relative speedup: 0.69x """