mirror of
https://github.com/kennethreitz/instructor.git
synced 2026-06-05 22:50:18 +00:00
101 lines
3.1 KiB
Python
101 lines
3.1 KiB
Python
# Part of this code is adapted from the following examples from OpenAI Cookbook:
|
|
# https://cookbook.openai.com/examples/how_to_stream_completions
|
|
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
|
|
import time
|
|
import tiktoken
|
|
import instructor
|
|
from openai import OpenAI
|
|
from pydantic import BaseModel
|
|
|
|
# Shared API client for both benchmarks: an OpenAI client wrapped by
# instructor.patch, with instructor's MD_JSON mode selected. The patched
# client is what benchmark_partial_streaming passes `response_model=` to.
client = instructor.patch(
    OpenAI(),
    mode=instructor.Mode.MD_JSON,
)
|
|
|
|
|
|
def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count the tokens of ``string`` under the tokenizer for ``model_name``.

    The count is the length of the tiktoken encoding of ``string`` plus a
    fixed allowance of 3 tokens for reply priming.
    """
    tokenizer = tiktoken.encoding_for_model(model_name)
    # every reply is primed with <|start|>assistant<|message|>
    reply_priming_tokens = 3
    return len(tokenizer.encode(string)) + reply_priming_tokens
|
|
|
|
|
|
class User(BaseModel):
    # Schema the benchmarks ask the model to produce: its JSON schema is
    # embedded in the raw-stream prompt and the raw response is validated
    # against it.
    #
    # NOTE: documented with comments rather than a docstring because pydantic
    # emits a model's docstring as "description" in model_json_schema(), which
    # would alter the prompt text built from this schema.
    name: str
    role: str
    age: int
|
|
|
|
|
|
# Partial form of User; passed as `response_model` for the streamed request in
# benchmark_partial_streaming, whose last yielded item is serialized with
# model_dump_json() to count output tokens.
PartialUser = instructor.Partial[User]
|
|
|
|
|
|
def benchmark_raw_stream(model="gpt-4"):
    """Measure throughput of a plain streamed completion, in tokens per second.

    Sends a system prompt embedding the ``User`` JSON schema, concatenates the
    streamed text deltas, validates the result as a ``User``, and divides the
    token count of that text by the elapsed time. The timed window runs from
    just before the request is issued until validation has finished.

    Args:
        model: Model name sent to the API; also selects the tokenizer used to
            count output tokens.

    Returns:
        Output tokens per second as a float.

    Raises:
        pydantic.ValidationError: If the streamed text does not validate as a
            ``User``.
    """
    # NOTE(review): the reviewed copy of this file had lost its indentation, so
    # any leading spaces the second prompt line originally carried cannot be
    # seen; the text below reproduces the prompt exactly as it was visible.
    content = (
        "Respond only in JSON that would validate to this schema and include nothing extra.\n"
        f"Otherwise something bad will happen:\n {User.model_json_schema()}"
    )

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    extraction_stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": content},
            {
                "role": "user",
                "content": "give me a harry pottery character in json, name, role, age",
            },
        ],
        stream=True,
    )

    # Deltas without text carry content=None and are skipped.
    deltas = (chunk.choices[0].delta.content for chunk in extraction_stream)
    response_text = "".join(delta for delta in deltas if delta is not None)

    # Validation is deliberately inside the timed window, as in the original
    # measurement.
    User.model_validate_json(response_text)
    elapsed = time.perf_counter() - start

    output_tokens = num_tokens_from_string(response_text, model)
    tokens_per_sec = output_tokens / elapsed
    return tokens_per_sec
|
|
|
|
|
|
def benchmark_partial_streaming(model="gpt-4"):
    """Measure throughput of instructor partial streaming, in tokens per second.

    Requests a streamed ``PartialUser``, drains the stream, and divides the
    token count of the last yielded object's JSON dump by the elapsed time.
    The timed window runs from just before the request is issued until the
    stream is exhausted.

    Args:
        model: Model name sent to the API; also selects the tokenizer used to
            count output tokens.

    Returns:
        Output tokens per second as a float.

    Raises:
        RuntimeError: If the stream yields no objects, so there is nothing to
            measure.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    extraction_stream = client.chat.completions.create(
        model=model,
        response_model=PartialUser,
        messages=[
            {
                "role": "user",
                "content": "give me a harry pottery character in json, name, role, age",
            }
        ],
        stream=True,
    )

    # Drain the stream, keeping only the most recent object. Tracking it
    # explicitly avoids reading an unbound loop variable when the stream is
    # empty.
    last_partial = None
    for partial in extraction_stream:
        last_partial = partial
    elapsed = time.perf_counter() - start

    if last_partial is None:
        raise RuntimeError("partial stream yielded no objects; nothing to benchmark")

    output_tokens = num_tokens_from_string(last_partial.model_dump_json(), model)
    tokens_per_sec = output_tokens / elapsed
    return tokens_per_sec
|
|
|
|
|
|
if __name__ == "__main__":
    # Number of repetitions averaged for each benchmark.
    runs = 10

    # NOTE(review): the two benchmarks are run against different model names
    # ("gpt-3.5-turbo-1106" vs "gpt-3.5-turbo"), so the relative figure below
    # mixes model differences with streaming-mode differences. Confirm whether
    # that is intended before relying on the ratio.
    partial_rates = [
        benchmark_partial_streaming(model="gpt-3.5-turbo-1106") for _ in range(runs)
    ]
    avg_partial_rate = sum(partial_rates) / len(partial_rates)

    raw_rates = [benchmark_raw_stream(model="gpt-3.5-turbo") for _ in range(runs)]
    avg_raw_rate = sum(raw_rates) / len(raw_rates)

    # Both benchmark functions return tokens per second, so both lines use the
    # same unit label.
    print(f"Raw streaming: {avg_raw_rate:.2f} tokens/sec")
    print(f"Partial streaming: {avg_partial_rate:.2f} tokens/sec")
    print(f"Relative speedup: {avg_partial_rate / avg_raw_rate:.2f}x")

    # Sample output from a recorded run:
    #
    #   Raw streaming: 22.36 tokens/sec
    #   Partial streaming: 15.46 tokens/sec
    #   Relative speedup: 0.69x
|