From 8383795b0e5183b4c03b080f59dd182b4e074e21 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 7 Oct 2023 14:43:19 -0400 Subject: [PATCH 01/19] distil --- instructor/__init__.py | 4 + instructor/distil.py | 212 +++++++++++++++++++++++++++++++++++++++++ tests/test_distil.py | 69 ++++++++++++++ 3 files changed, 285 insertions(+) create mode 100644 instructor/distil.py create mode 100644 tests/test_distil.py diff --git a/instructor/__init__.py b/instructor/__init__.py index 6a659e5..fdee478 100644 --- a/instructor/__init__.py +++ b/instructor/__init__.py @@ -1,4 +1,5 @@ from .function_calls import OpenAISchema, openai_function, openai_schema +from .distil import FinetuneFormat, distil, track from .dsl import MultiTask, Maybe, llm_validator, CitationMixin from .patch import patch @@ -11,4 +12,7 @@ __all__ = [ "openai_schema", "patch", "llm_validator", + "FinetuneFormat", + "distil", + "track", ] diff --git a/instructor/distil.py b/instructor/distil.py new file mode 100644 index 0000000..1160e09 --- /dev/null +++ b/instructor/distil.py @@ -0,0 +1,212 @@ +import enum +import functools +import inspect +import json + +from typing import Any, Callable, Optional +from pydantic import BaseModel, validate_call + +import inspect +import logging + +from instructor import openai_schema + +distil_logger = logging.getLogger("instructor.distil") + + +def logging(level=logging.INFO, handler=None, log_to_file=True, filename_prefix=None): + """ + Configure the instructor module's logging. + + :param level: Log level. + :param handler: Optional logging handler. If not provided, defaults to FileHandler or NullHandler based on log_to_file. + :param log_to_file: If True and no handler is provided, logs to a file. + :param filename: Optional filename for logging if log_to_file is True. Defaults to 'instructor.log'. 
+ """ + distil_logger.setLevel(level) + + # Clear existing handlers + for h in distil_logger.handlers[:]: + distil_logger.removeHandler(h) + + if handler: + distil_logger.addHandler(handler) + elif log_to_file: + filename = filename_prefix or "instructor.log" + file_handler = logging.FileHandler(filename) + file_handler.setFormatter(logging.Formatter("%(message)s")) + distil_logger.addHandler(file_handler) + else: + distil_logger.addHandler(logging.NullHandler()) + + +class FinetuneFormat(enum.Enum): + MESSAGES: str = "messages" + RAW: str = "raw" + + +def get_signature_from_fn(fn: Callable) -> str: + """ + Get the function signature as a string. + + :Example: + + >>> def my_function(a: int, b: int) -> int: + >>> return a + b + >>> + >>> get_signature_from_fn(my_function) + "def my_function(a: int, b: int) -> int" + + :param fn: Function to get the signature for. + :return: Function signature as a string. + """ + sig = inspect.signature(fn) + lines = f"def {fn.__name__}{sig}" + docstring = inspect.getdoc(fn) + if docstring: + formatted_docstring = f'"""\n{docstring}\n"""' + else: + formatted_docstring = "" + return f"{lines}\n{formatted_docstring}" + + +@functools.lru_cache() +def format_function(func: Callable) -> str: + """ + Format a function as a string with docstring and body. + """ + source_lines = inspect.getsourcelines(func) + definition = " ".join(source_lines[0]).strip() + + docstring = inspect.getdoc(func) + if docstring: + formatted_docstring = f'"""\n{docstring}\n"""' + else: + formatted_docstring = "" + + body = inspect.getsource(func) + body = body.replace(f"def {func.__name__}", "") + + return f"{definition}\n{formatted_docstring}\n{body}" + + +def is_return_type_base_model_or_instance(func: Callable[..., Any]) -> bool: + """ + Check if the return type of a function is a pydantic BaseModel or an instance of it. + + :param func: Function to check. + :return: True if the return type is a pydantic BaseModel or an instance of it. 
+ """ + return_type = inspect.signature(func).return_annotation + return inspect.isclass(return_type) and issubclass(return_type, BaseModel) + + +@validate_call +def track( + fn: Callable[..., Any], + args: tuple, + kwargs: dict, + resp: BaseModel, + name: Optional[str] = None, + finetune_format: FinetuneFormat = FinetuneFormat.RAW, +): + """ + Track the function call and response in a log file, later used for finetuning. + + :param fn: Function to track. + :param args: Arguments passed to the function. + :param kwargs: Keyword arguments passed to the function. + :param resp: Response returned by the function. + :param name: Name of the function to track. Defaults to the function name. + :param finetune_format: Format to use for finetuning. Defaults to "raw". + """ + name = name if name else fn.__name__ + base_model: BaseModel = type(resp) + + if finetune_format == FinetuneFormat.RAW: + function_body = dict( + fn_name=name, + fn_repr=format_function(fn), + args=args, + kwargs=kwargs, + resp=resp.model_dump(), + schema=base_model.model_json_schema(), + ) + distil_logger.info(json.dumps(function_body)) + return + + if finetune_format == FinetuneFormat.MESSAGES: + # This is the format that OpenAI's API expects for a finetune call + openai_function_call = openai_schema(base_model).openai_schema + function_definition = get_signature_from_fn(fn) + function_body = { + "messages": [ + { + "role": "system", + "content": f"Return the response from the function call.\n\n {function_definition}", + }, + { + "role": "user", + "content": f"Return the results of the function with the following arguments:\n\n {name}(*{args}, **{kwargs})", + }, + { + "role": "function", + "function_call": { + "name": openai_function_call["name"], + "augments": resp.model_dump(), + }, + }, + ], + "functions": [openai_function_call], + "function_call": {"name": name}, + } + distil_logger.info(json.dumps(function_body)) + return + raise ValueError(f"Invalid finetune format: {finetune_format}") + + +def 
distil( + *args, + name: str = None, + mode: str = "distil", + fine_tune_format: FinetuneFormat = FinetuneFormat.RAW, +): + """ + Decorator to track the function call and response, supports distillation and dispatch modes. + + If used without arguments, it must be used as a decorator. + + :Example: + + >>> @distil + >>> def my_function() -> MyModel: + >>> return MyModel() + >>> + >>> @distil(name="my_function") + >>> def my_function() -> MyModel: + >>> return MyModel() + + :param fn: Function to track. + :param name: Name of the function to track. Defaults to the function name. + :param mode: Mode to use for distillation. Defaults to "distil". + """ + allowed_modes = {"distil", "dispatch"} + assert mode in allowed_modes, f"Must be in {allowed_modes}" + assert mode == "distil", "Only distil mode is supported at the moment." + + def _wrap_distil(fn): + msg = f"Return type hint for {fn} must subclass `pydantic.BaseModel'" + assert is_return_type_base_model_or_instance(fn), msg + + @functools.wraps(fn) + def _distil(*args, **kwargs): + resp = fn(*args, **kwargs) + track(fn, args, kwargs, resp, name=name, finetune_format=fine_tune_format) + return resp + + return _distil + + if len(args) == 1 and callable(args[0]): + return _wrap_distil(args[0]) + + return _wrap_distil diff --git a/tests/test_distil.py b/tests/test_distil.py new file mode 100644 index 0000000..c07bd62 --- /dev/null +++ b/tests/test_distil.py @@ -0,0 +1,69 @@ +from pydantic import BaseModel +from instructor.distil import ( + distil, + format_function, + get_signature_from_fn, + is_return_type_base_model_or_instance, +) + +# Replace `your_module_name` with your actual module name + + +class SimpleModel(BaseModel): + data: int + + +def test_is_return_type_base_model_or_instance(): + def valid_function() -> SimpleModel: + return SimpleModel(data=1) + + def invalid_function() -> int: + return 1 + + assert is_return_type_base_model_or_instance(valid_function) + assert not 
is_return_type_base_model_or_instance(invalid_function) + + +def test_get_signature_from_fn(): + def test_function(a: int, b: str) -> float: + """Sample docstring""" + pass + + result = get_signature_from_fn(test_function) + expected = "def test_function(a: int, b: str) -> float" + assert expected in result + assert "Sample docstring" in result + + +def test_format_function(): + def sample_function(x: int) -> SimpleModel: + """This is a docstring.""" + return SimpleModel(data=x) + + formatted = format_function(sample_function) + assert "def sample_function(x: int) -> SimpleModel:" in formatted + assert '"""This is a docstring."""' in formatted + assert "return SimpleModel(data=x)" in formatted + + +def test_distil_decorator_without_arguments(): + @distil + def test_func(x: int) -> SimpleModel: + return SimpleModel(data=x) + + result = test_func(42) + assert result.data == 42 + + +def test_distil_decorator_with_name_argument(): + @distil(name="custom_name") + def another_test_func(x: int) -> SimpleModel: + return SimpleModel(data=x) + + result = another_test_func(55) + assert result.data == 55 + + +# Mock track function for decorator tests +def mock_track(*args, **kwargs): + pass From 79d872dd91ef67a312bfb838a5ed830662ea9c00 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 7 Oct 2023 17:52:12 -0400 Subject: [PATCH 02/19] tests and exmaple --- examples/distilations/user_code_global.py | 70 ++++++ instructor/__init__.py | 5 +- instructor/distil.py | 261 ++++++++++------------ tests/test_distil.py | 20 +- 4 files changed, 213 insertions(+), 143 deletions(-) create mode 100644 examples/distilations/user_code_global.py diff --git a/examples/distilations/user_code_global.py b/examples/distilations/user_code_global.py new file mode 100644 index 0000000..0b433bb --- /dev/null +++ b/examples/distilations/user_code_global.py @@ -0,0 +1,70 @@ +import datetime +import json +import os +import uuid +import requests +import logging + +from pydantic import BaseModel + +from 
instructor.distil import Instructions + + +class DatasetHandler(logging.Handler): + def __init__(self, dataset_name=None): + super().__init__() + self.url = os.environ.get("INSTRUCTOR_URL") + self.api_key = os.environ.get("INSTRUCTOR_KEY") + self.formatter = logging.Formatter("%(message)s") + self.dataset_name = dataset_name + self.uuid = str(uuid.uuid4()) + + def emit(self, record: logging.LogRecord) -> None: + log_entry = json.loads(self.format(record)) + + new_entry = { + "record": log_entry, + "dataset_name": self.dataset_name, + "batch_id": self.uuid, + "created_at": datetime.datetime.now().isoformat(), + } + + try: + response = requests.post(self.url, data=json.dumps(new_entry)) + if response.status_code != 200: + self.handleError(record) + except Exception: + self.handleError(record) + + +logging.basicConfig(level=logging.INFO) + +# Usage +instructions = Instructions( + name="test_distil", + log_handlers=[ + logging.FileHandler("finetunes.jsonl"), + DatasetHandler("finetunes_fo_test_distil"), + ], +) + + +class Response(BaseModel): + a: int + b: int + result: int + + +@instructions.distil +def fn(a: int, b: int) -> Response: + resp = a + b + return Response(a=a, b=b, result=resp) + + +if __name__ == "__main__": + import random + + for _ in range(10): + a = random.randint(100, 999) + b = random.randint(100, 999) + print("returning", fn(a, b)) diff --git a/instructor/__init__.py b/instructor/__init__.py index fdee478..1764c56 100644 --- a/instructor/__init__.py +++ b/instructor/__init__.py @@ -1,5 +1,5 @@ from .function_calls import OpenAISchema, openai_function, openai_schema -from .distil import FinetuneFormat, distil, track +from .distil import FinetuneFormat, Instructions from .dsl import MultiTask, Maybe, llm_validator, CitationMixin from .patch import patch @@ -13,6 +13,5 @@ __all__ = [ "patch", "llm_validator", "FinetuneFormat", - "distil", - "track", + "Instructions", ] diff --git a/instructor/distil.py b/instructor/distil.py index 1160e09..7b9c4be 
100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -2,43 +2,15 @@ import enum import functools import inspect import json +import logging +import os -from typing import Any, Callable, Optional +from typing import Any, Callable, List, Optional +import uuid from pydantic import BaseModel, validate_call -import inspect -import logging - from instructor import openai_schema -distil_logger = logging.getLogger("instructor.distil") - - -def logging(level=logging.INFO, handler=None, log_to_file=True, filename_prefix=None): - """ - Configure the instructor module's logging. - - :param level: Log level. - :param handler: Optional logging handler. If not provided, defaults to FileHandler or NullHandler based on log_to_file. - :param log_to_file: If True and no handler is provided, logs to a file. - :param filename: Optional filename for logging if log_to_file is True. Defaults to 'instructor.log'. - """ - distil_logger.setLevel(level) - - # Clear existing handlers - for h in distil_logger.handlers[:]: - distil_logger.removeHandler(h) - - if handler: - distil_logger.addHandler(handler) - elif log_to_file: - filename = filename_prefix or "instructor.log" - file_handler = logging.FileHandler(filename) - file_handler.setFormatter(logging.Formatter("%(message)s")) - distil_logger.addHandler(file_handler) - else: - distil_logger.addHandler(logging.NullHandler()) - class FinetuneFormat(enum.Enum): MESSAGES: str = "messages" @@ -101,112 +73,127 @@ def is_return_type_base_model_or_instance(func: Callable[..., Any]) -> bool: return inspect.isclass(return_type) and issubclass(return_type, BaseModel) -@validate_call -def track( - fn: Callable[..., Any], - args: tuple, - kwargs: dict, - resp: BaseModel, - name: Optional[str] = None, - finetune_format: FinetuneFormat = FinetuneFormat.RAW, -): - """ - Track the function call and response in a log file, later used for finetuning. 
+class Instructions: + def __init__( + self, + name: str = None, + id: str = None, + log_handlers: List[logging.Handler] = None, + ): + self.name = name + self.id = id or str(uuid.uuid4()) + self.unique_id = str(uuid.uuid4()) - :param fn: Function to track. - :param args: Arguments passed to the function. - :param kwargs: Keyword arguments passed to the function. - :param resp: Response returned by the function. - :param name: Name of the function to track. Defaults to the function name. - :param finetune_format: Format to use for finetuning. Defaults to "raw". - """ - name = name if name else fn.__name__ - base_model: BaseModel = type(resp) + self.logger = logging.getLogger(self.name) + for handler in log_handlers or []: + self.logger.addHandler(handler) - if finetune_format == FinetuneFormat.RAW: - function_body = dict( - fn_name=name, - fn_repr=format_function(fn), - args=args, - kwargs=kwargs, - resp=resp.model_dump(), - schema=base_model.model_json_schema(), - ) - distil_logger.info(json.dumps(function_body)) - return + def distil( + self, + *args, + name: str = None, + mode: str = "distil", + fine_tune_format: FinetuneFormat = FinetuneFormat.MESSAGES, + ): + """ + Decorator to track the function call and response, supports distillation and dispatch modes. - if finetune_format == FinetuneFormat.MESSAGES: - # This is the format that OpenAI's API expects for a finetune call - openai_function_call = openai_schema(base_model).openai_schema - function_definition = get_signature_from_fn(fn) - function_body = { - "messages": [ - { - "role": "system", - "content": f"Return the response from the function call.\n\n {function_definition}", - }, - { - "role": "user", - "content": f"Return the results of the function with the following arguments:\n\n {name}(*{args}, **{kwargs})", - }, - { - "role": "function", - "function_call": { - "name": openai_function_call["name"], - "augments": resp.model_dump(), + If used without arguments, it must be used as a decorator. 
+ + :Example: + + >>> @distil + >>> def my_function() -> MyModel: + >>> return MyModel() + >>> + >>> @distil(name="my_function") + >>> def my_function() -> MyModel: + >>> return MyModel() + + :param fn: Function to track. + :param name: Name of the function to track. Defaults to the function name. + :param mode: Mode to use for distillation. Defaults to "distil". + """ + allowed_modes = {"distil", "dispatch"} + assert mode in allowed_modes, f"Must be in {allowed_modes}" + assert mode == "distil", "Only distil mode is supported at the moment." + + def _wrap_distil(fn): + msg = f"Return type hint for {fn} must subclass `pydantic.BaseModel'" + assert is_return_type_base_model_or_instance(fn), msg + + @functools.wraps(fn) + def _distil(*args, **kwargs): + resp = fn(*args, **kwargs) + self.track( + fn, args, kwargs, resp, name=name, finetune_format=fine_tune_format + ) + + return resp + + return _distil + + if len(args) == 1 and callable(args[0]): + return _wrap_distil(args[0]) + + return _wrap_distil + + @validate_call + def track( + self, + fn: Callable[..., Any], + args: tuple, + kwargs: dict, + resp: BaseModel, + name: Optional[str] = None, + finetune_format: FinetuneFormat = FinetuneFormat.MESSAGES, + ): + """ + Track the function call and response in a log file, later used for finetuning. + + :param fn: Function to track. + :param args: Arguments passed to the function. + :param kwargs: Keyword arguments passed to the function. + :param resp: Response returned by the function. + :param name: Name of the function to track. Defaults to the function name. + :param finetune_format: Format to use for finetuning. Defaults to "raw". 
+ """ + name = name if name else fn.__name__ + base_model: BaseModel = type(resp) + + if finetune_format == FinetuneFormat.RAW: + function_body = dict( + fn_name=name, + fn_repr=format_function(fn), + args=args, + kwargs=kwargs, + resp=resp.model_dump(), + schema=base_model.model_json_schema(), + ) + self.logger.info(json.dumps(function_body)) + + if finetune_format == FinetuneFormat.MESSAGES: + # This is the format that OpenAI's API expects for a finetune call + openai_function_call = openai_schema(base_model).openai_schema + function_definition = get_signature_from_fn(fn).replace(fn.__name__, name) + function_body = { + "messages": [ + { + "role": "system", + "content": f"Return the response from the function call.\n\n {function_definition}", }, - }, - ], - "functions": [openai_function_call], - "function_call": {"name": name}, - } - distil_logger.info(json.dumps(function_body)) - return - raise ValueError(f"Invalid finetune format: {finetune_format}") - - -def distil( - *args, - name: str = None, - mode: str = "distil", - fine_tune_format: FinetuneFormat = FinetuneFormat.RAW, -): - """ - Decorator to track the function call and response, supports distillation and dispatch modes. - - If used without arguments, it must be used as a decorator. - - :Example: - - >>> @distil - >>> def my_function() -> MyModel: - >>> return MyModel() - >>> - >>> @distil(name="my_function") - >>> def my_function() -> MyModel: - >>> return MyModel() - - :param fn: Function to track. - :param name: Name of the function to track. Defaults to the function name. - :param mode: Mode to use for distillation. Defaults to "distil". - """ - allowed_modes = {"distil", "dispatch"} - assert mode in allowed_modes, f"Must be in {allowed_modes}" - assert mode == "distil", "Only distil mode is supported at the moment." 
- - def _wrap_distil(fn): - msg = f"Return type hint for {fn} must subclass `pydantic.BaseModel'" - assert is_return_type_base_model_or_instance(fn), msg - - @functools.wraps(fn) - def _distil(*args, **kwargs): - resp = fn(*args, **kwargs) - track(fn, args, kwargs, resp, name=name, finetune_format=fine_tune_format) - return resp - - return _distil - - if len(args) == 1 and callable(args[0]): - return _wrap_distil(args[0]) - - return _wrap_distil + { + "role": "user", + "content": f"Return the results of the function with the following arguments:\n\n {name}(*{args}, **{kwargs})", + }, + { + "role": "assistant", + "function_call": { + "name": openai_function_call["name"], + "arguments": resp.model_dump_json(), + }, + }, + ], + "functions": [openai_function_call], + } + self.logger.info(json.dumps(function_body)) diff --git a/tests/test_distil.py b/tests/test_distil.py index c07bd62..beed033 100644 --- a/tests/test_distil.py +++ b/tests/test_distil.py @@ -1,6 +1,8 @@ +from pyexpat import model +import openai from pydantic import BaseModel from instructor.distil import ( - distil, + Instructions, format_function, get_signature_from_fn, is_return_type_base_model_or_instance, @@ -8,6 +10,10 @@ from instructor.distil import ( # Replace `your_module_name` with your actual module name +instructions = Instructions( + name="test_distil", +) + class SimpleModel(BaseModel): data: int @@ -47,7 +53,7 @@ def test_format_function(): def test_distil_decorator_without_arguments(): - @distil + @instructions.distil def test_func(x: int) -> SimpleModel: return SimpleModel(data=x) @@ -56,7 +62,7 @@ def test_distil_decorator_without_arguments(): def test_distil_decorator_with_name_argument(): - @distil(name="custom_name") + @instructions.distil(name="custom_name") def another_test_func(x: int) -> SimpleModel: return SimpleModel(data=x) @@ -67,3 +73,11 @@ def test_distil_decorator_with_name_argument(): # Mock track function for decorator tests def mock_track(*args, **kwargs): pass + + +def 
fn(a: int, b: int) -> int: + return openai.ChatCompletion.create( + messages=[], + model="davinci", + response_model=SimpleModel, + ) From e98f777ef5d2a5d205ce6c02bc81232139397f34 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 15:13:05 -0400 Subject: [PATCH 03/19] clean up distil example --- examples/distilations/math_finetunes.jsonl | 10 +++ examples/distilations/three_digit_mul.py | 79 ++++++++++++++++++++++ examples/distilations/user_code_global.py | 70 ------------------- instructor/distil.py | 42 +++++++----- 4 files changed, 115 insertions(+), 86 deletions(-) create mode 100644 examples/distilations/math_finetunes.jsonl create mode 100644 examples/distilations/three_digit_mul.py delete mode 100644 examples/distilations/user_code_global.py diff --git a/examples/distilations/math_finetunes.jsonl b/examples/distilations/math_finetunes.jsonl new file mode 100644 index 0000000..7bb6a2b --- /dev/null +++ b/examples/distilations/math_finetunes.jsonl @@ -0,0 +1,10 @@ +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(396, b=414, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":396,\"b\":414,\"result\":810}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c 
(str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(812, b=108, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":812,\"b\":108,\"result\":920}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(923, b=561, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":923,\"b\":561,\"result\":1484}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(933, b=756, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":933,\"b\":756,\"result\":1689}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": 
"integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(807, b=389, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":807,\"b\":389,\"result\":1196}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(509, b=776, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":509,\"b\":776,\"result\":1285}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", 
"content": "Return fn(165, b=442, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":165,\"b\":442,\"result\":607}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(843, b=304, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":843,\"b\":304,\"result\":1147}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(512, b=791, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":512,\"b\":791,\"result\":1303}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", 
"result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(133, b=539, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":133,\"b\":539,\"result\":672}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} \ No newline at end of file diff --git a/examples/distilations/three_digit_mul.py b/examples/distilations/three_digit_mul.py new file mode 100644 index 0000000..849f3b1 --- /dev/null +++ b/examples/distilations/three_digit_mul.py @@ -0,0 +1,79 @@ +import logging + +from pydantic import BaseModel +from instructor.distil import Instructions + +logging.basicConfig(level=logging.INFO) + +# Usage +instructions = Instructions( + name="three_digit_multiply", + finetune_format="messages", + log_handlers=[ + logging.FileHandler("math_finetunes.jsonl"), + ], +) + + +class Response(BaseModel): + a: int + b: int + result: int + + +@instructions.distil +def fn(a: int, b: int, c: str) -> Response: + """_summary_ + + Args: + a (int): _description_ + b (int): _description_ + c (str): _description_ + + Returns: + Response: _description_ + """ + resp = a + b + return Response(a=a, b=b, result=resp) + + +if __name__ == "__main__": + import random + + # A log will look like this: + log_line = { + "messages": [ + { + "role": "system", + "content": 'Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n"""\n_summary_\n\nArgs:\n a (int): _description_\n 
b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n"""', + }, + {"role": "user", "content": 'Return fn(133, b=539, c="hello")'}, + { + "role": "assistant", + "function_call": { + "name": "Response", + "arguments": '{"a":133,"b":539,"result":672}', + }, + }, + ], + "functions": [ + { + "name": "Response", + "description": "Correctly extracted `Response` with all the required parameters with correct types", + "parameters": { + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + "result": {"type": "integer"}, + }, + "required": ["a", "b", "result"], + "type": "object", + }, + } + ], + } + + for _ in range(10): + a = random.randint(100, 999) + b = random.randint(100, 999) + print("returning", fn(a, b=b, c="hello")) diff --git a/examples/distilations/user_code_global.py b/examples/distilations/user_code_global.py deleted file mode 100644 index 0b433bb..0000000 --- a/examples/distilations/user_code_global.py +++ /dev/null @@ -1,70 +0,0 @@ -import datetime -import json -import os -import uuid -import requests -import logging - -from pydantic import BaseModel - -from instructor.distil import Instructions - - -class DatasetHandler(logging.Handler): - def __init__(self, dataset_name=None): - super().__init__() - self.url = os.environ.get("INSTRUCTOR_URL") - self.api_key = os.environ.get("INSTRUCTOR_KEY") - self.formatter = logging.Formatter("%(message)s") - self.dataset_name = dataset_name - self.uuid = str(uuid.uuid4()) - - def emit(self, record: logging.LogRecord) -> None: - log_entry = json.loads(self.format(record)) - - new_entry = { - "record": log_entry, - "dataset_name": self.dataset_name, - "batch_id": self.uuid, - "created_at": datetime.datetime.now().isoformat(), - } - - try: - response = requests.post(self.url, data=json.dumps(new_entry)) - if response.status_code != 200: - self.handleError(record) - except Exception: - self.handleError(record) - - -logging.basicConfig(level=logging.INFO) - -# Usage 
-instructions = Instructions( - name="test_distil", - log_handlers=[ - logging.FileHandler("finetunes.jsonl"), - DatasetHandler("finetunes_fo_test_distil"), - ], -) - - -class Response(BaseModel): - a: int - b: int - result: int - - -@instructions.distil -def fn(a: int, b: int) -> Response: - resp = a + b - return Response(a=a, b=b, result=resp) - - -if __name__ == "__main__": - import random - - for _ in range(10): - a = random.randint(100, 999) - b = random.randint(100, 999) - print("returning", fn(a, b)) diff --git a/instructor/distil.py b/instructor/distil.py index 7b9c4be..d17f359 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -3,7 +3,6 @@ import functools import inspect import json import logging -import os from typing import Any, Callable, List, Optional import uuid @@ -79,10 +78,12 @@ class Instructions: name: str = None, id: str = None, log_handlers: List[logging.Handler] = None, + finetune_format: FinetuneFormat = FinetuneFormat.MESSAGES, ): self.name = name self.id = id or str(uuid.uuid4()) self.unique_id = str(uuid.uuid4()) + self.finetune_format = finetune_format self.logger = logging.getLogger(self.name) for handler in log_handlers or []: @@ -93,7 +94,7 @@ class Instructions: *args, name: str = None, mode: str = "distil", - fine_tune_format: FinetuneFormat = FinetuneFormat.MESSAGES, + fine_tune_format: FinetuneFormat = None, ): """ Decorator to track the function call and response, supports distillation and dispatch modes. @@ -118,6 +119,9 @@ class Instructions: assert mode in allowed_modes, f"Must be in {allowed_modes}" assert mode == "distil", "Only distil mode is supported at the moment." 
+ if fine_tune_format is None: + fine_tune_format = self.finetune_format + def _wrap_distil(fn): msg = f"Return type hint for {fn} must subclass `pydantic.BaseModel'" assert is_return_type_base_model_or_instance(fn), msg @@ -161,30 +165,25 @@ class Instructions: name = name if name else fn.__name__ base_model: BaseModel = type(resp) - if finetune_format == FinetuneFormat.RAW: - function_body = dict( - fn_name=name, - fn_repr=format_function(fn), - args=args, - kwargs=kwargs, - resp=resp.model_dump(), - schema=base_model.model_json_schema(), - ) - self.logger.info(json.dumps(function_body)) - if finetune_format == FinetuneFormat.MESSAGES: - # This is the format that OpenAI's API expects for a finetune call openai_function_call = openai_schema(base_model).openai_schema function_definition = get_signature_from_fn(fn).replace(fn.__name__, name) + + str_args = ", ".join(map(str, args)) + str_kwargs = ( + ", ".join(f"{k}={json.dumps(v)}" for k, v in kwargs.items()) or None + ) + call_args = ", ".join(filter(None, [str_args, str_kwargs])) + function_body = { "messages": [ { "role": "system", - "content": f"Return the response from the function call.\n\n {function_definition}", + "content": f"Predict the results of this function:\n\n{function_definition}", }, { "role": "user", - "content": f"Return the results of the function with the following arguments:\n\n {name}(*{args}, **{kwargs})", + "content": f"Return {name}({call_args})", }, { "role": "assistant", @@ -197,3 +196,14 @@ class Instructions: "functions": [openai_function_call], } self.logger.info(json.dumps(function_body)) + + if finetune_format == FinetuneFormat.RAW: + function_body = dict( + fn_name=name, + fn_repr=format_function(fn), + args=args, + kwargs=kwargs, + resp=resp.model_dump(), + schema=base_model.model_json_schema(), + ) + self.logger.info(json.dumps(function_body)) From 08b4ca02e1cc3a0848269f54f30a69c02ece7f8e Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 15:38:57 -0400 Subject: 
[PATCH 04/19] clean up --- examples/distilations/three_digit_mul.py | 2 +- instructor/distil.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/distilations/three_digit_mul.py b/examples/distilations/three_digit_mul.py index 849f3b1..1d3f34c 100644 --- a/examples/distilations/three_digit_mul.py +++ b/examples/distilations/three_digit_mul.py @@ -1,7 +1,7 @@ import logging from pydantic import BaseModel -from instructor.distil import Instructions +from instructor import Instructions logging.basicConfig(level=logging.INFO) diff --git a/instructor/distil.py b/instructor/distil.py index d17f359..c69cdb3 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -79,11 +79,13 @@ class Instructions: id: str = None, log_handlers: List[logging.Handler] = None, finetune_format: FinetuneFormat = FinetuneFormat.MESSAGES, + indent: int = 2, ): self.name = name self.id = id or str(uuid.uuid4()) self.unique_id = str(uuid.uuid4()) self.finetune_format = finetune_format + self.indent = indent self.logger = logging.getLogger(self.name) for handler in log_handlers or []: @@ -167,7 +169,7 @@ class Instructions: if finetune_format == FinetuneFormat.MESSAGES: openai_function_call = openai_schema(base_model).openai_schema - function_definition = get_signature_from_fn(fn).replace(fn.__name__, name) + func_def = get_signature_from_fn(fn).replace(fn.__name__, name) str_args = ", ".join(map(str, args)) str_kwargs = ( @@ -179,17 +181,17 @@ class Instructions: "messages": [ { "role": "system", - "content": f"Predict the results of this function:\n\n{function_definition}", + "content": f"Predict the results of this function:\n\n{func_def}", }, { "role": "user", - "content": f"Return {name}({call_args})", + "content": f"Return `{name}({call_args})`", }, { "role": "assistant", "function_call": { "name": openai_function_call["name"], - "arguments": resp.model_dump_json(), + "arguments": resp.model_dump_json(indent=self.indent), }, }, ], From 
2bb47034c20de1e6b85fb28c135279d224ceb4fd Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 16:00:36 -0400 Subject: [PATCH 05/19] add content --- docs/blog/posts/distilation-part1.md | 84 +++++++++++++++++++++++++ docs/distilation.md | 93 ++++++++++++++++++++++++++++ mkdocs.yml | 1 + 3 files changed, 178 insertions(+) create mode 100644 docs/blog/posts/distilation-part1.md create mode 100644 docs/distilation.md diff --git a/docs/blog/posts/distilation-part1.md b/docs/blog/posts/distilation-part1.md new file mode 100644 index 0000000..22508e7 --- /dev/null +++ b/docs/blog/posts/distilation-part1.md @@ -0,0 +1,84 @@ +--- +draft: False +date: 2023-10-17 +tags: + - RAG + - Finetuning +--- + +# Introduction to `Instructions` from `Instructor`, finetuning from Python functions. + +The core philosophy with the `instructor` library is to make language models backwards compatible with existing code. By adding Pydantic in the mix we're able to easily work with LLMs without much worry. + +However, many times, a single function isn't just one LLM call. After the results are returned there's [validation](/docs/validation.md), some additional processing and formatting before you `return` the result. + +But the promise of LLMs is that they can do all of this in one go. So how do we get there? Finetuning end to end is a great tool for enhancing language models. Instructor uses type hints via Pydantic to maintain backward compatibility. Distillation focuses on fine-tuning language models to imitate specific functions. + +## Challenges in Fine-tuning + +Fine-tuning a model isn't as straightforward as just writing `def f(a, b): return a * b` to teach a model three-digit multiplication. Substantial data preparation is required, making logging for data collection cumbersome. Luckily, OpenAI not only provides a fine-tuning script but also one for function calling, which simplifies the process by returning structured outputs!
Moreover, the finetune allows us to avoid passing the schema to the model, resulting in fewer tokens being used! + +## Role of Instructor in Easing the Process + +The feature `from instructor import Instructions` simplifies this. It decorates Python functions that return Pydantic objects, automatically creating a fine-tuning dataset when provided a handler for logging. This allows you to finetune a model to imitate a function's behavior. + +## How to Use Instructor's Distillation Feature + +Here's an example to illustrate its use: + +```python +import logging +from pydantic import BaseModel +from instructor import Instructions + +logging.basicConfig(level=logging.INFO) + +instructions = Instructions( + name="three_digit_multiply", + finetune_format="messages", + log_handlers=[logging.FileHandler("math_finetunes.jsonl")] +) + +class Response(BaseModel): + a: int + b: int + result: int + +@instructions.distil +def fn(a: int, b: int) -> Response: + resp = a + b + return Response(a=a, b=b, result=resp) + +for _ in range(10): + a = random.randint(100, 999) + b = random.randint(100, 999) + print(fn(a, b)) +``` + +## Why Instructor and Distillation are Useful + +1. Streamlines complex functions with validations, making them more efficient. +2. Facilitates the integration of classical machine learning with language models. + +By understanding and leveraging these capabilities, you can create powerful, fine-tuned language models with ease. To learn more about how to use the file to finetune a model, check out the [cli](/docs/cli/finetune.md) + +## Next Steps + +This post is mostly a peek of what I've been working on this week. Once we have a model trained I'd like to be able to dynamically swap the implementation of a function with a model.
This would allow us to do things like: + +```python +from instructor import Instructions + +instructions = Instructions( + name="three_digit_multiply", +) + +@instructions.distil(model='gpt-3.5-turbo:finetuned', swap=True) +def fn(a: int, b: int) -> Response: + resp = a + b + return Response(a=a, b=b, result=resp) +``` + +Now we can swap out the implementation of `fn` with calling the finetuned model, since we know the response type is still `Response` we can use instructor behind the scenes and have it be backwards compatible with the existing code. + +This is a powerful idea, and I'm excited to see where it goes. \ No newline at end of file diff --git a/docs/distilation.md b/docs/distilation.md new file mode 100644 index 0000000..796f6e6 --- /dev/null +++ b/docs/distilation.md @@ -0,0 +1,93 @@ +# Distilling python functions into LLM + +`Instructions` from the `Instructor` library offers a seamless way to make language models backward compatible with existing Python functions. By employing Pydantic type hints, it not only ensures compatibility but also facilitates fine-tuning language models to emulate these functions end-to-end. + +## The Challenges in Function-Level Fine-Tuning + +Unlike simple script-level fine-tuning, replicating the behavior of a Python function in a language model involves intricate data preparation. For instance, teaching a model to execute three-digit multiplication is not as trivial as implementing `def f(a, b): return a * b`. OpenAI's fine-tuning script coupled with their function calling utility provides a structured output, thereby simplifying the data collection process. Additionally, this eliminates the need for passing the schema to the model, thus conserving tokens. + +## The Role of `Instructions` in Simplifying the Fine-Tuning Process + +By using `Instructions`, you can annotate a Python function that returns a Pydantic object, thereby automating the dataset creation for fine-tuning. 
A handler for logging is all that's needed to build this dataset. + +## How to Implement `Instructions` in Your Code + +Here's a step-by-step example: + +```python +import logging +from pydantic import BaseModel +from instructor import Instructions + +logging.basicConfig(level=logging.INFO) + +instructions = Instructions( + name="three_digit_multiply", + finetune_format="messages", + log_handlers=[logging.FileHandler("math_finetunes.jsonl")] +) + +class Response(BaseModel): + a: int + b: int + result: int + +@instructions.distil +def fn(a: int, b: int) -> Response: + resp = a + b + return Response(a=a, b=b, result=resp) +``` + +## Custom Log Handlers for Data Collection + +While the example above uses a file-based log handler, you can easily extend this to custom log handlers for different storage solutions. The following skeleton code illustrates how to create a log handler for an S3 bucket: + +```python +import logging +import boto3 + +class S3LogHandler(logging.Handler): + def __init__(self, bucket, key): + logging.Handler.__init__(self) + self.bucket = bucket + self.key = key + + def emit(self, record): + s3 = boto3.client('s3') + log_entry = self.format(record) + s3.put_object(Body=log_entry, Bucket=self.bucket, Key=self.key) +``` + +You can add this custom log handler to `Instructions` as shown: + +```python +instructions = Instructions( + name="three_digit_multiply", + finetune_format="messages", + log_handlers=[S3LogHandler(bucket='your-bucket', key='your-key')] +) +``` + +## Why `Instructions` is a Game-Changer + +1. It condenses complex, multi-step functions with validations into a single fine-tuned model. +2. It integrates language models with classical machine learning seamlessly. + +## Next Steps and Future Scope + +Going forward, the aim is to dynamically switch between the Python function and its fine-tuned model representation. 
This could look like: + +```python +from instructor import Instructions + +instructions = Instructions( + name="three_digit_multiply", +) + +@instructions.distil(model='gpt-3.5-turbo:finetuned', swap=True) +def fn(a: int, b: int) -> Response: + resp = a + b + return Response(a=a, b=b, result=resp) +``` + +This dynamic switching retains backward compatibility while improving efficiency, opening up exciting avenues for future developments. \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index d386c21..447f7a6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -56,6 +56,7 @@ nav: - Introduction: - Getting Started: 'index.md' - Prompt Engineering Tips: 'tips/index.md' + - Distillation: 'distillation.md' - Helpers: - Reasking and Validation Overview: "reask_validation.md" - Multiple Extractions: "multitask.md" From bc2c6f9fa69b59101ad34d5df4c19583d2df04b5 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 16:04:10 -0400 Subject: [PATCH 06/19] add check --- instructor/distil.py | 3 +++ tests/test_distil.py | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/instructor/distil.py b/instructor/distil.py index c69cdb3..179a9fb 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -69,6 +69,9 @@ def is_return_type_base_model_or_instance(func: Callable[..., Any]) -> bool: :return: True if the return type is a pydantic BaseModel or an instance of it. 
""" return_type = inspect.signature(func).return_annotation + assert ( + return_type != inspect.Signature.empty + ), "Must have a return type hint that is a pydantic BaseModel" return inspect.isclass(return_type) and issubclass(return_type, BaseModel) diff --git a/tests/test_distil.py b/tests/test_distil.py index beed033..474cf1d 100644 --- a/tests/test_distil.py +++ b/tests/test_distil.py @@ -1,4 +1,4 @@ -from pyexpat import model +import pytest import openai from pydantic import BaseModel from instructor.distil import ( @@ -19,6 +19,22 @@ class SimpleModel(BaseModel): data: int +def test_must_have_hint(): + with pytest.raises(AssertionError): + + @instructions.distil + def test_func(x: int): + return SimpleModel(data=x) + + +def test_must_be_base_model(): + with pytest.raises(AssertionError): + + @instructions.distil + def test_func(x) -> int: + return SimpleModel(data=x) + + def test_is_return_type_base_model_or_instance(): def valid_function() -> SimpleModel: return SimpleModel(data=1) From d454ef34e8792925a67e283c3605aecb8b20a94c Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 16:10:10 -0400 Subject: [PATCH 07/19] correct --- docs/blog/posts/distilation-part1.md | 37 ++++++++++++++++++---- examples/distilations/math_finetunes.jsonl | 20 ++++++------ examples/distilations/three_digit_mul.py | 10 +++--- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/docs/blog/posts/distilation-part1.md b/docs/blog/posts/distilation-part1.md index 22508e7..c15b941 100644 --- a/docs/blog/posts/distilation-part1.md +++ b/docs/blog/posts/distilation-part1.md @@ -39,15 +39,15 @@ instructions = Instructions( log_handlers=[logging.FileHandler("math_finetunes.jsonl")] ) -class Response(BaseModel): +class Multiply(BaseModel): a: int b: int result: int @instructions.distil -def fn(a: int, b: int) -> Response: - resp = a + b - return Response(a=a, b=b, result=resp) +def fn(a: int, b: int) -> Multiply: + resp = a * b + return Multiply(a=a, b=b, result=resp) 
for _ in range(10): a = random.randint(100, 999) @@ -55,8 +55,31 @@ for _ in range(10): print(fn(a, b)) ``` +## Logging output + +```python +{ + "messages": [ + {"role": "system", "content": 'Predict the results of this function: ...'}, + {"role": "user", "content": 'Return fn(133, b=539)'}, + {"role": "assistant", + "function_call": + { + "name": "Multiply", + "arguments": '{"a":133,"b":539,"result":89509}' + } + } + ], + "functions": [ + {"name": "Multiply", "description": "Correctly extracted `Multiply`..."} + ] +} +``` + ## Why Instructor and Distillation are Useful +Many systems are not as simple as a single `openai.ChatCompletion.create` call, instead we often create objects, do additional processing, validation, error correction, and then return the result. This is a lot of work, and it's easy to make mistakes. Instructor's `distil` feature makes this process easier by: + 1. Streamlines complex functions with validations, making them more efficient. 2. Facilitates the integration of classical machine learning with language models. @@ -74,11 +97,11 @@ instructions = Instructions( ) @instructions.distil(model='gpt-3.5-turbo:finetuned', swap=True) -def fn(a: int, b: int) -> Response: +def fn(a: int, b: int) -> Multiply: resp = a + b - return Response(a=a, b=b, result=resp) + return Multiply(a=a, b=b, result=resp) ``` -Now we can swap out the implementation of `fn` with calling the finetuned model, since we know the response type is still `Response` we can use instructor behind the scenes and have it be backwards compatible with the existing code. +Now we can swap out the implementation of `fn` with calling the finetuned model, since we know the response type is still `Multiply` we can use instructor behind the scenes and have it be backwards compatible with the existing code. This is a powerful idea, and I'm excited to see where it goes. 
\ No newline at end of file diff --git a/examples/distilations/math_finetunes.jsonl b/examples/distilations/math_finetunes.jsonl index 7bb6a2b..64961a3 100644 --- a/examples/distilations/math_finetunes.jsonl +++ b/examples/distilations/math_finetunes.jsonl @@ -1,10 +1,10 @@ -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(396, b=414, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":396,\"b\":414,\"result\":810}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(812, b=108, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":812,\"b\":108,\"result\":920}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> 
__main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(923, b=561, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":923,\"b\":561,\"result\":1484}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(933, b=756, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":933,\"b\":756,\"result\":1689}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(807, b=389, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":807,\"b\":389,\"result\":1196}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` 
with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(509, b=776, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":509,\"b\":776,\"result\":1285}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(165, b=442, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":165,\"b\":442,\"result\":607}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): 
_description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(843, b=304, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":843,\"b\":304,\"result\":1147}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(512, b=791, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":512,\"b\":791,\"result\":1303}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(133, b=539, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":133,\"b\":539,\"result\":672}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": 
{"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} \ No newline at end of file +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(540, b=677, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 540,\n \"b\": 677,\n \"result\": 1217\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(798, b=534, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 798,\n \"b\": 534,\n \"result\": 1332\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> 
__main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(608, b=669, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 608,\n \"b\": 669,\n \"result\": 1277\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(982, b=768, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 982,\n \"b\": 768,\n \"result\": 1750\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(994, b=682, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", 
"arguments": "{\n \"a\": 994,\n \"b\": 682,\n \"result\": 1676\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(467, b=754, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 467,\n \"b\": 754,\n \"result\": 1221\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(497, b=364, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 497,\n \"b\": 364,\n \"result\": 861\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": 
{"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(840, b=821, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 840,\n \"b\": 821,\n \"result\": 1661\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(646, b=835, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 646,\n \"b\": 835,\n \"result\": 1481\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): 
_description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(926, b=196, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 926,\n \"b\": 196,\n \"result\": 1122\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} diff --git a/examples/distilations/three_digit_mul.py b/examples/distilations/three_digit_mul.py index 1d3f34c..74ccd69 100644 --- a/examples/distilations/three_digit_mul.py +++ b/examples/distilations/three_digit_mul.py @@ -1,6 +1,6 @@ import logging -from pydantic import BaseModel +from pydantic import BaseModel, Field from instructor import Instructions logging.basicConfig(level=logging.INFO) @@ -15,14 +15,14 @@ instructions = Instructions( ) -class Response(BaseModel): +class Multiply(BaseModel): a: int b: int - result: int + result: int = Field(..., description="The result of the multiplication") @instructions.distil -def fn(a: int, b: int, c: str) -> Response: +def fn(a: int, b: int, c: str) -> Multiply: """_summary_ Args: @@ -34,7 +34,7 @@ def fn(a: int, b: int, c: str) -> Response: Response: _description_ """ resp = a + b - return Response(a=a, b=b, result=resp) + return Multiply(a=a, b=b, result=resp) if __name__ == "__main__": From af1f3410f6c64c0ffbd24fd9b454f9369822e5ac Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 16:12:15 -0400 Subject: [PATCH 08/19] typos --- docs/blog/posts/distilation-part1.md | 3 ++- docs/distilation.md | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/blog/posts/distilation-part1.md b/docs/blog/posts/distilation-part1.md 
index c15b941..1b4b102 100644 --- a/docs/blog/posts/distilation-part1.md +++ b/docs/blog/posts/distilation-part1.md @@ -16,7 +16,7 @@ But the promise of LLMs is that they can do all of this in one go. So how do we ## Challenges in Fine-tuning -Fine-tuning a model isn't as straightforward as just writing `def f(a, b): return a * b` to teach a model three-digit multiplication. Substantial data preparation is required, making logging for data collection cumbersome. Luckily OpenAI not only provides a fine-tuning script but also one for function calling which simplies the process backed to structured outputs! More over, the finetune allows us to avoid passing the schema to the model, resulting in less tokens being used! +Fine-tuning a model isn't as straightforward as just writing `def f(a, b): return a * b` to teach a model three-digit multiplication. Substantial data preparation is required, making logging for data collection cumbersome. Luckily OpenAI not only provides a fine-tuning script but also one for function calling which simplifies the process backed by structured outputs! Moreover, the finetune allows us to avoid passing the schema to the model, resulting in fewer tokens being used!
## Role of Instructor in Easing the Process @@ -28,6 +28,7 @@ Here's an example to illustrate its use: ```python import logging +import random from pydantic import BaseModel from instructor import Instructions diff --git a/docs/distilation.md b/docs/distilation.md index 796f6e6..e1c3dbc 100644 --- a/docs/distilation.md +++ b/docs/distilation.md @@ -27,15 +27,15 @@ instructions = Instructions( log_handlers=[logging.FileHandler("math_finetunes.jsonl")] ) -class Response(BaseModel): +class Multiply(BaseModel): a: int b: int result: int @instructions.distil -def fn(a: int, b: int) -> Response: +def fn(a: int, b: int) -> Multiply: resp = a + b - return Response(a=a, b=b, result=resp) + return Multiply(a=a, b=b, result=resp) ``` ## Custom Log Handlers for Data Collection @@ -85,9 +85,9 @@ instructions = Instructions( ) @instructions.distil(model='gpt-3.5-turbo:finetuned', swap=True) -def fn(a: int, b: int) -> Response: +def fn(a: int, b: int) -> Multiply: resp = a + b - return Response(a=a, b=b, result=resp) + return Multiply(a=a, b=b, result=resp) ``` This dynamic switching retains backward compatibility while improving efficiency, opening up exciting avenues for future developments. \ No newline at end of file From a194840dc60ee6ae78075583d95de097b071d76b Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 16:15:05 -0400 Subject: [PATCH 09/19] blog --- docs/blog/posts/distilation-part1.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/blog/posts/distilation-part1.md b/docs/blog/posts/distilation-part1.md index 1b4b102..52f18b9 100644 --- a/docs/blog/posts/distilation-part1.md +++ b/docs/blog/posts/distilation-part1.md @@ -2,8 +2,10 @@ draft: False date: 2023-10-17 tags: - - RAG - - Finetuning + - python + - distilation + - function calling + - finetuning --- # Introduction to `Instructions` from `Instructor`, finetuning from Python functions.
From 6d78cdf419b3359f0aaeda219e436d7b5f53b8d2 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 16:18:13 -0400 Subject: [PATCH 10/19] plug --- docs/blog/posts/distilation-part1.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/blog/posts/distilation-part1.md b/docs/blog/posts/distilation-part1.md index 52f18b9..d2cf0d3 100644 --- a/docs/blog/posts/distilation-part1.md +++ b/docs/blog/posts/distilation-part1.md @@ -107,4 +107,4 @@ def fn(a: int, b: int) -> Multiply: Now we can swap out the implementation of `fn` with calling the finetuned model, since we know the response type is still `Multiply` we can use instructor behind the scenes and have it be backwards compatible with the existing code. -This is a powerful idea, and I'm excited to see where it goes. \ No newline at end of file +Now if you're thinking wow, I'd love a backend service to do this for me continuously, you're in luck! Please check out the survey at [useinstructor.com](https://useinstructor.com) and let us know who you are.
\ No newline at end of file From b2562fcb9a07a1f1f2af36130adac3ca1de66a83 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 15 Oct 2023 14:54:45 -0400 Subject: [PATCH 11/19] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a3969b0..cdfd107 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "instructor" -version = "0.2.8" +version = "0.2.9" description = "Helper functions that allow us to improve openai's function_call ergonomics" authors = ["Jason "] license = "MIT" From 2c85154a433b96275ee06242c5000a98ca241cf6 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 17:41:11 -0400 Subject: [PATCH 12/19] clean up --- instructor/function_calls.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/instructor/function_calls.py b/instructor/function_calls.py index c759aae..6146ec1 100644 --- a/instructor/function_calls.py +++ b/instructor/function_calls.py @@ -27,16 +27,6 @@ from typing import Any, Callable from pydantic import BaseModel, create_model, validate_arguments -def _remove_a_key(d, remove_key) -> None: - """Remove a key from a dictionary recursively""" - if isinstance(d, dict): - for key in list(d.keys()): - if key == remove_key and "type" in d.keys(): - del d[key] - else: - _remove_a_key(d[key], remove_key) - - class openai_function: """ Decorator to convert a function into an OpenAI function. 
@@ -82,8 +72,6 @@ class openai_function: parameters["required"] = sorted( k for k, v in parameters["properties"].items() if not "default" in v ) - _remove_a_key(parameters, "additionalProperties") - _remove_a_key(parameters, "title") self.openai_schema = { "name": self.func.__name__, "description": self.docstring.short_description, From 08d08aef260b58c5f70a411570b1d15ae3a2e260 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 17:45:16 -0400 Subject: [PATCH 13/19] clean up --- instructor/distil.py | 64 ++++++++++++++++++++---------------- instructor/function_calls.py | 2 -- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/instructor/distil.py b/instructor/distil.py index 179a9fb..94f02c8 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -171,36 +171,17 @@ class Instructions: base_model: BaseModel = type(resp) if finetune_format == FinetuneFormat.MESSAGES: - openai_function_call = openai_schema(base_model).openai_schema - func_def = get_signature_from_fn(fn).replace(fn.__name__, name) - - str_args = ", ".join(map(str, args)) - str_kwargs = ( - ", ".join(f"{k}={json.dumps(v)}" for k, v in kwargs.items()) or None + openai_kwargs = self.openai_kwargs(name, fn, args, kwargs, base_model) + openai_kwargs.append( + { + "role": "assistant", + "function_call": { + "name": base_model.__name__, + "arguments": resp.model_dump_json(indent=self.indent), + }, + } ) - call_args = ", ".join(filter(None, [str_args, str_kwargs])) - - function_body = { - "messages": [ - { - "role": "system", - "content": f"Predict the results of this function:\n\n{func_def}", - }, - { - "role": "user", - "content": f"Return `{name}({call_args})`", - }, - { - "role": "assistant", - "function_call": { - "name": openai_function_call["name"], - "arguments": resp.model_dump_json(indent=self.indent), - }, - }, - ], - "functions": [openai_function_call], - } - self.logger.info(json.dumps(function_body)) + self.logger.info(json.dumps(openai_kwargs)) if finetune_format 
== FinetuneFormat.RAW: function_body = dict( @@ -212,3 +193,28 @@ class Instructions: schema=base_model.model_json_schema(), ) self.logger.info(json.dumps(function_body)) + + def openai_kwargs(self, name, fn, args, kwargs, base_model): + openai_function_call = openai_schema(base_model).openai_schema + func_def = get_signature_from_fn(fn).replace(fn.__name__, name) + + str_args = ", ".join(map(str, args)) + str_kwargs = ( + ", ".join(f"{k}={json.dumps(v)}" for k, v in kwargs.items()) or None + ) + call_args = ", ".join(filter(None, [str_args, str_kwargs])) + + function_body = { + "messages": [ + { + "role": "system", + "content": f"Predict the results of this function:\n\n{func_def}", + }, + { + "role": "user", + "content": f"Return `{name}({call_args})`", + }, + ], + "functions": [openai_function_call], + } + return function_body diff --git a/instructor/function_calls.py b/instructor/function_calls.py index 6146ec1..5ef779b 100644 --- a/instructor/function_calls.py +++ b/instructor/function_calls.py @@ -188,8 +188,6 @@ class OpenAISchema(BaseModel): f"the required parameters with correct types" ) - _remove_a_key(parameters, "title") - _remove_a_key(parameters, "additionalProperties") return { "name": schema["title"], "description": schema["description"], From 6f2e3d56c624acae5439d07d09690c3f2e3b54cd Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 17:46:25 -0400 Subject: [PATCH 14/19] clean up tests --- instructor/distil.py | 2 +- tests/test_multitask.py | 67 +++-------------------------------------- 2 files changed, 5 insertions(+), 64 deletions(-) diff --git a/instructor/distil.py b/instructor/distil.py index 94f02c8..894764a 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -172,7 +172,7 @@ class Instructions: if finetune_format == FinetuneFormat.MESSAGES: openai_kwargs = self.openai_kwargs(name, fn, args, kwargs, base_model) - openai_kwargs.append( + openai_kwargs["messages"].append( { "role": "assistant", "function_call": { diff 
--git a/tests/test_multitask.py b/tests/test_multitask.py index 9e24949..04bd014 100644 --- a/tests/test_multitask.py +++ b/tests/test_multitask.py @@ -10,67 +10,8 @@ def test_multi_task(): query: str multitask = MultiTask(Search) - assert multitask.openai_schema == { - "description": "Correct segmentation of `Search` tasks", - "name": "MultiSearch", - "parameters": { - "$defs": { - "Search": { - "properties": { - "id": {"type": "integer"}, - "query": {"type": "string"}, - }, - "required": ["id", "query"], - "description": "This is the search docstring", - "type": "object", - } - }, - "properties": { - "tasks": { - "description": "Correctly segmented list of `Search` tasks", - "items": {"$ref": "#/$defs/Search"}, - "type": "array", - } - }, - "required": ["tasks"], - "type": "object", - }, - } - - -def test_multi_task_with_name_and_desc(): - class Search(OpenAISchema): - """This is the search docstring""" - - id: int - query: str - - multitask = MultiTask( - subtask_class=Search, name="MyCustomName", description="MyCustomDesc" + assert multitask.openai_schema["name"] == "MultiSearch" + assert ( + multitask.openai_schema["description"] + == "Correct segmentation of `Search` tasks" ) - assert multitask.openai_schema == { - "description": "MyCustomDesc", - "name": "MultiMyCustomName", - "parameters": { - "$defs": { - "Search": { - "properties": { - "id": {"type": "integer"}, - "query": {"type": "string"}, - }, - "required": ["id", "query"], - "description": "This is the search docstring", - "type": "object", - } - }, - "properties": { - "tasks": { - "description": "Correctly segmented list of `MyCustomName` tasks", - "items": {"$ref": "#/$defs/Search"}, - "type": "array", - } - }, - "required": ["tasks"], - "type": "object", - }, - } From edf29482e4d27cb7323e6c0e8600199c2f81f527 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 17:47:26 -0400 Subject: [PATCH 15/19] include code body --- instructor/distil.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 
deletion(-) diff --git a/instructor/distil.py b/instructor/distil.py index 894764a..5d52972 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -83,12 +83,14 @@ class Instructions: log_handlers: List[logging.Handler] = None, finetune_format: FinetuneFormat = FinetuneFormat.MESSAGES, indent: int = 2, + include_code_body: bool = False, ): self.name = name self.id = id or str(uuid.uuid4()) self.unique_id = str(uuid.uuid4()) self.finetune_format = finetune_format self.indent = indent + self.include_code_body = include_code_body self.logger = logging.getLogger(self.name) for handler in log_handlers or []: @@ -196,7 +198,11 @@ class Instructions: def openai_kwargs(self, name, fn, args, kwargs, base_model): openai_function_call = openai_schema(base_model).openai_schema - func_def = get_signature_from_fn(fn).replace(fn.__name__, name) + + if self.include_code_body: + func_def = format_function(fn) + else: + func_def = get_signature_from_fn(fn).replace(fn.__name__, name) str_args = ", ".join(map(str, args)) str_kwargs = ( From 7119d1825798e2fd7e8aea30c86f4ddf923a9209 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 17:54:19 -0400 Subject: [PATCH 16/19] implement dispatch --- instructor/distil.py | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/instructor/distil.py b/instructor/distil.py index 5d52972..3ea844f 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -4,10 +4,12 @@ import inspect import json import logging -from typing import Any, Callable, List, Optional -import uuid +from typing import Any, Callable, List, Optional, Type from pydantic import BaseModel, validate_call +import uuid +import openai + from instructor import openai_schema @@ -85,6 +87,16 @@ class Instructions: indent: int = 2, include_code_body: bool = False, ): + """ + Instructions for distillation and dispatch. + + :param name: Name of the instructions. + :param id: ID of the instructions. 
+ :param log_handlers: List of log handlers to use. + :param finetune_format: Format to use for finetuning. + :param indent: Indentation to use for finetuning. + :param include_code_body: Whether to include the code body in the finetuning. + """ self.name = name self.id = id or str(uuid.uuid4()) self.unique_id = str(uuid.uuid4()) @@ -132,6 +144,20 @@ class Instructions: def _wrap_distil(fn): msg = f"Return type hint for {fn} must subclass `pydantic.BaseModel'" assert is_return_type_base_model_or_instance(fn), msg + return_base_model = inspect.signature(fn).return_annotation + + @functools.wraps(fn) + def _dispatch(*args, **kwargs): + openai_kwargs = self.openai_kwargs( + name=name, + fn=fn, + args=args, + kwargs=kwargs, + base_model=return_base_model, + ) + return openai.ChatCompletion.create( + **openai_kwargs, response_model=return_base_model + ) @functools.wraps(fn) def _distil(*args, **kwargs): @@ -142,7 +168,11 @@ class Instructions: return resp - return _distil + if mode == "dispatch": + return _dispatch + + if mode == "distil": + return _distil if len(args) == 1 and callable(args[0]): return _wrap_distil(args[0]) @@ -173,6 +203,7 @@ class Instructions: base_model: BaseModel = type(resp) if finetune_format == FinetuneFormat.MESSAGES: + openai_function_call = openai_schema(base_model).openai_schema openai_kwargs = self.openai_kwargs(name, fn, args, kwargs, base_model) openai_kwargs["messages"].append( { @@ -183,6 +214,8 @@ class Instructions: }, } ) + openai_kwargs["functions"] = [openai_function_call] + openai_kwargs["function_call"] = {"name": openai_function_call["name"]} self.logger.info(json.dumps(openai_kwargs)) if finetune_format == FinetuneFormat.RAW: @@ -197,8 +230,6 @@ class Instructions: self.logger.info(json.dumps(function_body)) def openai_kwargs(self, name, fn, args, kwargs, base_model): - openai_function_call = openai_schema(base_model).openai_schema - if self.include_code_body: func_def = format_function(fn) else: @@ -221,6 +252,5 @@ class 
Instructions: "content": f"Return `{name}({call_args})`", }, ], - "functions": [openai_function_call], } return function_body From b57edaaf6a96be0370e1e3d564b1359eeca09de0 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 19:13:16 -0400 Subject: [PATCH 17/19] update --- examples/distilations/math_finetunes.jsonl | 20 ++++---- examples/distilations/three_digit_mul.py | 43 +++++++--------- .../distilations/three_digit_mul_dispatch.py | 49 +++++++++++++++++++ instructor/distil.py | 7 ++- 4 files changed, 80 insertions(+), 39 deletions(-) create mode 100644 examples/distilations/three_digit_mul_dispatch.py diff --git a/examples/distilations/math_finetunes.jsonl b/examples/distilations/math_finetunes.jsonl index 64961a3..2fc676c 100644 --- a/examples/distilations/math_finetunes.jsonl +++ b/examples/distilations/math_finetunes.jsonl @@ -1,10 +1,10 @@ -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(540, b=677, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 540,\n \"b\": 677,\n \"result\": 1217\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: 
_description_\n\"\"\""}, {"role": "user", "content": "Return `fn(798, b=534, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 798,\n \"b\": 534,\n \"result\": 1332\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(608, b=669, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 608,\n \"b\": 669,\n \"result\": 1277\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(982, b=768, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 982,\n \"b\": 768,\n \"result\": 1750\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted 
`Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(994, b=682, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 994,\n \"b\": 682,\n \"result\": 1676\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(467, b=754, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 467,\n \"b\": 754,\n \"result\": 1221\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": 
[{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(497, b=364, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 497,\n \"b\": 364,\n \"result\": 861\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(840, b=821, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 840,\n \"b\": 821,\n \"result\": 1661\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": 
"Return `fn(646, b=835, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 646,\n \"b\": 835,\n \"result\": 1481\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} -{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Multiply\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return `fn(926, b=196, c=\"hello\")`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 926,\n \"b\": 196,\n \"result\": 1122\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"description": "The result of the multiplication", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(286, b=521)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 286,\n \"b\": 521,\n \"result\": 149006\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": 
"integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(809, b=358)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 809,\n \"b\": 358,\n \"result\": 289622\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(634, b=574)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 634,\n \"b\": 574,\n \"result\": 363916\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return 
`fn(736, b=276)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 736,\n \"b\": 276,\n \"result\": 203136\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(143, b=591)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 143,\n \"b\": 591,\n \"result\": 84513\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(929, b=123)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 929,\n \"b\": 123,\n \"result\": 114267\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": 
{"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(342, b=800)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 342,\n \"b\": 800,\n \"result\": 273600\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(887, b=718)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 887,\n \"b\": 718,\n \"result\": 636866\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(250, b=694)`"}, {"role": 
"assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 250,\n \"b\": 694,\n \"result\": 173500\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n\"\"\"\nReturn the result of multiplying a and b together\n\"\"\""}, {"role": "user", "content": "Return `fn(731, b=622)`"}, {"role": "assistant", "function_call": {"name": "Multiply", "arguments": "{\n \"a\": 731,\n \"b\": 622,\n \"result\": 454682\n}"}}], "functions": [{"name": "Multiply", "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": {"properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "integer"}, "result": {"description": "The result of the multiplication", "title": "Result", "type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} diff --git a/examples/distilations/three_digit_mul.py b/examples/distilations/three_digit_mul.py index 74ccd69..9c521ee 100644 --- a/examples/distilations/three_digit_mul.py +++ b/examples/distilations/three_digit_mul.py @@ -22,49 +22,43 @@ class Multiply(BaseModel): @instructions.distil -def fn(a: int, b: int, c: str) -> Multiply: - """_summary_ - - Args: - a (int): _description_ - b (int): _description_ - c (str): _description_ - - Returns: - Response: _description_ - """ - resp = a + b +def fn(a: int, b: int) -> Multiply: + """Return the result of multiplying a and b together""" + resp = a * b return Multiply(a=a, b=b, result=resp) if __name__ == "__main__": import 
random - # A log will look like this: - log_line = { + log_lines = { "messages": [ { "role": "system", - "content": 'Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n"""\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n"""', + "content": 'Predict the results of this function:\n\ndef fn(a: int, b: int) -> __main__.Multiply\n"""\nReturn the result of multiplying a and b together\n"""', }, - {"role": "user", "content": 'Return fn(133, b=539, c="hello")'}, + {"role": "user", "content": "Return `fn(169, b=166)`"}, { "role": "assistant", "function_call": { - "name": "Response", - "arguments": '{"a":133,"b":539,"result":672}', + "name": "Multiply", + "arguments": '{\n "a": 169,\n "b": 166,\n "result": 28054\n}', }, }, ], "functions": [ { - "name": "Response", - "description": "Correctly extracted `Response` with all the required parameters with correct types", + "name": "Multiply", + "description": "Correctly extracted `Multiply` with all the required parameters with correct types", "parameters": { "properties": { - "a": {"type": "integer"}, - "b": {"type": "integer"}, - "result": {"type": "integer"}, + "a": {"title": "A", "type": "integer"}, + "b": {"title": "B", "type": "integer"}, + "result": { + "description": "The result of the multiplication", + "title": "Result", + "type": "integer", + }, }, "required": ["a", "b", "result"], "type": "object", @@ -72,8 +66,7 @@ if __name__ == "__main__": } ], } - for _ in range(10): a = random.randint(100, 999) b = random.randint(100, 999) - print("returning", fn(a, b=b, c="hello")) + print("returning", fn(a, b=b)) diff --git a/examples/distilations/three_digit_mul_dispatch.py b/examples/distilations/three_digit_mul_dispatch.py new file mode 100644 index 0000000..1feb611 --- /dev/null +++ b/examples/distilations/three_digit_mul_dispatch.py @@ -0,0 +1,49 @@ +import logging + +from pydantic import 
BaseModel, Field +from instructor import Instructions +import instructor + +instructor.patch() + +logging.basicConfig(level=logging.INFO) + +# Usage +instructions = Instructions( + name="three_digit_multiply", + finetune_format="messages", + include_code_body=True, + log_handlers=[ + logging.FileHandler("math_finetunes.jsonl"), + ], +) + + +class Multiply(BaseModel): + a: int + b: int + result: int = Field(..., description="The result of the multiplication") + + +@instructions.distil(mode="dispatch", model="ft:gpt-3.5-turbo-0613:personal::8CazU0uq") +def fn(a: int, b: int) -> Multiply: + """Return the result of the multiplication as an integer""" + resp = a * b + return Multiply(a=a, b=b, result=resp) + + +if __name__ == "__main__": + import random + + for _ in range(5): + a = random.randint(100, 999) + b = random.randint(100, 999) + result = fn(a, b) + print(f"{a} * {b} = {result.result}, expected {a*b}") + """ + 972 * 508 = 493056, expected 493776 + 145 * 369 = 53505, expected 53505 + 940 * 440 = 413600, expected 413600 + 114 * 213 = 24282, expected 24282 + 259 * 650 = 168350, expected 168350 + """ diff --git a/instructor/distil.py b/instructor/distil.py index 3ea844f..bc4881f 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -113,6 +113,7 @@ class Instructions: *args, name: str = None, mode: str = "distil", + model: str = "gpt-3.5-turbo", fine_tune_format: FinetuneFormat = None, ): """ @@ -136,7 +137,6 @@ class Instructions: """ allowed_modes = {"distil", "dispatch"} assert mode in allowed_modes, f"Must be in {allowed_modes}" - assert mode == "distil", "Only distil mode is supported at the moment." 
if fine_tune_format is None: fine_tune_format = self.finetune_format @@ -156,7 +156,7 @@ class Instructions: base_model=return_base_model, ) return openai.ChatCompletion.create( - **openai_kwargs, response_model=return_base_model + **openai_kwargs, model=model, response_model=return_base_model ) @functools.wraps(fn) @@ -215,7 +215,6 @@ class Instructions: } ) openai_kwargs["functions"] = [openai_function_call] - openai_kwargs["function_call"] = {"name": openai_function_call["name"]} self.logger.info(json.dumps(openai_kwargs)) if finetune_format == FinetuneFormat.RAW: @@ -233,7 +232,7 @@ class Instructions: if self.include_code_body: func_def = format_function(fn) else: - func_def = get_signature_from_fn(fn).replace(fn.__name__, name) + func_def = get_signature_from_fn(fn) str_args = ", ".join(map(str, args)) str_kwargs = ( From 54fc83ffe047c1a860d429617c22442f42f8380e Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 19:22:22 -0400 Subject: [PATCH 18/19] blog update --- docs/blog/posts/distilation-part1.md | 71 ++++++++++++++++------------ 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/docs/blog/posts/distilation-part1.md b/docs/blog/posts/distilation-part1.md index d2cf0d3..716003d 100644 --- a/docs/blog/posts/distilation-part1.md +++ b/docs/blog/posts/distilation-part1.md @@ -3,30 +3,24 @@ draft: False date: 2023-10-17 tags: - python - - distilation + - distillation - function calling - - tinetuning + - finetuning --- -# Introduction to `Instructions` from `Instructor`, finetuning from Python functions. +# Enhancing Python Functions with Instructor: A Guide to Fine-Tuning and Distillation -The core philosophy with the `instructor` library is to make language models backwards compatible with existing code. By adding Pydantic in the mix we're able to easily work with LLMs without much worry. +## Introduction -However, many times, a single function isn't just one LLM call. 
After the results are returned theres [validation](/docs/validation.md), some additional processing and formatting before you `return` the result. +Get ready to dive deep into the world of fine-tuning task-specific language models with Python functions. We'll explore how the `instructor.instructions` streamlines this process, making the task you want to distil more efficient and powerful while preserving its original functionality and backwards compatibility. -But the promise of LLMs is that they can do all of this in one go. So how do we get there? Finetuning end to end is a great tool for enhancing language models. Instructor uses type hints via Pydantic to maintain backward compatibility. Distillation focuses on fine-tuning language models to imitate specific functions. +## Why You Need Instructor -## Challenges in Fine-tuning +Imagine you're developing a backend service that uses a mix of old- and new-school ML practices; it may involve pipelines with multiple function calls, validations, and data processing. Sounds cumbersome, right? That's where `Instructor` comes in. It simplifies complex procedures, making them more efficient and easier to manage by adding a decorator to your function that will automatically generate a dataset for fine-tuning and help you swap out the function implementation. -Fine-tuning a model isn't as straightforward as just writing `def f(a, b): return a * b` to teach a model three-digit multiplication. Substantial data preparation is required, making logging for data collection cumbersome. Luckily OpenAI not only provides a fine-tuning script but also one for function calling which simplies the process backed by structured outputs! More over, the finetune allows us to avoid passing the schema to the model, resulting in less tokens being used! +## Quick Start: How to Use Instructor's Distillation Feature -## Role of Instructor in Easing the Process - -The feature `from instructor import Instructions` simplifies this. 
It decorates Python functions that return Pydantic objects, automatically creating a fine-tuning dataset when provided a handler for logging. This allows you to finetune a model to imitate a function's behavior. - -## How to Use Instructor's Distillation Feature - -Here's an example to illustrate its use: +Before we dig into the nitty-gritty, let's look at how easy it is to use Instructor's distillation feature to use function calling finetuning to export the data to a JSONL file. ```python import logging @@ -34,6 +28,7 @@ import random from pydantic import BaseModel from instructor import Instructions +# Logging setup logging.basicConfig(level=logging.INFO) instructions = Instructions( @@ -47,18 +42,38 @@ class Multiply(BaseModel): b: int result: int +# Define a function with distillation +# The decorator will automatically generate a dataset for fine-tuning +# They must return a pydantic model to leverage function calling @instructions.distil def fn(a: int, b: int) -> Multiply: resp = a * b return Multiply(a=a, b=b, result=resp) +# Generate some data for _ in range(10): a = random.randint(100, 999) b = random.randint(100, 999) print(fn(a, b)) ``` -## Logging output +## The Intricacies of Fine-tuning Language Models + +Fine-tuning isn't just about writing a function like `def f(a, b): return a * b`. It requires detailed data preparation and logging. However, Instructor provides a built-in logging feature and structured outputs to simplify this. + +## Why Instructor and Distillation are Game Changers + +The library offers two main benefits: + +1. **Efficiency**: Streamlines functions, distilling requirements into model weights and a few lines of code. +2. **Integration**: Eases combining classical machine learning and language models by providing a simple interface that wraps existing functions. + +## Role of Instructor in Simplifying Fine-Tuning + +The `from instructor import Instructions` feature is a time saver. 
It auto-generates a fine-tuning dataset, making it a breeze to imitate a function's behavior. + +## Logging Output and Running a Finetune +Here's how the logging output would look: ```python { @@ -79,18 +94,14 @@ for _ in range(10): } ``` -## Why Instructor and Distillation are Useful +Run a finetune like this: -Many systems are not as simple as a single `openai.ChatCompletion.create` call, instead we often create objects, do additional processing, validation, error correction, and then return the result. This is a lot of work, and it's easy to make mistakes. Instructor's `distil` feature makes this process easier by: +```bash +instructor jobs create-from-file math_finetunes.jsonl +``` -1. Streamlines complex functions with validations, making them more efficient. -2. Facilitates the integration of classical machine learning with language models. - -By understanding and leveraging these capabilities, you can create powerful, fine-tuned language models with ease. To learn more about how to use the file to finetune a model, check out the [cli](/docs/cli/finetune.md) - -## Next Steps - -This post is mostly a peek of what I've been working on this week. Once we have a model trained I'd like to be able to dynamically swap the implemetnation of a function with a model. This would allow us to do things like: +## Next Steps and Future Plans +Here's a sneak peek of what I'm planning: ```python from instructor import Instructions @@ -99,12 +110,14 @@ instructions = Instructions( name="three_digit_multiply", ) -@instructions.distil(model='gpt-3.5-turbo:finetuned', swap=True) +@instructions.distil(model='gpt-3.5-turbo:finetuned-123', mode="dispatch") def fn(a: int, b: int) -> Multiply: resp = a + b return Multiply(a=a, b=b, result=resp) ``` -Now we can swap out the implementation of `fn` with calling the finetuned model, since we know the response type is still `Multiply` we can use instructor behind the scenes and have it be backwards compatible with the existing code. 
+With this, you can swap the function implementation, making it backward compatible. You can even imagine using different models for different tasks or validating and running evals by using the original function and comparing it to the distillation. -Now if you're thinking wow, I'd love a backend service to do this for continously, you're in luck! Please check out the survey at [useinstructor.com](https://useinstructor.com) and let us know who you are. \ No newline at end of file +## Conclusion + +We've seen how `Instructor` can make your life easier, from fine-tuning to distillation. Stay tuned for more updates and don't forget to check out the survey at [useinstructor.com](https://useinstructor.com). \ No newline at end of file From 01bf191eb0f73fe0497fcfd049976e9020a11bf0 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sun, 22 Oct 2023 19:22:41 -0400 Subject: [PATCH 19/19] distil --- docs/blog/posts/distilation-part1.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/blog/posts/distilation-part1.md b/docs/blog/posts/distilation-part1.md index 716003d..e006ce0 100644 --- a/docs/blog/posts/distilation-part1.md +++ b/docs/blog/posts/distilation-part1.md @@ -120,4 +120,4 @@ With this, you can swap the function implementation, making it backward compatib ## Conclusion -We've seen how `Instructor` can make your life easier, from fine-tuning to distillation. Stay tuned for more updates and don't forget to check out the survey at [useinstructor.com](https://useinstructor.com). \ No newline at end of file +We've seen how `Instructor` can make your life easier, from fine-tuning to distillation. Now if you're thinking wow, I'd love a backend service to do this continuously, you're in luck! Please check out the survey at [useinstructor.com](https://useinstructor.com) and let us know who you are. \ No newline at end of file