From 0d44023f213a8ca59b644036c3c3d2b07f6e489a Mon Sep 17 00:00:00 2001
From: Jason
Date: Sat, 17 Jun 2023 23:02:06 +0900
Subject: [PATCH] recursive dataclass example with filesystem

---
 README.md                |   7 ++
 parse_recursive_paths.py | 183 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 190 insertions(+)
 create mode 100644 parse_recursive_paths.py

diff --git a/README.md b/README.md
index 55ef48e..15796bb 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,13 @@ Each search query is defined by a `Search` class, consisting of a `title`, a `qu
 
 A request is then segmented into multiple search queries, by passing the request to the `segment` function. The function makes a call to the OpenAI API, instructing it to use the `MultiSearch` class to segment the request into multiple search queries.
 
+### DirectoryTree and Recursive Classes
+
+The `DirectoryTree` and `Node` classes in this example illustrate an advanced use case: understanding and manipulating hierarchical data structures with recursive data types. The script includes functionality for parsing a string representation of a filesystem into a structured directory tree, with the ability to distinguish between file and folder nodes.
+
+This recursion is handled by wrapping it in the `DirectoryTree` class, which is non-recursive.
+
+This is because Pydantic, the library used to generate schemas for these classes, encounters limitations when handling recursive schemas. Therefore, the workaround implemented here is to wrap the recursive `Node` class in the non-recursive `DirectoryTree` class.
 
 ```python
 class MultiSearch(OpenAISchema):
diff --git a/parse_recursive_paths.py b/parse_recursive_paths.py
new file mode 100644
index 0000000..8ddabe0
--- /dev/null
+++ b/parse_recursive_paths.py
@@ -0,0 +1,183 @@
+"""
+This script parses a string representation of a filesystem structure into a tree-like directory structure.
+
+The 'Node' class represents a node in this tree, which can be either a file or a folder. Files cannot have
+children, while folders can.
+
+The 'DirectoryTree' class contains a single root folder from which all other files/folders can be reached.
+The 'parse_tree_to_filesystem' function uses an OpenAI chat model (gpt-3.5-turbo-0613) to convert a string representation of a
+directory tree into a 'DirectoryTree' object. This object can then be manipulated programmatically as needed,
+with methods such as 'print_paths' available for convenience.
+
+Please note: Recursive models currently work if they are wrapped by a non-recursive one. This is why we are
+passing a 'DirectoryTree' (which contains a single 'Node') as the function call, not a 'Node' directly. This
+is due to a limitation in how Pydantic generates schemas for recursive objects, which creates
+'dict_keys(['$ref', 'definitions'])'. Instead of writing a resolver for such references, we can simply wrap the
+recursive class in a non-recursive one so the function_call class never has a cyclic reference.
+
+Example usage:
+>>> root = parse_tree_to_filesystem(
+...     '''
+...     root
+...     ├── folder1
+...     │   ├── file1.txt
+...     │   └── file2.txt
+...     └── folder2
+...         ├── file3.txt
+...         └── subfolder1
+...             └── file4.txt
+...     '''
+... )
+>>> root.print_paths()
+# Expected output:
+# >>> root NodeType.FOLDER
+# >>> root/folder1 NodeType.FOLDER
+# >>> root/folder1/file1.txt NodeType.FILE
+# >>> root/folder1/file2.txt NodeType.FILE
+# >>> root/folder2 NodeType.FOLDER
+# >>> root/folder2/file3.txt NodeType.FILE
+# >>> root/folder2/subfolder1 NodeType.FOLDER
+# >>> root/folder2/subfolder1/file4.txt NodeType.FILE
+"""
+
+import enum
+from typing import List
+
+import openai
+from openai_function_call import OpenAISchema
+from pydantic import Field
+from tenacity import retry, stop_after_attempt
+
+
+class NodeType(str, enum.Enum):
+    """Enumeration representing the types of nodes in a filesystem."""
+
+    FILE = "file"
+    FOLDER = "folder"
+
+
+class Node(OpenAISchema):
+    """
+    Class representing a single node in a filesystem. Can be either a file or a folder.
+    Note that a file cannot have children, but a folder can.
+
+    Args:
+        name (str): The name of the node.
+        children (List[Node]): The list of child nodes (if any).
+        node_type (NodeType): The type of the node, either a file or a folder.
+
+    Methods:
+        print_paths: Prints the path of the node and its children.
+    """
+
+    name: str = Field(..., description="Name of the file or folder")
+    children: List["Node"] = Field(
+        default_factory=list,
+        description="List of children nodes, only applicable for folders, files cannot have children",
+    )
+    node_type: NodeType = Field(
+        default=NodeType.FILE,
+        description="Either a file or folder, use the name to determine which it could be",
+    )
+
+    def print_paths(self, parent_path: str = "") -> None:
+        """
+        Print the path and type of this node, then recurse into its children.
+
+        Args:
+            parent_path (str): Path of the enclosing folder; empty for the root node.
+        """
+
+        # Build the path once for both node types so that a parent-less node
+        # (e.g. a tree whose root is a single file) is not printed with a leading "/".
+        path = f"{parent_path}/{self.name}" if parent_path else self.name
+        print(path, self.node_type)
+
+        # Only folders are traversed: any children reported on a file are ignored.
+        if self.node_type == NodeType.FOLDER and self.children:
+            for child in self.children:
+                child.print_paths(path)
+
+
+class DirectoryTree(OpenAISchema):
+    """
+    Container class representing a directory tree.
+
+    Args:
+        root (Node): The root node of the tree.
+
+    Methods:
+        print_paths: Prints the paths of the root node and its children.
+    """
+
+    root: Node = Field(..., description="Root folder of the directory tree")
+
+    def print_paths(self) -> None:
+        """Prints the paths of the root node and its children."""
+
+        self.root.print_paths()
+
+
+# Resolve the "Node" forward reference used by Node.children before the schema is generated.
+Node.update_forward_refs()
+DirectoryTree.update_forward_refs()
+
+
+@retry(stop=stop_after_attempt(3))
+def parse_tree_to_filesystem(data: str) -> DirectoryTree:
+    """
+    Convert a string representing a directory tree into a filesystem structure
+    using OpenAI's gpt-3.5-turbo-0613 chat model.
+
+    The request is attempted up to three times; if every attempt raises,
+    tenacity raises a RetryError wrapping the last failure.
+
+    Args:
+        data (str): The string to convert into a filesystem.
+
+    Returns:
+        DirectoryTree: The directory tree representing the filesystem.
+    """
+
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo-0613",
+        temperature=0.2,
+        functions=[DirectoryTree.openai_schema],
+        function_call={"name": DirectoryTree.openai_schema["name"]},
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a perfect file system parsing algorithm. You are given a string representing a directory tree. You must return the correct filesystem structure.",
+            },
+            {
+                "role": "user",
+                "content": f"Consider the data below:\n{data} and return the correctly labeled filesystem",
+            },
+        ],
+        max_tokens=1000,
+    )
+    return DirectoryTree.from_response(completion)
+
+
+if __name__ == "__main__":
+    root = parse_tree_to_filesystem(
+        """
+        root
+        ├── folder1
+        │   ├── file1.txt
+        │   └── file2.txt
+        └── folder2
+            ├── file3.txt
+            └── subfolder1
+                └── file4.txt
+        """
+    )
+    root.print_paths()
+    # >>> root NodeType.FOLDER
+    # >>> root/folder1 NodeType.FOLDER
+    # >>> root/folder1/file1.txt NodeType.FILE
+    # >>> root/folder1/file2.txt NodeType.FILE
+    # >>> root/folder2 NodeType.FOLDER
+    # >>> root/folder2/file3.txt NodeType.FILE
+    # >>> root/folder2/subfolder1 NodeType.FOLDER
+    # >>> root/folder2/subfolder1/file4.txt NodeType.FILE