diff --git a/docs/extras/integrations/document_loaders/huawei_obs_directory.ipynb b/docs/extras/integrations/document_loaders/huawei_obs_directory.ipynb new file mode 100644 index 000000000..e2cbeef26 --- /dev/null +++ b/docs/extras/integrations/document_loaders/huawei_obs_directory.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c83b6a4c", + "metadata": {}, + "source": [ + "# Huawei OBS Directory\n", + "The following code demonstrates how to load objects from the Huawei OBS (Object Storage Service) as documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2191935", + "metadata": {}, + "outputs": [], + "source": [ + "# Install the required package\n", + "# pip install esdk-obs-python" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "55fca3b4", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import OBSDirectoryLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c3ed419f", + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = \"your-endpoint\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3428fd4e", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure your access credentials\n", + "config = {\n", + " \"ak\": \"your-access-key\",\n", + " \"sk\": \"your-secret-key\"\n", + "}\n", + "loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9beede9f", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "1e20a839", + "metadata": {}, + "source": [ + "## Specify a Prefix for Loading\n", + "If you want to load objects with a specific prefix from the bucket, you can use the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "125f311d", + "metadata": {}, + "outputs": [], + "source": [ + "loader = 
OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config, prefix=\"test_prefix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3488037", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "84c82c0a", + "metadata": {}, + "source": [ + "## Get Authentication Information from ECS\n", + "If your langchain is deployed on Huawei Cloud ECS and [Agency is set up](https://support.huaweicloud.com/intl/en-us/usermanual-ecs/ecs_03_0166.html#section7), the loader can directly get the security token from ECS without needing access key and secret key. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1db99969", + "metadata": {}, + "outputs": [], + "source": [ + "config = {\"get_token_from_ecs\": True}\n", + "loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57dd9f35", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "30205d25", + "metadata": {}, + "source": [ + "## Use a Public Bucket\n", + "If your bucket's bucket policy allows anonymous access (anonymous users have `listBucket` and `GetObject` permissions), you can directly load the objects without configuring the `config` parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4dfa2ef0", + "metadata": {}, + "outputs": [], + "source": [ + "loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67d4c1d0", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/integrations/document_loaders/huawei_obs_file.ipynb b/docs/extras/integrations/document_loaders/huawei_obs_file.ipynb new file mode 100644 index 000000000..5617f673c --- /dev/null +++ b/docs/extras/integrations/document_loaders/huawei_obs_file.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4394a872", + "metadata": {}, + "source": [ + "# Huawei OBS File\n", + "The following code demonstrates how to load an object from the Huawei OBS (Object Storage Service) as a document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c43d811b", + "metadata": {}, + "outputs": [], + "source": [ + "# Install the required package\n", + "# pip install esdk-obs-python" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5e16bae6", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.obs_file import OBSFileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "75cc7e7c", + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = \"your-endpoint\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f9816984", + "metadata": {}, + "outputs": [], + "source": [ + "from obs import ObsClient\n", + "obs_client = ObsClient(access_key_id=\"your-access-key\", secret_access_key=\"your-secret-key\", server=endpoint)\n", + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", client=obs_client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6143b39b", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "633e05ca", + "metadata": {}, + "source": [ + "## Each Loader with Separate Authentication Information\n", + "If you don't need to reuse OBS connections between different loaders, you can directly configure the `config`. The loader will use the config information to initialize its own OBS client." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a5dd6a5d", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure your access credentials\n", + "config = {\n", + " \"ak\": \"your-access-key\",\n", + " \"sk\": \"your-secret-key\"\n", + "}\n", + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\",endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a741f1c", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "1e2e611c", + "metadata": {}, + "source": [ + "## Get Authentication Information from ECS\n", + "If your langchain is deployed on Huawei Cloud ECS and [Agency is set up](https://support.huaweicloud.com/intl/en-us/usermanual-ecs/ecs_03_0166.html#section7), the loader can directly get the security token from ECS without needing access key and secret key. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "338fafef", + "metadata": {}, + "outputs": [], + "source": [ + "config = {\"get_token_from_ecs\": True}\n", + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73976c55", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "b77aa18c", + "metadata": {}, + "source": [ + "## Access a Publicly Accessible Object\n", + "If the object you want to access allows anonymous user access (anonymous users have `GetObject` permission), you can directly load the object without configuring the `config` parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "df83d121", + "metadata": {}, + "outputs": [], + "source": [ + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", endpoint=endpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82a844ba", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py index 34446a33a..4813021f7 100644 --- a/libs/langchain/langchain/document_loaders/__init__.py +++ b/libs/langchain/langchain/document_loaders/__init__.py @@ -98,6 +98,8 @@ from langchain.document_loaders.modern_treasury import ModernTreasuryLoader from langchain.document_loaders.notebook import NotebookLoader from langchain.document_loaders.notion import NotionDirectoryLoader from langchain.document_loaders.notiondb import NotionDBLoader +from langchain.document_loaders.obs_directory import OBSDirectoryLoader +from langchain.document_loaders.obs_file import OBSFileLoader from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.odt import UnstructuredODTLoader from langchain.document_loaders.onedrive import OneDriveLoader @@ -251,6 +253,8 @@ __all__ = [ "NotebookLoader", "NotionDBLoader", "NotionDirectoryLoader", + "OBSDirectoryLoader", + "OBSFileLoader", "ObsidianLoader", "OneDriveFileLoader", "OneDriveLoader", diff --git a/libs/langchain/langchain/document_loaders/obs_directory.py 
b/libs/langchain/langchain/document_loaders/obs_directory.py
new file mode 100644
index 000000000..4c81c5ff1
--- /dev/null
+++ b/libs/langchain/langchain/document_loaders/obs_directory.py
@@ -0,0 +1,107 @@
+# coding:utf-8
+from typing import List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.obs_file import OBSFileLoader
+
+
+class OBSDirectoryLoader(BaseLoader):
+    """Load all objects under a prefix of a Huawei OBS bucket as documents."""
+
+    def __init__(
+        self,
+        bucket: str,
+        endpoint: str,
+        config: Optional[dict] = None,
+        prefix: str = "",
+    ):
+        """Initialize the OBSDirectoryLoader with the specified settings.
+
+        Args:
+            bucket (str): The name of the OBS bucket to be used.
+            endpoint (str): The endpoint URL of your OBS bucket.
+            config (dict): The parameters for connecting to OBS, provided as a dictionary. The dictionary could have the following keys:
+                - "ak" (str, optional): Your OBS access key (required if `get_token_from_ecs` is False and bucket policy is not public read).
+                - "sk" (str, optional): Your OBS secret key (required if `get_token_from_ecs` is False and bucket policy is not public read).
+                - "token" (str, optional): Your security token (required if using temporary credentials).
+                - "get_token_from_ecs" (bool, optional): Whether to retrieve the security token from ECS. Defaults to False if not provided. If set to True, `ak`, `sk`, and `token` will be ignored.
+            prefix (str, optional): Only objects whose key starts with this prefix are loaded. Defaults to "".
+
+        Raises:
+            ValueError: If the `esdk-obs-python` package is not installed.
+
+        Note:
+            Before using this class, make sure you have registered with OBS and have the necessary credentials. The `ak`, `sk`, and `endpoint` values are mandatory unless `get_token_from_ecs` is True or the bucket policy is public read. `token` is required when using temporary credentials.
+
+        Example:
+            To create a new OBSDirectoryLoader:
+            ```
+            config = {
+                "ak": "your-access-key",
+                "sk": "your-secret-key"
+            }
+            directory_loader = OBSDirectoryLoader("your-bucket-name", "your-endpoint", config, "your-prefix")
+            ```
+        """  # noqa: E501
+        try:
+            from obs import ObsClient
+        except ImportError:
+            raise ValueError(
+                "Could not import esdk-obs-python python package. "
+                "Please install it with `pip install esdk-obs-python`."
+            )
+        if not config:
+            config = dict()
+        if config.get("get_token_from_ecs"):
+            self.client = ObsClient(server=endpoint, security_provider_policy="ECS")
+        else:
+            self.client = ObsClient(
+                access_key_id=config.get("ak"),
+                secret_access_key=config.get("sk"),
+                security_token=config.get("token"),
+                server=endpoint,
+            )
+
+        self.bucket = bucket
+        self.prefix = prefix
+
+    def load(self) -> List[Document]:
+        """Load every object whose key starts with the configured prefix.
+
+        The bucket is listed page by page and each object is loaded through an
+        OBSFileLoader that reuses this loader's client.
+
+        Returns:
+            List[Document]: The documents of all loaded objects.
+
+        Raises:
+            RuntimeError: If a listing request is answered with an error status.
+        """
+        max_num = 1000  # page size requested from listObjects
+        mark = None  # pagination marker; None asks for the first page
+        docs: List[Document] = []
+        while True:
+            resp = self.client.listObjects(
+                self.bucket, prefix=self.prefix, marker=mark, max_keys=max_num
+            )
+            if resp.status >= 300:
+                # Fail loudly: a rejected listing used to be skipped silently,
+                # hiding errors such as bad credentials or a wrong bucket name.
+                raise RuntimeError(
+                    f"Failed to list objects in OBS bucket {self.bucket!r}: "
+                    f"status={resp.status}, "
+                    f"errorCode={getattr(resp, 'errorCode', None)}, "
+                    f"errorMessage={getattr(resp, 'errorMessage', None)}"
+                )
+            for content in resp.body.contents:
+                # Keys ending in "/" are folder placeholders without content.
+                if content.key.endswith("/"):
+                    continue
+                loader = OBSFileLoader(self.bucket, content.key, client=self.client)
+                docs.extend(loader.load())
+            if resp.body.is_truncated is True:
+                mark = resp.body.next_marker
+            else:
+                break
+        return docs
diff --git a/libs/langchain/langchain/document_loaders/obs_file.py b/libs/langchain/langchain/document_loaders/obs_file.py
new file mode 100644
index 000000000..0e5cdabcd
--- /dev/null
+++ b/libs/langchain/langchain/document_loaders/obs_file.py
@@ -0,0 +1,131 @@
+# coding:utf-8
+
+import os
+import tempfile
+from typing import Any, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+
+
+class OBSFileLoader(BaseLoader):
+    """Load a single object from a Huawei OBS bucket as documents."""
+
+    def __init__(
+        self,
+        bucket: str,
+        key: str,
+        client: Any = None,
+        endpoint: str = "",
+        config: Optional[dict] = None,
+    ) -> None:
+        """Initialize the OBSFileLoader with the specified settings.
+
+        Args:
+            bucket (str): The name of the OBS bucket to be used.
+            key (str): The name of the object in the OBS bucket.
+            client (ObsClient, optional): An instance of the ObsClient to connect to OBS.
+            endpoint (str, optional): The endpoint URL of your OBS bucket. This parameter is mandatory if `client` is not provided.
+            config (dict, optional): The parameters for connecting to OBS, provided as a dictionary. This parameter is ignored if `client` is provided. The dictionary could have the following keys:
+                - "ak" (str, optional): Your OBS access key (required if `get_token_from_ecs` is False and bucket policy is not public read).
+                - "sk" (str, optional): Your OBS secret key (required if `get_token_from_ecs` is False and bucket policy is not public read).
+                - "token" (str, optional): Your security token (required if using temporary credentials).
+                - "get_token_from_ecs" (bool, optional): Whether to retrieve the security token from ECS. Defaults to False if not provided. If set to True, `ak`, `sk`, and `token` will be ignored.
+
+        Raises:
+            ValueError: If the `esdk-obs-python` package is not installed.
+            TypeError: If the provided `client` is not an instance of ObsClient.
+            ValueError: If `client` is not provided, but `endpoint` is missing.
+
+        Note:
+            Before using this class, make sure you have registered with OBS and have the necessary credentials. The `ak`, `sk`, and `endpoint` values are mandatory unless `get_token_from_ecs` is True or the bucket policy is public read. `token` is required when using temporary credentials.
+
+        Example:
+            To create a new OBSFileLoader with a new client:
+            ```
+            config = {
+                "ak": "your-access-key",
+                "sk": "your-secret-key"
+            }
+            obs_loader = OBSFileLoader("your-bucket-name", "your-object-key", endpoint="your-endpoint-url", config=config)
+            ```
+
+            To create a new OBSFileLoader with an existing client:
+            ```
+            from obs import ObsClient
+
+            # Assuming you have an existing ObsClient object 'obs_client'
+            obs_loader = OBSFileLoader("your-bucket-name", "your-object-key", client=obs_client)
+            ```
+
+            To create a new OBSFileLoader for a publicly readable object:
+            ```
+            obs_loader = OBSFileLoader("your-bucket-name", "your-object-key", endpoint="your-endpoint-url")
+            ```
+        """  # noqa: E501
+        try:
+            from obs import ObsClient
+        except ImportError:
+            raise ValueError(
+                "Could not import esdk-obs-python python package. "
+                "Please install it with `pip install esdk-obs-python`."
+            )
+        if not client:
+            if not endpoint:
+                raise ValueError("Either OBSClient or endpoint must be provided.")
+            if not config:
+                config = dict()
+            if config.get("get_token_from_ecs"):
+                client = ObsClient(server=endpoint, security_provider_policy="ECS")
+            else:
+                client = ObsClient(
+                    access_key_id=config.get("ak"),
+                    secret_access_key=config.get("sk"),
+                    security_token=config.get("token"),
+                    server=endpoint,
+                )
+        if not isinstance(client, ObsClient):
+            raise TypeError("Client must be ObsClient type")
+        self.client = client
+        self.bucket = bucket
+        self.key = key
+
+    def load(self) -> List[Document]:
+        """Download the object to a temporary directory and parse it.
+
+        The file is parsed with UnstructuredFileLoader; the temporary copy is
+        removed when the method returns.
+
+        Returns:
+            List[Document]: The documents parsed from the downloaded object.
+
+        Raises:
+            ValueError: If the object key would resolve to a path outside the
+                temporary download directory.
+            RuntimeError: If OBS reports an error status for the download.
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            file_path = f"{temp_dir}/{self.bucket}/{self.key}"
+            # Object keys are arbitrary strings; refuse ones (e.g. containing
+            # "..") that would place the download outside temp_dir.
+            root = os.path.realpath(temp_dir)
+            if os.path.commonpath([root, os.path.realpath(file_path)]) != root:
+                raise ValueError(f"Unsafe OBS object key: {self.key!r}")
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            # Download the file to a destination
+            resp = self.client.downloadFile(
+                bucketName=self.bucket, objectKey=self.key, downloadFile=file_path
+            )
+            status = getattr(resp, "status", None)
+            if status is not None and status >= 300:
+                # Surface the OBS error instead of a confusing parse failure
+                # on a file that was never written.
+                raise RuntimeError(
+                    f"Failed to download {self.key!r} from OBS bucket "
+                    f"{self.bucket!r}: status={status}, "
+                    f"errorCode={getattr(resp, 'errorCode', None)}, "
+                    f"errorMessage={getattr(resp, 'errorMessage', None)}"
+                )
+            loader = UnstructuredFileLoader(file_path)
+            return loader.load()