diff --git a/docs/extras/integrations/document_loaders/grobid.ipynb b/docs/extras/integrations/document_loaders/grobid.ipynb index 96bf6b8dd..83ffffcc2 100644 --- a/docs/extras/integrations/document_loaders/grobid.ipynb +++ b/docs/extras/integrations/document_loaders/grobid.ipynb @@ -9,66 +9,16 @@ "\n", "GROBID is a machine learning library for extracting, parsing, and re-structuring raw documents.\n", "\n", - "It is particularly good for sturctured PDFs, like academic papers.\n", + "It is designed and expected to be used to parse academic papers, where it works particularly well. Note: if the articles supplied to Grobid are large documents (e.g. dissertations) exceeding a certain number of elements, they might not be processed. \n", "\n", - "This loader uses GROBIB to parse PDFs into `Documents` that retain metadata associated with the section of text.\n", + "This loader uses Grobid to parse PDFs into `Documents` that retain metadata associated with the section of text.\n", "\n", "---\n", + "The best approach is to install Grobid via docker, see https://grobid.readthedocs.io/en/latest/Grobid-docker/. 
\n", "\n", - "For users on `Mac` - \n", + "(Note: additional instructions can be found [here](https://python.langchain.com/docs/extras/integrations/providers/grobid.mdx).)\n", "\n", - "(Note: additional instructions can be found [here](https://python.langchain.com/docs/ecosystem/integrations/grobid.mdx).)\n", - "\n", - "Install Java (Apple Silicon):\n", - "```\n", - "$ arch -arm64 brew install openjdk@11\n", - "$ brew --prefix openjdk@11\n", - "/opt/homebrew/opt/openjdk@ 11\n", - "```\n", - "\n", - "In `~/.zshrc`:\n", - "```\n", - "export JAVA_HOME=/opt/homebrew/opt/openjdk@11\n", - "export PATH=$JAVA_HOME/bin:$PATH\n", - "```\n", - "\n", - "Then, in Terminal:\n", - "```\n", - "$ source ~/.zshrc\n", - "```\n", - "\n", - "Confirm install:\n", - "```\n", - "$ which java\n", - "/opt/homebrew/opt/openjdk@11/bin/java\n", - "$ java -version \n", - "openjdk version \"11.0.19\" 2023-04-18\n", - "OpenJDK Runtime Environment Homebrew (build 11.0.19+0)\n", - "OpenJDK 64-Bit Server VM Homebrew (build 11.0.19+0, mixed mode)\n", - "```\n", - "\n", - "Then, get [Grobid](https://grobid.readthedocs.io/en/latest/Install-Grobid/#getting-grobid):\n", - "```\n", - "$ curl -LO https://github.com/kermitt2/grobid/archive/0.7.3.zip\n", - "$ unzip 0.7.3.zip\n", - "```\n", - " \n", - "Build\n", - "```\n", - "$ ./gradlew clean install\n", - "```\n", - "\n", - "Then, run the server:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "2d8992fc", - "metadata": {}, - "outputs": [], - "source": [ - "! get_ipython().system_raw('nohup ./gradlew run > grobid.log 2>&1 &')" + "Once grobid is up-and-running you can interact as described below. 
\n" ] }, { diff --git a/docs/extras/integrations/providers/grobid.mdx b/docs/extras/integrations/providers/grobid.mdx index 6a24e68ba..4fd52abe2 100644 --- a/docs/extras/integrations/providers/grobid.mdx +++ b/docs/extras/integrations/providers/grobid.mdx @@ -1,22 +1,23 @@ # Grobid +GROBID is a machine learning library for extracting, parsing, and re-structuring raw documents. + +It is designed and expected to be used to parse academic papers, where it works particularly well. + +*Note*: if the articles supplied to Grobid are large documents (e.g. dissertations) exceeding a certain number +of elements, they might not be processed. + This page covers how to use the Grobid to parse articles for LangChain. -It is separated into two parts: installation and running the server -## Installation and Setup -#Ensure You have Java installed -!apt-get install -y openjdk-11-jdk -q -!update-alternatives --set java /usr/lib/jvm/java-11-openjdk-amd64/bin/java +## Installation +The grobid installation is described in detail at https://grobid.readthedocs.io/en/latest/Install-Grobid/. +However, it is probably easier and less troublesome to run grobid through a docker container, +as documented [here](https://grobid.readthedocs.io/en/latest/Grobid-docker/). -#Clone and install the Grobid Repo -import os -!git clone https://github.com/kermitt2/grobid.git -os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64" -os.chdir('grobid') -!./gradlew clean install +## Use Grobid with LangChain -#Run the server, -get_ipython().system_raw('nohup ./gradlew run > grobid.log 2>&1 &') +Once grobid is installed and up and running (you can check by accessing it at http://localhost:8070), +you're ready to go. 
You can now use the GrobidParser to produce documents ```python @@ -41,4 +42,5 @@ loader = GenericLoader.from_filesystem( ) docs = loader.load() ``` -Chunk metadata will include bboxes although these are a bit funky to parse, see https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/ +Chunk metadata will include Bounding Boxes. Although these are a bit funky to parse, +they are explained in https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/ \ No newline at end of file