diff --git a/examples/citation_with_extraction/README.md b/examples/citation_with_extraction/README.md new file mode 100644 index 0000000..9e379a2 --- /dev/null +++ b/examples/citation_with_extraction/README.md @@ -0,0 +1,68 @@ +# Citation with Extraction + +This repository contains a FastAPI application that uses GPT-4 to answer questions based on a given context and extract relevant facts with correct and exact citations. The extracted facts are returned as JSON events using Server-Sent Events (SSE). + +## How it Works + +The FastAPI app defines an endpoint `/extract` that accepts a POST request with JSON data containing a `context` and a `query`. The `context` represents the text from which the question is being asked, and the `query` is the question itself. + +The app leverages GPT-4, an advanced language model, to generate answers to the questions and extract relevant facts. It ensures that the extracted facts include direct quotes from the given context. + +## Example Usage + +To use the `/extract` endpoint, send a POST request with `curl` or any HTTP client with the following format: + +```bash +curl -X POST -H "Content-Type: application/json" -d '{ + "context": "My name is Jason Liu, and I grew up in Toronto Canada but I was born in China.I went to an arts highschool but in university I studied Computational Mathematics and physics. As part of coop I worked at many companies including Stitchfix, Facebook. I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.", + "query": "What did the author do in school?" +}' -N http://localhost:8000/extract +``` + +Replace `http://localhost:8000` with the actual URL of your FastAPI app if it's running on a different host and port. The API will respond with Server-Sent Events (SSE) containing the extracted facts in real-time. 
+ +## Bring your own API key + +If you have your own API key but don't want to try deploying it yourself, you're welcome to use my +modal instance here; this code is public and I do not store your key. + +```bash +curl -X 'POST' \ + 'https://jxnl--rag-citation-fastapi-app.modal.run/extract' \ + -H 'accept: */*' \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer ' \ + -d '{ + "context": "My name is Jason Liu, and I grew up in Toronto Canada but I was born in China.I went to an arts highschool but in university I studied Computational Mathematics and physics. As part of coop I worked at many companies including Stitchfix, Facebook. I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.", + "query": "What did the author do in school?" +}' +``` + + +## Requirements + +To run this application, ensure you have the following Python packages installed: + +```bash +pip install -r requirements.txt +``` + +## Running the App + +To run the FastAPI app, execute the following command: + +```bash +uvicorn main:app --reload +``` + +This will start the server, and the `/extract` endpoint will be available at `http://localhost:8000/extract`. + +## Note + +Ensure that you have a valid API key for GPT-4 from OpenAI. If you don't have one, you can obtain it from the OpenAI website. + +Please use this application responsibly and be mindful of any usage limits or restrictions from OpenAI's API usage policy. + +## License + +This project is licensed under the [MIT License](LICENSE). Feel free to use, modify, and distribute it as you see fit. 
\ No newline at end of file diff --git a/examples/citation_with_extraction/citation_fuzzy_match.py b/examples/citation_with_extraction/citation_fuzzy_match.py index 69f8c82..a746b02 100644 --- a/examples/citation_with_extraction/citation_fuzzy_match.py +++ b/examples/citation_with_extraction/citation_fuzzy_match.py @@ -92,10 +92,7 @@ def ask_ai(question: str, context: str) -> QuestionAnswer: question = "What did the author do during college?" context = """ -My name is Jason Liu, and I grew up in Toronto Canada but I was born in China. -I went to an arts highschool but in university I studied Computational Mathematics and physics. -As part of coop I worked at many companies including Stitchfix, Facebook. -I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years. +My name is Jason Liu, and I grew up in Toronto Canada but I was born in China.I went to an arts highschool but in university I studied Computational Mathematics and physics. As part of coop I worked at many companies including Stitchfix, Facebook. I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years. """ diff --git a/examples/citation_with_extraction/main.py b/examples/citation_with_extraction/main.py new file mode 100644 index 0000000..87441a1 --- /dev/null +++ b/examples/citation_with_extraction/main.py @@ -0,0 +1,124 @@ +from typing import Iterable, List +from fastapi import FastAPI, Request, HTTPException +from fastapi.params import Depends +from openai_function_call import MultiTask +from pydantic import BaseModel, Field +from starlette.responses import StreamingResponse + +import os +import openai + + +# FastAPI app +app = FastAPI( + title="Citation with Extraction", +) + + +class SubResponse(BaseModel): + """ + If there are multiple phrases with difference citations. Each one should be its own object. 
+ make sure to break them apart such that each one only uses a set of + sources that are relevant to it. + """ + + body: str = Field(..., description="Body of the sentences, as part of a response") + substring_quotes: List[str] = Field( + ..., + description="Each source should be a direct quote from the context, as a substring of the original content but should be a wide enough quote to capture the context of the quote. The citation should at least be long and capture the context and be a full sentence.", + ) + + def _get_span(self, quote, context): + import regex + + minor = quote + major = context + + errs_ = 0 + s = regex.search(f"({minor}){{e<={errs_}}}", major) + while s is None and errs_ <= len(context) * 0.05: + errs_ += 1 + s = regex.search(f"({minor}){{e<={errs_}}}", major) + + if s is not None: + yield from s.spans() + + def get_spans(self, context): + if self.substring_quotes: + for quote in self.substring_quotes: + yield from self._get_span(quote, context) + + +Answers = MultiTask( + SubResponse, + name="Answer", + description="Correctly answer questions based on a context. Quotes should be full sentences when possible", +) + + +class Question(BaseModel): + context: str = Field(..., description="Context to extract answers from") + query: str = Field(..., description="Question to answer") + + +# Function to extract entities from input text using GPT-3.5 +def stream_extract(question: Question) -> Iterable[SubResponse]: + completion = openai.ChatCompletion.create( + model="gpt-3.5-turbo-0613", + temperature=0, + stream=True, + functions=[Answers.openai_schema], + function_call={"name": Answers.openai_schema["name"]}, + messages=[ + { + "role": "system", + "content": f"You are a world class algorithm to answer questions with correct and exact citations. 
", + }, + {"role": "user", "content": f"Answer question using the following context"}, + {"role": "user", "content": f"{question.context}"}, + {"role": "user", "content": f"Question: {question.query}"}, + { + "role": "user", + "content": f"Tips: Make sure to cite your sources, and use the exact words from the context.", + }, + ], + max_tokens=2000, + ) + return Answers.from_streaming_response(completion) + + +def get_api_key(request: Request): + """ + This just gets the API key from the request headers. + but tries to read from the environment variable OPENAI_API_KEY first. + """ + if "OPENAI_API_KEY" in os.environ: + return os.environ["OPENAI_API_KEY"] + + auth = request.headers.get("Authorization") + if auth is None: + raise HTTPException(status_code=401, detail="Missing Authorization header") + + if auth.startswith("Bearer "): + return auth.replace("Bearer ", "") + + return None + + +# Route to handle SSE events and return users +@app.post("/extract", response_class=StreamingResponse) +async def extract(question: Question, openai_key=Depends(get_api_key)): + openai.api_key = openai_key + facts = stream_extract(question) + + async def generate(): + for fact in facts: + spans = list(fact.get_spans(question.context)) + resp = { + "body": fact.body, + "spans": spans, + "citation": [question.context[a:b] for (a, b) in spans], + } + yield f"data: {resp}" + + return StreamingResponse(generate(), media_type="text/event-stream") diff --git a/examples/citation_with_extraction/modal_main.py b/examples/citation_with_extraction/modal_main.py new file mode 100644 index 0000000..0733d8f --- /dev/null +++ b/examples/citation_with_extraction/modal_main.py @@ -0,0 +1,14 @@ +from main import app +import modal + +stub = modal.Stub("rag-citation") + +image = modal.Image.debian_slim().pip_install( + "fastapi", "openai_function_call>=0.2.1", "regex" +) + + +@stub.function(image=image) +@modal.asgi_app() +def fastapi_app(): + return app diff --git 
a/examples/citation_with_extraction/requirements.txt b/examples/citation_with_extraction/requirements.txt new file mode 100644 index 0000000..e62c77e --- /dev/null +++ b/examples/citation_with_extraction/requirements.txt @@ -0,0 +1,5 @@ +fastapi +uvicorn +openai +pydantic +openai_function_call \ No newline at end of file diff --git a/openai_function_call/dsl/multitask.py b/openai_function_call/dsl/multitask.py index f3ffef0..2bbdd8c 100644 --- a/openai_function_call/dsl/multitask.py +++ b/openai_function_call/dsl/multitask.py @@ -1,5 +1,5 @@ -from pydantic import create_model, Field -from typing import Optional, List, Type +from pydantic import BaseModel, create_model, Field +from typing import Optional, List, Type, Union from openai_function_call import OpenAISchema @@ -48,7 +48,7 @@ class MultiTaskBase: def MultiTask( - subtask_class: Type[OpenAISchema], + subtask_class: Type[BaseModel], name: Optional[str] = None, description: Optional[str] = None, ):