Implementation:
import streamlit as st
from dotenv import load_dotenv

load_dotenv(".env")


def main():
    st.set_page_config(page_title="Multi PDF Chatbot", page_icon="🔥")
    st.header("Ask questions from multiple PDFs")
    question = st.text_input("Enter your prompt: ")

    if question:
        # get_response_from_llm
        pass

    with st.sidebar:
        st.subheader("Upload PDFs")
        uploaded_files = st.file_uploader("Choose PDFs", type=["pdf"], accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing..."):
                # text = get_text_from_pdfs(uploaded_files)
                # extract chunks from text
                # store embeddings in vector store
                # create a chain with memory and pass nearby embeddings of the question to the LLM
                pass


if __name__ == "__main__":
    main()
The main() function is the entry point of the application. It sets the page configuration (title and icon) with st.set_page_config() and displays a header and a text input field where the user can enter their question or prompt. The code then checks whether a question has been entered. If one is present, it will eventually call a function (get_response_from_llm()) to obtain a response from a language model (LLM); that function is not implemented yet in this skeleton.
In the sidebar, the application lets users upload multiple PDF files with st.file_uploader(). When the "Process" button is clicked, a spinner is displayed to indicate that the files are being processed. In the full implementation we will extract text from the uploaded PDF files (get_text_from_pdfs()), split that text into chunks, store the embeddings (vector representations) of those chunks in a vector store, and create a chain with memory that passes the chunks closest to the user's question to the LLM.
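Before moving on, make sure the dependencies are installed. Judging from the imports used below, that is roughly pip install streamlit python-dotenv PyPDF2 langchain langchain-openai langchain-chroma (exact package names and versions may differ slightly depending on your LangChain release). The full implementation then looks like this: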
import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

load_dotenv(".env")
api_key = os.getenv("OPENAI_SECRET_KEY")


def get_text_from_pdfs(uploaded_files) -> str:
    # Concatenate the text of every page of every uploaded PDF into one string
    text = ""
    for uploaded_file in uploaded_files:
        pdf_reader = PdfReader(uploaded_file)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without extractable text
            text += page.extract_text() or ""
    return text


def get_chunks(text: str):
    # Respect line breaks where possible; otherwise split the text into chunks
    # of at most 200 characters with a 50-character overlap between chunks.
    # Returns a list of LangChain Document objects.
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=50, length_function=len)
    splits = text_splitter.split_text(text)
    return text_splitter.create_documents(splits)


def store_embeddings_in_vector_store(chunks):
    embeddings = OpenAIEmbeddings(api_key=api_key)
    vector_store = Chroma.from_documents(chunks, embeddings)
    return vector_store


def conversational_chain(vector_store):
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm=OpenAI(api_key=api_key),
        retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
        memory=memory,
        verbose=True,
    )
    return chain


def get_response_from_llm(question):
    if st.session_state.chain:
        response = st.session_state.chain({"question": question, "chat_history": st.session_state.chat_history})
        st.session_state.chat_history = response["chat_history"]
        st.write(response.get("answer"))
    else:
        st.write("Please upload and process PDF files first before asking a question.")


def main():
    st.set_page_config(page_title="Multi PDF Chatbot", page_icon="🔥")
    st.header("Ask questions from multiple PDFs")
    question = st.text_input("Enter your prompt: ")

    if "chain" not in st.session_state:
        st.session_state.chain = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    if question:
        get_response_from_llm(question)

    with st.sidebar:
        st.subheader("Upload PDFs")
        uploaded_files = st.file_uploader("Choose PDFs", type=["pdf"], accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing..."):
                text = get_text_from_pdfs(uploaded_files)
                # extract chunks from text
                chunks = get_chunks(text)
                st.write(chunks)  # show the chunks on the page (useful for debugging)
                # store embeddings in vector store
                vector_store = store_embeddings_in_vector_store(chunks)
                # create a chain with the retriever and the LLM
                st.session_state.chain = conversational_chain(vector_store)


if __name__ == "__main__":
    main()
The get_text_from_pdfs() function takes the list of uploaded PDF files and extracts the text from each page of each PDF, concatenating everything into a single string.
The get_chunks() function splits the extracted text into smaller, overlapping chunks, respecting line breaks, using a character-based text splitter.
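To get a feel for how this splitter behaves, here is a small standalone sketch (the sample text is made up); pieces that together fit within the 200-character limit are merged back into one chunk across line breaks:

from langchain.text_splitter import CharacterTextSplitter

sample = (
    "Invoices are issued at the start of each billing cycle.\n"
    "Refunds are processed within five business days.\n"
    "Contact support for anything related to chargebacks."
)
splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=50, length_function=len)
# Same two-step call as get_chunks() above
docs = splitter.create_documents(splitter.split_text(sample))
for doc in docs:
    print(len(doc.page_content), repr(doc.page_content))

With this tiny sample everything fits in a single chunk; real PDF text produces many overlapping chunks instead.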
The store_embeddings_in_vector_store() function creates embeddings (vector representations) of the text chunks using the OpenAI embeddings model and stores them in a Chroma vector store.
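The Chroma store created above lives in memory, so the embeddings are recomputed every time "Process" is clicked. Chroma can also persist to disk; here is a minimal sketch, reusing the imports and api_key from the script above and assuming an arbitrary local ./chroma_db directory:

def store_embeddings_in_vector_store(chunks, persist_directory="./chroma_db"):
    embeddings = OpenAIEmbeddings(api_key=api_key)
    # With a persist_directory, Chroma writes the collection to disk
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
    return vector_store


# On a later run, reload the persisted collection without re-embedding anything
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=OpenAIEmbeddings(api_key=api_key))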
The conversational_chain() function creates a ConversationalRetrievalChain, which combines a language model (OpenAI), a retriever over the vector store, and a conversation buffer memory, so the model can answer follow-up questions using the chunks retrieved from the vector store.
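Because the chain carries its own ConversationBufferMemory, follow-up questions can refer back to earlier turns. A quick sanity check outside Streamlit (assuming a vector_store already built with the functions above) might look like this:

chain = conversational_chain(vector_store)

first = chain({"question": "What topics do these documents cover?"})
print(first["answer"])

# The buffer memory has stored the first exchange, so this follow-up
# can say "those topics" without repeating them.
follow_up = chain({"question": "Can you expand on the first of those topics?"})
print(follow_up["answer"])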
The get_response_from_llm() function handles the user's question by passing it to the conversational chain and updating the conversation history kept in st.session_state. It displays the response from the language model, or prompts the user to upload and process PDF files first if the chain has not been initialized yet.
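To run the chatbot, save the script (for example as app.py; any filename works), put your OpenAI key in a .env file next to it as OPENAI_SECRET_KEY=<your key>, and launch it with streamlit run app.py. Upload one or more PDFs in the sidebar, click "Process", and then ask your questions in the text input at the top of the page.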