Implementation:
import streamlit as st
from dotenv import load_dotenv

load_dotenv(".env")


def main():
    st.set_page_config(page_title="Multi PDF Chatbot", page_icon="🔥")
    st.header("Ask questions from multiple PDFs")
    question = st.text_input("Enter your prompt: ")

    if question:
        # get_response_from_llm
        pass

    with st.sidebar:
        st.subheader("Upload PDFs")
        uploaded_files = st.file_uploader("Choose PDFs", type=["pdf"], accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing..."):
                # text = get_text_from_pdfs(uploaded_files)
                # extract chunks from text
                # store embeddings in vector store
                # create a chain with memory and pass nearby embeddings of the question to the LLM
                pass


if __name__ == "__main__":
    main()
The main() function is the entry point of the application. It sets the page configuration (title and icon) with st.set_page_config() and displays a header and a text input field where the user can enter their question or prompt. The code then checks whether a question has been entered. If one is present, it will eventually call a function (get_response_from_llm()) to obtain a response from a language model (LLM); that function is not implemented yet in this skeleton.
In the sidebar, the application lets users upload multiple PDF files with st.file_uploader(). When the "Process" button is clicked, a spinner is displayed to indicate that the files are being processed. In the full implementation we will extract text from the uploaded PDF files (get_text_from_pdfs()), split that text into chunks, store the embeddings (vector representations) of those chunks in a vector store, and create a chain with memory that passes the chunks closest to the user's question to the LLM.
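Before moving on, make sure the dependencies are installed. Judging from the imports used below, that is roughly pip install streamlit python-dotenv PyPDF2 langchain langchain-openai langchain-chroma (exact package names and versions may differ slightly depending on your LangChain release). The full implementation then looks like this: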
import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

load_dotenv(".env")
api_key = os.getenv("OPENAI_SECRET_KEY")


def get_text_from_pdfs(uploaded_files) -> str:
    # Concatenate the text of every page of every uploaded PDF into one string
    text = ""
    for uploaded_file in uploaded_files:
        pdf_reader = PdfReader(uploaded_file)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without extractable text
            text += page.extract_text() or ""
    return text


def get_chunks(text: str):
    # Respect line breaks where possible; otherwise split the text into chunks
    # of at most 200 characters with a 50-character overlap between chunks.
    # Returns a list of LangChain Document objects.
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=50, length_function=len)
    splits = text_splitter.split_text(text)
    return text_splitter.create_documents(splits)


def store_embeddings_in_vector_store(chunks):
    embeddings = OpenAIEmbeddings(api_key=api_key)
    vector_store = Chroma.from_documents(chunks, embeddings)
    return vector_store


def conversational_chain(vector_store):
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm=OpenAI(api_key=api_key),
        retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
        memory=memory,
        verbose=True,
    )
    return chain


def get_response_from_llm(question):
    if st.session_state.chain:
        response = st.session_state.chain({"question": question, "chat_history": st.session_state.chat_history})
        st.session_state.chat_history = response["chat_history"]
        st.write(response.get("answer"))
    else:
        st.write("Please upload and process PDF files first before asking a question.")


def main():
    st.set_page_config(page_title="Multi PDF Chatbot", page_icon="🔥")
    st.header("Ask questions from multiple PDFs")
    question = st.text_input("Enter your prompt: ")

    if "chain" not in st.session_state:
        st.session_state.chain = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    if question:
        get_response_from_llm(question)

    with st.sidebar:
        st.subheader("Upload PDFs")
        uploaded_files = st.file_uploader("Choose PDFs", type=["pdf"], accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing..."):
                text = get_text_from_pdfs(uploaded_files)
                # extract chunks from text
                chunks = get_chunks(text)
                st.write(chunks)  # show the chunks on the page (useful for debugging)
                # store embeddings in vector store
                vector_store = store_embeddings_in_vector_store(chunks)
                # create a chain with the retriever and the LLM
                st.session_state.chain = conversational_chain(vector_store)


if __name__ == "__main__":
    main()
The get_text_from_pdfs() function takes the list of uploaded PDF files and extracts the text from each page of each PDF, concatenating everything into a single string.
The get_chunks() function splits the extracted text into smaller, overlapping chunks, respecting line breaks, using a character-based text splitter.
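To get a feel for how this splitter behaves, here is a small standalone sketch (the sample text is made up); pieces that together fit within the 200-character limit are merged back into one chunk across line breaks:

from langchain.text_splitter import CharacterTextSplitter

sample = (
    "Invoices are issued at the start of each billing cycle.\n"
    "Refunds are processed within five business days.\n"
    "Contact support for anything related to chargebacks."
)
splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=50, length_function=len)
# Same two-step call as get_chunks() above
docs = splitter.create_documents(splitter.split_text(sample))
for doc in docs:
    print(len(doc.page_content), repr(doc.page_content))

With this tiny sample everything fits in a single chunk; real PDF text produces many overlapping chunks instead.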
The store_embeddings_in_vector_store() function creates embeddings (vector representations) of the text chunks using the OpenAI embeddings model and stores them in a Chroma vector store.
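The Chroma store created above lives in memory, so the embeddings are recomputed every time "Process" is clicked. Chroma can also persist to disk; here is a minimal sketch, reusing the imports and api_key from the script above and assuming an arbitrary local ./chroma_db directory:

def store_embeddings_in_vector_store(chunks, persist_directory="./chroma_db"):
    embeddings = OpenAIEmbeddings(api_key=api_key)
    # With a persist_directory, Chroma writes the collection to disk
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
    return vector_store


# On a later run, reload the persisted collection without re-embedding anything
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=OpenAIEmbeddings(api_key=api_key))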
The conversational_chain() function creates a ConversationalRetrievalChain, which combines a language model (OpenAI), a retriever over the vector store, and a conversation buffer memory, so the model can answer follow-up questions using the chunks retrieved from the vector store.
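Because the chain carries its own ConversationBufferMemory, follow-up questions can refer back to earlier turns. A quick sanity check outside Streamlit (assuming a vector_store already built with the functions above) might look like this:

chain = conversational_chain(vector_store)

first = chain({"question": "What topics do these documents cover?"})
print(first["answer"])

# The buffer memory has stored the first exchange, so this follow-up
# can say "those topics" without repeating them.
follow_up = chain({"question": "Can you expand on the first of those topics?"})
print(follow_up["answer"])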
The get_response_from_llm() function handles the user's question by passing it to the conversational chain and updating the conversation history kept in st.session_state. It displays the response from the language model, or prompts the user to upload and process PDF files first if the chain has not been initialized yet.
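To run the chatbot, save the script (for example as app.py; any filename works), put your OpenAI key in a .env file next to it as OPENAI_SECRET_KEY=<your key>, and launch it with streamlit run app.py. Upload one or more PDFs in the sidebar, click "Process", and then ask your questions in the text input at the top of the page.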