tahirsher committed on
Commit
eba40a1
·
verified ·
1 Parent(s): df7ef14

Update docx

Browse files
Files changed (1) hide show
  1. docx +30 -31
docx CHANGED
@@ -6,25 +6,24 @@ from PyPDF2 import PdfReader
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
9
- from transformers import pipeline, AutoModel, AutoTokenizer
10
  import torch
11
 
12
- # Set up the page configuration as the first Streamlit command
13
  st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
14
 
15
  # Load the summarization pipeline model
@st.cache_resource
def load_summarization_pipeline():
    """Build the Hugging Face summarization pipeline.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``transformers`` "summarization" pipeline backed by the
        ``facebook/bart-large-cnn`` checkpoint.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")
20
 
21
  summarizer = load_summarization_pipeline()
22
 
23
- # List of Hugging Face PDF URLs
24
- PDF_URLS = [
25
- "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/administrator92ada0936848e501425591b4ad0cd417.pdf",
26
- "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/Pakistan%20Penal%20Code.pdf",
27
- ]
28
 
29
  # Helper function to convert Hugging Face blob URLs to direct download URLs
30
  def get_huggingface_raw_url(url):
@@ -32,25 +31,28 @@ def get_huggingface_raw_url(url):
32
  return url.replace("/blob/", "/resolve/")
33
  return url
34
 
35
- # Fetch and extract text from PDF files hosted on Hugging Face
def fetch_pdf_text_from_huggingface(urls, timeout=30):
    """Download each PDF in ``urls`` and return the concatenated page text.

    Args:
        urls: Iterable of PDF URLs. Each one is passed through
            ``get_huggingface_raw_url`` before it is requested.
        timeout: Seconds to wait for each HTTP request, so that a single
            unresponsive host cannot hang the app.

    Returns:
        The text of every page that yielded any text, joined in order.
        A PDF that cannot be fetched or parsed is reported with
        ``st.error`` and skipped; pages read before a parse failure are kept.
    """
    parts = []
    for url in urls:
        raw_url = get_huggingface_raw_url(url)
        try:
            response = requests.get(raw_url, timeout=timeout)
        except requests.RequestException as e:
            # Connection errors and timeouts used to propagate and abort the
            # whole run; report them like any other failed download instead.
            st.error(f"Failed to fetch PDF from URL: {url} - {e}")
            continue
        if response.status_code != 200:
            st.error(f"Failed to fetch PDF from URL: {url}")
            continue
        try:
            pdf_reader = PdfReader(BytesIO(response.content))
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text)
        except Exception as e:
            # Broad catch kept from the original: any parsing failure is
            # shown to the user instead of crashing the app.
            st.error(f"Failed to read PDF from URL {url}: {e}")
    return "".join(parts)
54
 
55
  # Split text into manageable chunks
56
  @st.cache_data
@@ -62,27 +64,27 @@ def get_text_chunks(text):
62
  # Initialize embedding function
63
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
64
 
65
- # Create a FAISS vector store with embeddings
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    The chunks are embedded with the module-level ``embedding_function``.
    The function is wrapped in ``st.cache_resource``.

    Args:
        text_chunks: List of text chunks to index.

    Returns:
        The FAISS vector store created from the chunks.
    """
    return FAISS.from_texts(text_chunks, embedding=embedding_function)
70
 
71
  # Generate summary based on the retrieved text
def generate_summary_with_huggingface(query, retrieved_text):
    """Summarize the retrieved context in light of the user's query.

    Args:
        query: The user's question; placed first in the model input.
        retrieved_text: Context text retrieved for the question.

    Returns:
        The ``summary_text`` of the first result produced by the
        module-level ``summarizer`` pipeline.
    """
    summarization_input = f"{query}\n\nRelated information:\n{retrieved_text}"

    # The model's input limit is measured in tokens, not characters. Cutting
    # the string at 1024 characters threw away most of the context the model
    # could still read, so let the pipeline's tokenizer truncate instead.
    summary = summarizer(
        summarization_input,
        truncation=True,
        max_length=500,
        min_length=50,
        do_sample=False,
    )
    return summary[0]["summary_text"]
83
 
84
  # Generate response for user query
def user_input(user_question, vector_store):
    """Answer ``user_question`` from the documents in ``vector_store``.

    Runs a similarity search for the question, joins the ``page_content``
    of the matching documents with single spaces, and passes the result to
    ``generate_summary_with_huggingface``.

    Args:
        user_question: The question typed by the user.
        vector_store: Object exposing ``similarity_search``.

    Returns:
        The generated summary text.
    """
    matches = vector_store.similarity_search(user_question)
    context_text = " ".join(match.page_content for match in matches)
    return generate_summary_with_huggingface(user_question, context_text)
@@ -90,13 +92,10 @@ def user_input(user_question, vector_store):
90
  # Main function to run the Streamlit app
91
  def main():
92
  st.title("📄 Gen AI Lawyers Guide")
93
-
94
- # Load documents from Hugging Face
95
- raw_text = fetch_pdf_text_from_huggingface(PDF_URLS)
96
  text_chunks = get_text_chunks(raw_text)
97
  vector_store = load_or_create_vector_store(text_chunks)
98
 
99
- # User question input
100
  user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
101
 
102
  if st.button("Get Response"):
@@ -108,4 +107,4 @@ def main():
108
  st.markdown(f"**🤖 AI:** {answer}")
109
 
110
  if __name__ == "__main__":
111
- main()
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
9
+ from transformers import pipeline
10
  import torch
11
 
12
+ # Set up the page configuration
13
  st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
14
 
15
  # Load the summarization pipeline model
@st.cache_resource
def load_summarization_pipeline():
    """Build the Hugging Face summarization pipeline.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``transformers`` "summarization" pipeline backed by the
        ``facebook/bart-large-cnn`` checkpoint.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")
20
 
21
  summarizer = load_summarization_pipeline()
22
 
23
+ # Dictionary of Hugging Face PDF URLs grouped by folders
24
+ PDF_FOLDERS = {
25
+ # Map each folder name to a list of PDF URLs, e.g. "folder_name": ["https://huggingface.co/.../blob/main/file.pdf"]
26
+ }
 
27
 
28
  # Helper function to convert Hugging Face blob URLs to direct download URLs
29
  def get_huggingface_raw_url(url):
 
31
  return url.replace("/blob/", "/resolve/")
32
  return url
33
 
34
+ # Fetch and extract text from all PDFs in specified folders
def fetch_pdf_text_from_folders(pdf_folders, timeout=30):
    """Download every PDF listed in ``pdf_folders`` and return their text.

    Args:
        pdf_folders: Mapping of folder name to an iterable of PDF URLs.
            Each URL is passed through ``get_huggingface_raw_url`` before
            it is requested.
        timeout: Seconds to wait for each HTTP request, so that a single
            unresponsive host cannot hang the app.

    Returns:
        One string containing, for each folder, a ``[Folder: <name>]``
        header line followed by the text of every page that yielded any
        text. A PDF that cannot be fetched or parsed is reported with
        ``st.error`` and skipped; pages read before a failure are kept.
    """
    # Collect pieces and join once instead of growing a string with ``+=``.
    parts = []
    for folder_name, urls in pdf_folders.items():
        parts.append(f"\n[Folder: {folder_name}]\n")
        for url in urls:
            raw_url = get_huggingface_raw_url(url)
            try:
                response = requests.get(raw_url, timeout=timeout)
                response.raise_for_status()
                pdf_reader = PdfReader(BytesIO(response.content))
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        parts.append(page_text)
            except requests.RequestException as e:
                # Network-level failures: bad status, connection error, timeout.
                st.error(f"Failed to fetch PDF from URL: {url} - {e}")
            except Exception as e:
                # Broad catch kept from the original: any parsing failure is
                # shown to the user instead of crashing the app.
                st.error(f"Failed to read PDF from URL {url}: {e}")
    return "".join(parts)
 
56
 
57
  # Split text into manageable chunks
58
  @st.cache_data
 
64
  # Initialize embedding function
65
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
66
 
67
+ # Create a FAISS vector store with embeddings, checking for empty chunks
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    The chunks are embedded with the module-level ``embedding_function``.
    The function is wrapped in ``st.cache_resource``.

    Args:
        text_chunks: List of text chunks to index.

    Returns:
        The FAISS vector store, or ``None`` (after showing an error in the
        UI) when ``text_chunks`` is empty.
    """
    if text_chunks:
        return FAISS.from_texts(text_chunks, embedding=embedding_function)

    # Nothing to index: tell the user and let the caller handle ``None``.
    st.error("No valid text chunks found to create a vector store. Please check your PDF URLs or file content.")
    return None
75
 
76
  # Generate summary based on the retrieved text
def generate_summary_with_huggingface(query, retrieved_text):
    """Summarize the retrieved context in light of the user's query.

    Args:
        query: The user's question; placed first in the model input.
        retrieved_text: Context text retrieved for the question.

    Returns:
        The ``summary_text`` of the first result produced by the
        module-level ``summarizer`` pipeline.
    """
    summarization_input = f"{query}\n\nRelated information:\n{retrieved_text}"

    # The model's input limit is measured in tokens, not characters. Cutting
    # the string at 1024 characters threw away most of the context the model
    # could still read, so let the pipeline's tokenizer truncate instead.
    summary = summarizer(
        summarization_input,
        truncation=True,
        max_length=500,
        min_length=50,
        do_sample=False,
    )
    return summary[0]["summary_text"]
83
 
84
  # Generate response for user query
def user_input(user_question, vector_store):
    """Answer ``user_question`` from the documents in ``vector_store``.

    When a store is available, runs a similarity search for the question,
    joins the ``page_content`` of the matching documents with single
    spaces, and passes the result to ``generate_summary_with_huggingface``.

    Args:
        user_question: The question typed by the user.
        vector_store: Object exposing ``similarity_search``, or ``None``
            when no store could be built.

    Returns:
        The generated summary text, or a fixed explanatory message when
        ``vector_store`` is ``None``.
    """
    if vector_store is None:
        return "Vector store is empty due to failed PDF loading or empty documents."

    matches = vector_store.similarity_search(user_question)
    context_text = " ".join(match.page_content for match in matches)
    return generate_summary_with_huggingface(user_question, context_text)
 
92
  # Main function to run the Streamlit app
93
  def main():
94
  st.title("📄 Gen AI Lawyers Guide")
95
+ raw_text = fetch_pdf_text_from_folders(PDF_FOLDERS)
 
 
96
  text_chunks = get_text_chunks(raw_text)
97
  vector_store = load_or_create_vector_store(text_chunks)
98
 
 
99
  user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
100
 
101
  if st.button("Get Response"):
 
107
  st.markdown(f"**🤖 AI:** {answer}")
108
 
109
  if __name__ == "__main__":
110
+ main()