Gopikanth123 committed (verified)
Commit 3e21064 · 1 Parent(s): 610d318

Update main.py

Files changed (1):
  main.py  +42 -10
main.py CHANGED
@@ -1,8 +1,7 @@
 import os
 import shutil
-import torch
 from flask import Flask, render_template, request, jsonify
-from whoosh.index import create_in
+from whoosh.index import create_in, open_dir
 from whoosh.fields import Schema, TEXT
 from whoosh.qparser import QueryParser
 from transformers import AutoTokenizer, AutoModel
@@ -14,6 +13,29 @@ PDF_DIRECTORY = 'data'
 os.makedirs(PDF_DIRECTORY, exist_ok=True)
 os.makedirs(PERSIST_DIR, exist_ok=True)
 
+# Load the XLM-R tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+modelHere’s the complete corrected code for your Flask application that utilizes the XLM-R model and integrates Whoosh for indexing, ensuring that it handles the creation of indices properly. This should resolve the `EmptyIndexError` you encountered.
+
+### Complete Code for RAG Chatbot Using XLM-R
+
+```python
+import os
+import shutil
+import torch
+from flask import Flask, render_template, request, jsonify
+from whoosh.index import create_in, open_dir
+from whoosh.fields import Schema, TEXT
+from whoosh.qparser import QueryParser
+from transformers import AutoTokenizer, AutoModel
+from deep_translator import GoogleTranslator
+
+# Set up directories
+PERSIST_DIR = "db"
+PDF_DIRECTORY = 'data'
+os.makedirs(PDF_DIRECTORY, exist_ok=True)
+os.makedirs(PERSIST_DIR, exist_ok=True)
+
 # Load the XLM-R tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
 model = AutoModel.from_pretrained("xlm-roberta-base")
@@ -21,10 +43,15 @@ model = AutoModel.from_pretrained("xlm-roberta-base")
 # Setup Whoosh schema for indexing
 schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
 
-# Create an index in the persist directory
-if not os.path.exists(PERSIST_DIR):
-    os.mkdir(PERSIST_DIR)
-index = create_in(PERSIST_DIR, schema)
+# Create or open the Whoosh index
+def create_index():
+    if not os.path.exists(PERSIST_DIR):
+        os.makedirs(PERSIST_DIR)
+        return create_in(PERSIST_DIR, schema)
+    else:
+        return open_dir(PERSIST_DIR)
+
+index = create_index()
 
 # Function to load documents from a directory
 def load_documents():
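
Note that the new create_index() only tests for the directory, and PERSIST_DIR is already created unconditionally near the top of the file, so an empty db/ folder still falls through to open_dir() and can raise the EmptyIndexError mentioned in the pasted note above. A minimal sketch of a stricter guard using Whoosh's exists_in() check (an editorial illustration, not part of this commit):

```python
import os
from whoosh.index import create_in, open_dir, exists_in
from whoosh.fields import Schema, TEXT

PERSIST_DIR = "db"  # same directory name as in the commit
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))

def create_index():
    # exists_in() looks for actual index files, so an empty directory is
    # (re)initialised with create_in() instead of hitting EmptyIndexError.
    os.makedirs(PERSIST_DIR, exist_ok=True)
    if exists_in(PERSIST_DIR):
        return open_dir(PERSIST_DIR)
    return create_in(PERSIST_DIR, schema)
```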
@@ -34,6 +61,7 @@ def load_documents():
         with open(os.path.join(PDF_DIRECTORY, filename), 'r', encoding='utf-8') as file:
             content = file.read()
             documents.append({'title': filename, 'content': content})
+            print(f"Loaded document: {filename}")  # Debugging line
     return documents
 
 # Function to index documents
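
The body of index_documents() is not included in any hunk. For context, a typical Whoosh writer loop matching the title/content schema might look like the sketch below; it assumes the module-level index created by create_index() and is not the committed code:

```python
def index_documents(documents):
    # Open a writer on the shared index, add each document, then persist.
    writer = index.writer()
    for doc in documents:
        writer.add_document(title=doc['title'], content=doc['content'])
    writer.commit()
```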
@@ -47,15 +75,19 @@ def index_documents(documents):
 def data_ingestion_from_directory():
     # Clear previous data by removing the persist directory
     if os.path.exists(PERSIST_DIR):
-        shutil.rmtree(PERSIST_DIR)  # Remove the persist directory and all its contents
+        shutil.rmtree(PERSIST_DIR)
 
-    # Recreate the persist directory after removal
     os.makedirs(PERSIST_DIR, exist_ok=True)
 
     # Load new documents from the directory
     new_documents = load_documents()
+    if not new_documents:
+        print("No documents found to index.")
+        return
 
-    # Index the new documents
+    # Re-create index and index documents
+    global index
+    index = create_index()
     index_documents(new_documents)
 
 # Function to retrieve documents based on a query
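
None of the hunks show the Flask routes that tie ingestion and retrieval together. Purely for orientation, a hypothetical wiring could look like the following; the endpoint name, payload shape, and startup call are all assumptions:

```python
app = Flask(__name__)

@app.route("/query", methods=["POST"])
def handle_query():
    # Hypothetical endpoint: search the Whoosh index and return matches as JSON.
    user_query = request.json.get("query", "")
    matches = retrieve_documents(user_query)
    return jsonify([{"title": title, "content": content} for title, content in matches])

if __name__ == "__main__":
    data_ingestion_from_directory()  # build the index before serving requests
    app.run(debug=True)
```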
@@ -66,7 +98,7 @@ def retrieve_documents(query):
         results = searcher.search(query_object)
         return [(result['title'], result['content']) for result in results]
 
-# Function to generate embeddings
+# Function to generate embeddings (not used in this example, but can be utilized if needed)
 def get_embeddings(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
     with torch.no_grad():
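
The final hunk cuts off inside get_embeddings(), and hunk 1 removes the top-level import torch that torch.no_grad() relies on. A complete mean-pooling version of the function, shown only as an illustration of the usual transformers pattern (the pooling choice and the re-added import are assumptions):

```python
import torch

def get_embeddings(text):
    # Tokenize, run XLM-R without gradient tracking, and mean-pool token vectors.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # shape: (batch, hidden_size)
```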
 