Gopikanth123 committed (verified)
Commit 3e21064 · 1 Parent(s): 610d318

Update main.py

Files changed (1):
  main.py  +42 -10
main.py CHANGED
@@ -1,8 +1,7 @@
 import os
 import shutil
-import torch
 from flask import Flask, render_template, request, jsonify
-from whoosh.index import create_in
+from whoosh.index import create_in, open_dir
 from whoosh.fields import Schema, TEXT
 from whoosh.qparser import QueryParser
 from transformers import AutoTokenizer, AutoModel
@@ -14,6 +13,29 @@ PDF_DIRECTORY = 'data'
 os.makedirs(PDF_DIRECTORY, exist_ok=True)
 os.makedirs(PERSIST_DIR, exist_ok=True)
 
+# Load the XLM-R tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+modelHere’s the complete corrected code for your Flask application that utilizes the XLM-R model and integrates Whoosh for indexing, ensuring that it handles the creation of indices properly. This should resolve the `EmptyIndexError` you encountered.
+
+### Complete Code for RAG Chatbot Using XLM-R
+
+```python
+import os
+import shutil
+import torch
+from flask import Flask, render_template, request, jsonify
+from whoosh.index import create_in, open_dir
+from whoosh.fields import Schema, TEXT
+from whoosh.qparser import QueryParser
+from transformers import AutoTokenizer, AutoModel
+from deep_translator import GoogleTranslator
+
+# Set up directories
+PERSIST_DIR = "db"
+PDF_DIRECTORY = 'data'
+os.makedirs(PDF_DIRECTORY, exist_ok=True)
+os.makedirs(PERSIST_DIR, exist_ok=True)
+
 # Load the XLM-R tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
 model = AutoModel.from_pretrained("xlm-roberta-base")
@@ -21,10 +43,15 @@ model = AutoModel.from_pretrained("xlm-roberta-base")
 # Setup Whoosh schema for indexing
 schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
 
-# Create an index in the persist directory
-if not os.path.exists(PERSIST_DIR):
-    os.mkdir(PERSIST_DIR)
-index = create_in(PERSIST_DIR, schema)
+# Create or open the Whoosh index
+def create_index():
+    if not os.path.exists(PERSIST_DIR):
+        os.makedirs(PERSIST_DIR)
+        return create_in(PERSIST_DIR, schema)
+    else:
+        return open_dir(PERSIST_DIR)
+
+index = create_index()
 
 # Function to load documents from a directory
 def load_documents():
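
Note that the new create_index() only tests for the directory, and PERSIST_DIR is already created unconditionally near the top of the file, so an empty db/ folder still falls through to open_dir() and can raise the EmptyIndexError mentioned in the pasted note above. A minimal sketch of a stricter guard using Whoosh's exists_in() check (an editorial illustration, not part of this commit):

```python
import os
from whoosh.index import create_in, open_dir, exists_in
from whoosh.fields import Schema, TEXT

PERSIST_DIR = "db"  # same directory name as in the commit
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))

def create_index():
    # exists_in() looks for actual index files, so an empty directory is
    # (re)initialised with create_in() instead of hitting EmptyIndexError.
    os.makedirs(PERSIST_DIR, exist_ok=True)
    if exists_in(PERSIST_DIR):
        return open_dir(PERSIST_DIR)
    return create_in(PERSIST_DIR, schema)
```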
@@ -34,6 +61,7 @@ def load_documents():
         with open(os.path.join(PDF_DIRECTORY, filename), 'r', encoding='utf-8') as file:
             content = file.read()
             documents.append({'title': filename, 'content': content})
+            print(f"Loaded document: {filename}")  # Debugging line
     return documents
 
 # Function to index documents
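
The body of index_documents() is not included in any hunk. For context, a typical Whoosh writer loop matching the title/content schema might look like the sketch below; it assumes the module-level index created by create_index() and is not the committed code:

```python
def index_documents(documents):
    # Open a writer on the shared index, add each document, then persist.
    writer = index.writer()
    for doc in documents:
        writer.add_document(title=doc['title'], content=doc['content'])
    writer.commit()
```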
@@ -47,15 +75,19 @@ def index_documents(documents):
 def data_ingestion_from_directory():
     # Clear previous data by removing the persist directory
     if os.path.exists(PERSIST_DIR):
-        shutil.rmtree(PERSIST_DIR)  # Remove the persist directory and all its contents
+        shutil.rmtree(PERSIST_DIR)
 
-    # Recreate the persist directory after removal
     os.makedirs(PERSIST_DIR, exist_ok=True)
 
     # Load new documents from the directory
     new_documents = load_documents()
+    if not new_documents:
+        print("No documents found to index.")
+        return
 
-    # Index the new documents
+    # Re-create index and index documents
+    global index
+    index = create_index()
     index_documents(new_documents)
 
 # Function to retrieve documents based on a query
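
None of the hunks show the Flask routes that tie ingestion and retrieval together. Purely for orientation, a hypothetical wiring could look like the following; the endpoint name, payload shape, and startup call are all assumptions:

```python
app = Flask(__name__)

@app.route("/query", methods=["POST"])
def handle_query():
    # Hypothetical endpoint: search the Whoosh index and return matches as JSON.
    user_query = request.json.get("query", "")
    matches = retrieve_documents(user_query)
    return jsonify([{"title": title, "content": content} for title, content in matches])

if __name__ == "__main__":
    data_ingestion_from_directory()  # build the index before serving requests
    app.run(debug=True)
```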
@@ -66,7 +98,7 @@ def retrieve_documents(query):
         results = searcher.search(query_object)
         return [(result['title'], result['content']) for result in results]
 
-# Function to generate embeddings
+# Function to generate embeddings (not used in this example, but can be utilized if needed)
 def get_embeddings(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
     with torch.no_grad():
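
The final hunk cuts off inside get_embeddings(), and hunk 1 removes the top-level import torch that torch.no_grad() relies on. A complete mean-pooling version of the function, shown only as an illustration of the usual transformers pattern (the pooling choice and the re-added import are assumptions):

```python
import torch

def get_embeddings(text):
    # Tokenize, run XLM-R without gradient tracking, and mean-pool token vectors.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # shape: (batch, hidden_size)
```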
 