Spaces:
Runtime error
Runtime error
Update main.py
Browse files
main.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1 |
import os
|
2 |
import shutil
|
3 |
-
import torch
|
4 |
from flask import Flask, render_template, request, jsonify
|
5 |
-
from whoosh.index import create_in
|
6 |
from whoosh.fields import Schema, TEXT
|
7 |
from whoosh.qparser import QueryParser
|
8 |
from transformers import AutoTokenizer, AutoModel
|
@@ -14,6 +13,29 @@ PDF_DIRECTORY = 'data'
|
|
14 |
os.makedirs(PDF_DIRECTORY, exist_ok=True)
|
15 |
os.makedirs(PERSIST_DIR, exist_ok=True)
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
# Load the XLM-R tokenizer and model
|
18 |
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
|
19 |
model = AutoModel.from_pretrained("xlm-roberta-base")
|
@@ -21,10 +43,15 @@ model = AutoModel.from_pretrained("xlm-roberta-base")
|
|
21 |
# Setup Whoosh schema for indexing
|
22 |
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
|
23 |
|
24 |
-
# Create
|
25 |
-
|
26 |
-
os.
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
# Function to load documents from a directory
|
30 |
def load_documents():
|
@@ -34,6 +61,7 @@ def load_documents():
|
|
34 |
with open(os.path.join(PDF_DIRECTORY, filename), 'r', encoding='utf-8') as file:
|
35 |
content = file.read()
|
36 |
documents.append({'title': filename, 'content': content})
|
|
|
37 |
return documents
|
38 |
|
39 |
# Function to index documents
|
@@ -47,15 +75,19 @@ def index_documents(documents):
|
|
47 |
def data_ingestion_from_directory():
|
48 |
# Clear previous data by removing the persist directory
|
49 |
if os.path.exists(PERSIST_DIR):
|
50 |
-
shutil.rmtree(PERSIST_DIR)
|
51 |
|
52 |
-
# Recreate the persist directory after removal
|
53 |
os.makedirs(PERSIST_DIR, exist_ok=True)
|
54 |
|
55 |
# Load new documents from the directory
|
56 |
new_documents = load_documents()
|
|
|
|
|
|
|
57 |
|
58 |
-
#
|
|
|
|
|
59 |
index_documents(new_documents)
|
60 |
|
61 |
# Function to retrieve documents based on a query
|
@@ -66,7 +98,7 @@ def retrieve_documents(query):
|
|
66 |
results = searcher.search(query_object)
|
67 |
return [(result['title'], result['content']) for result in results]
|
68 |
|
69 |
-
# Function to generate embeddings
|
70 |
def get_embeddings(text):
|
71 |
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
|
72 |
with torch.no_grad():
|
|
|
1 |
import os
|
2 |
import shutil
|
|
|
3 |
from flask import Flask, render_template, request, jsonify
|
4 |
+
from whoosh.index import create_in, open_dir
|
5 |
from whoosh.fields import Schema, TEXT
|
6 |
from whoosh.qparser import QueryParser
|
7 |
from transformers import AutoTokenizer, AutoModel
|
|
|
13 |
os.makedirs(PDF_DIRECTORY, exist_ok=True)
|
14 |
os.makedirs(PERSIST_DIR, exist_ok=True)
|
15 |
|
16 |
+
# Load the XLM-R tokenizer and model
|
17 |
+
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
|
18 |
+
model
Here’s the complete corrected code for your Flask application, which uses the XLM-R model and Whoosh for indexing and makes sure the index is created properly. This should resolve the `EmptyIndexError` you encountered.
|
19 |
+
|
20 |
+
### Complete Code for RAG Chatbot Using XLM-R
|
21 |
+
|
22 |
+
```python
|
23 |
+
import os
|
24 |
+
import shutil
|
25 |
+
import torch
|
26 |
+
from flask import Flask, render_template, request, jsonify
|
27 |
+
from whoosh.index import create_in, open_dir
|
28 |
+
from whoosh.fields import Schema, TEXT
|
29 |
+
from whoosh.qparser import QueryParser
|
30 |
+
from transformers import AutoTokenizer, AutoModel
|
31 |
+
from deep_translator import GoogleTranslator
|
32 |
+
|
33 |
+
# Set up directories
|
34 |
+
PERSIST_DIR = "db"
|
35 |
+
PDF_DIRECTORY = 'data'
|
36 |
+
os.makedirs(PDF_DIRECTORY, exist_ok=True)
|
37 |
+
os.makedirs(PERSIST_DIR, exist_ok=True)
|
38 |
+
|
39 |
# Load the XLM-R tokenizer and model
|
40 |
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
|
41 |
model = AutoModel.from_pretrained("xlm-roberta-base")
|
|
|
43 |
# Setup Whoosh schema for indexing
|
44 |
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
|
45 |
|
46 |
+
# Create or open the Whoosh index
|
47 |
+
def create_index():
    """Return a usable Whoosh index stored in ``PERSIST_DIR``.

    An existing index is opened; otherwise a new, empty index is created
    using the module-level ``schema``.

    The decision is based on whether the directory actually *contains* an
    index, not on whether the directory exists: ``PERSIST_DIR`` is created
    unconditionally at import time, so a plain existence check would always
    choose ``open_dir`` and fail with ``EmptyIndexError`` on a fresh or
    freshly wiped directory.

    Returns:
        The Whoosh index object for ``PERSIST_DIR``.
    """
    # Imported here so the fix is self-contained and does not rely on the
    # file's top-level ``from whoosh.index import ...`` line being extended.
    from whoosh.index import exists_in

    # Harmless if the directory is already there; needed if it was removed.
    os.makedirs(PERSIST_DIR, exist_ok=True)

    if exists_in(PERSIST_DIR):
        return open_dir(PERSIST_DIR)
    return create_in(PERSIST_DIR, schema)
|
53 |
+
|
54 |
+
index = create_index()
|
55 |
|
56 |
# Function to load documents from a directory
|
57 |
def load_documents():
|
|
|
61 |
with open(os.path.join(PDF_DIRECTORY, filename), 'r', encoding='utf-8') as file:
|
62 |
content = file.read()
|
63 |
documents.append({'title': filename, 'content': content})
|
64 |
+
print(f"Loaded document: {filename}") # Debugging line
|
65 |
return documents
|
66 |
|
67 |
# Function to index documents
|
|
|
75 |
def data_ingestion_from_directory():
    """Rebuild the search index from scratch using the files in ``PDF_DIRECTORY``.

    The persist directory is wiped, a brand-new empty index is created in it
    and bound to the module-level ``index``, and then every document returned
    by ``load_documents()`` is passed to ``index_documents()``.

    If no documents are found, a message is printed and the function returns
    early; ``index`` still refers to a valid (empty) index in that case rather
    than to the files that were just deleted.
    """
    global index

    # Clear previous data by removing the persist directory, then recreate it.
    if os.path.exists(PERSIST_DIR):
        shutil.rmtree(PERSIST_DIR)
    os.makedirs(PERSIST_DIR, exist_ok=True)

    # The directory is known to be empty here, so always create a new index.
    # Opening it instead (as an "exists" check on the directory would lead to)
    # raises EmptyIndexError because there is nothing to open yet.
    index = create_in(PERSIST_DIR, schema)

    # Load new documents from the directory.
    new_documents = load_documents()
    if not new_documents:
        print("No documents found to index.")
        return

    index_documents(new_documents)
|
92 |
|
93 |
# Function to retrieve documents based on a query
|
|
|
98 |
results = searcher.search(query_object)
|
99 |
return [(result['title'], result['content']) for result in results]
|
100 |
|
101 |
+
# Function to generate embeddings (not used in this example, but can be utilized if needed)
|
102 |
def get_embeddings(text):
|
103 |
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
|
104 |
with torch.no_grad():
|