Spaces:

yasirme
/

RAG-retrieval

Running

App Files Files Community

yasirme committed 30 days ago

Commit

9772c46

1 Parent(s): 44870e3

1

Browse files

Files changed (4) hide show

app.py +16 -7
rag/RAG.py +3 -3
src/index.html +3 -2
utils/handle_file.py +102 -57

app.py CHANGED Viewed

@@ -9,17 +9,26 @@ app.config['MAX_CONTENT_LENGTH'] = 11 * 1024 * 1024
 def index():
     return send_file('src/index.html')
 @app.route('/upload', methods=['POST'])
 def upload():
     try:
-        if 'file' not in request.files:
-            return jsonify({"error": "No filet"}), 400
-        file = request.files['file']
-        if file.filename == '':
-            return jsonify({"error": "No selected file"}), 400
-        return file_handler.process_file(file)
     except Exception as e:
-        return {"error": f"an error occured: {e}"}, 500
 def main():
     app.run(host='0.0.0.0', port=7860, debug=True)

 def index():
     return send_file('src/index.html')
 @app.route('/upload', methods=['POST'])
 def upload():
     try:
+        allowed_chars = request.args.get('allowed_size')
+        if 'file' not in request.files and 'files' not in request.files:
+            return jsonify({"error": "No files uploaded"}), 400
+        if 'files' in request.files:
+            files = request.files.getlist('files')
+        else:
+            files = request.files.getlist('file')
+        if not files or not files[0].filename:
+            return jsonify({"error": "No files selected"}), 400
+        if len(files) == 1:
+            return file_handler.process_file(files[0], allowed_chars)
+        else:
+            return file_handler.process_files(files, allowed_chars)
     except Exception as e:
+        return jsonify({"error": f"An error occurred: {e}"}), 500
 def main():
     app.run(host='0.0.0.0', port=7860, debug=True)

rag/RAG.py CHANGED Viewed

@@ -10,7 +10,7 @@ client = genai.Client(api_key=os.getenv("api_key"))
 class RAG:
     def __init__(self):
-        self.CHUNK_SIZE = 800;
         self.CHUNK_OVERLAP = 75;
         self.MAX_BATCH_SIZE = 100;
         self.MODEL = "text-embedding-004";
@@ -19,8 +19,8 @@ class RAG:
     def split_text(self,text):
         try:
             return RecursiveCharacterTextSplitter(
-                chunk_size=512,
-                chunk_overlap=75,
                 separators=["\n\n", "\n", ".", "!", "?", "。", " ", ""]
             ).split_text(text)
         except Exception as e:

 class RAG:
     def __init__(self):
+        self.CHUNK_SIZE = 1024;
         self.CHUNK_OVERLAP = 75;
         self.MAX_BATCH_SIZE = 100;
         self.MODEL = "text-embedding-004";
     def split_text(self,text):
         try:
             return RecursiveCharacterTextSplitter(
+                chunk_size=self.CHUNK_SIZE,
+                chunk_overlap=self.CHUNK_OVERLAP,
                 separators=["\n\n", "\n", ".", "!", "?", "。", " ", ""]
             ).split_text(text)
         except Exception as e:

src/index.html CHANGED Viewed

@@ -3,10 +3,11 @@
   <head>
     <meta charset="UTF-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <title>Hello World</title>
   </head>
   <body>
-    <h1>Hello World</h1>
   </body>
 </html>

   <head>
     <meta charset="UTF-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title></title>
   </head>
   <body>
+    <h1>API key is not set</h1>
+    <h1>Clone this space and use your own gemini api key</h1>
   </body>
 </html>

utils/handle_file.py CHANGED Viewed

@@ -7,61 +7,79 @@ from rag.RAG import rag
 from openpyxl import load_workbook
 ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
-MAX_CHARS = 5000000
 class FileHandler:
     def __init__(self):
-        pass
-    def allowed_file(self, filename):
-        return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
-    def check_char_limit(self, text):
-        """Check if text exceeds the character limit"""
-        if len(text.strip()) > MAX_CHARS:
-            raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS} characters")
-        return text
-    def read_pdf(self, file):
-        text = ""
         try:
             with pdfplumber.open(file) as pdf:
                 for page in pdf.pages:
                     page_text = page.extract_text(layout=True)
                     if page_text:
                         text += page_text.strip()
-            text = self.check_char_limit(text)
-            return rag.generate_embedding(text.strip())
         except Exception as e:
-            raise ValueError(f"An error occurred while reading the PDF: {e}")
-    def read_txt(self, file):
         try:
             text = file.read().decode("utf-8")
-            text = self.check_char_limit(text)
-            return rag.generate_embedding(text.strip())
         except Exception as e:
-            raise ValueError(f"An error occurred while reading the TXT file: {e}")
-    def read_docx(self, file):
         try:
             doc = Document(file)
             text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
-            text = self.check_char_limit(text)
-            return rag.generate_embedding(text.strip())
         except Exception as e:
-            raise ValueError(f"An error occurred while reading the DOCX file: {e}")
-    def read_csv(self, file):
         try:
             df = pd.read_csv(file)
             text = df.to_string(index=False)
-            text = self.check_char_limit(text)
-            return rag.generate_embedding(text.strip())
         except Exception as e:
-            raise ValueError(f"An error occurred while reading the CSV file: {e}")
-    def read_excel(self, file):
         try:
             all_text = []
             workbook = load_workbook(filename=file)
@@ -77,44 +95,71 @@ class FileHandler:
                 all_text.append(sheet_text)
             text = "\n\n".join(all_text)
-            text = self.check_char_limit(text)
-            return rag.generate_embedding(text.strip())
         except Exception as e:
-            raise ValueError(f"An error occurred while reading the Excel file: {e}")
-    def read_json(self, file):
         try:
             data = json.load(file)
             text = json.dumps(data, indent=2)
-            text = self.check_char_limit(text)
-            return rag.generate_embedding(text.strip())
         except Exception as e:
-            raise ValueError(f"An error occurred while reading the JSON file: {e}")
-    def handle_file(self, file):
-        filename = file.filename.lower()
-        if filename.endswith('.pdf'):
-            return self.read_pdf(file)
-        elif filename.endswith('.txt'):
-            return self.read_txt(file)
-        elif filename.endswith('.docx'):
-            return self.read_docx(file)
-        elif filename.endswith('.csv'):
-            return self.read_csv(file)
-        elif filename.endswith(('.xlsx', '.xls')):
-            return self.read_excel(file)
-        elif filename.endswith('.json'):
-            return self.read_json(file)
-        else:
-            raise ValueError(f"Unsupported file type: {filename}")
-    def process_file(self, file):
         try:
-            if not self.allowed_file(file.filename):
-                return {"error": f"File type not allowed. Supported types: {', '.join(ALLOWED_EXTENSIONS)}"}, 400
-            return self.handle_file(file)
         except Exception as e:
-            return {"error": f"Error processing file: {e}"}, 400
 file_handler = FileHandler()

 from openpyxl import load_workbook
 ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
+MAX_CHARS_PER_FILE = 5000000  # 5 million characters per file limit
 class FileHandler:
     def __init__(self):
+        self.file_handlers = {
+            'pdf': self._read_pdf,
+            'txt': self._read_txt,
+            'docx': self._read_docx,
+            'csv': self._read_csv,
+            'xlsx': self._read_excel,
+            'xls': self._read_excel,
+            'json': self._read_json
+        }
+    def _validate_params(self, allowed_chars):
+        if not allowed_chars:
+            return None
+        try:
+            return int(allowed_chars)
+        except ValueError:
+            raise ValueError("allowed_size parameter must be an integer")
+    def _validate_file(self, file):
+        if not file or file.filename == '':
+            raise ValueError("No file selected")
+        extension = file.filename.rsplit('.', 1)[1].lower() if '.' in file.filename else ''
+        if extension not in ALLOWED_EXTENSIONS:
+            raise ValueError(f"File type not allowed. Supported types: {', '.join(ALLOWED_EXTENSIONS)}")
+        return extension
+    def _check_char_limit(self, text):
+        if len(text.strip()) > MAX_CHARS_PER_FILE:
+            raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS_PER_FILE} characters")
+        return text.strip()
+    def _read_pdf(self, file):
         try:
+            text = ""
             with pdfplumber.open(file) as pdf:
                 for page in pdf.pages:
                     page_text = page.extract_text(layout=True)
                     if page_text:
                         text += page_text.strip()
+            return self._check_char_limit(text)
         except Exception as e:
+            raise ValueError(f"Error reading PDF: {e}")
+    def _read_txt(self, file):
         try:
             text = file.read().decode("utf-8")
+            return self._check_char_limit(text)
         except Exception as e:
+            raise ValueError(f"Error reading TXT: {e}")
+    def _read_docx(self, file):
         try:
             doc = Document(file)
             text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
+            return self._check_char_limit(text)
         except Exception as e:
+            raise ValueError(f"Error reading DOCX: {e}")
+    def _read_csv(self, file):
         try:
             df = pd.read_csv(file)
             text = df.to_string(index=False)
+            return self._check_char_limit(text)
         except Exception as e:
+            raise ValueError(f"Error reading CSV: {e}")
+    def _read_excel(self, file):
         try:
             all_text = []
             workbook = load_workbook(filename=file)
                 all_text.append(sheet_text)
             text = "\n\n".join(all_text)
+            return self._check_char_limit(text)
         except Exception as e:
+            raise ValueError(f"Error reading Excel: {e}")
+    def _read_json(self, file):
         try:
             data = json.load(file)
             text = json.dumps(data, indent=2)
+            return self._check_char_limit(text)
         except Exception as e:
+            raise ValueError(f"Error reading JSON: {e}")
+    def read_file(self, file):
+        extension = self._validate_file(file)
+        return self.file_handlers[extension](file)
+    def process_file(self, file, allowed_chars):
         try:
+            allowed_limit = self._validate_params(allowed_chars)
+            content = self.read_file(file)
+            if allowed_limit and len(content) > allowed_limit:
+                return {"error": f"Character count ({len(content)}) exceeds the allowed limit ({allowed_limit})"}, 400
+            return rag.generate_embedding(content)
+        except ValueError as e:
+            return {"error": str(e)}, 400
+        except Exception as e:
+            return {"error": f"Unexpected error: {e}"}, 500
+    def process_files(self, files, allowed_chars):
+        try:
+            allowed_limit = self._validate_params(allowed_chars)
+            file_contents = []
+            total_chars = 0
+            for file in files:
+                try:
+                    content = self.read_file(file)
+                    file_contents.append((file.filename, content))
+                    total_chars += len(content)
+                except ValueError as e:
+                    return {"error": f"Error with file '{file.filename}': {str(e)}"}, 400
+            if allowed_limit and total_chars > allowed_limit:
+                return {"error": f"Total character count ({total_chars}) exceeds the allowed limit ({allowed_limit})"}, 400
+            results = []
+            for filename, content in file_contents:
+                embedding_result, status_code = rag.generate_embedding(content)
+                if status_code != 200:
+                    return embedding_result, status_code
+                results.append({
+                    "filename": filename,
+                    "char_count": len(content),
+                    "embeddings": embedding_result
+                })
+            return {
+                "total_char_count": total_chars,
+                "file_count": len(files),
+                "results": results
+            }, 200
+        except ValueError as e:
+            return {"error": str(e)}, 400
         except Exception as e:
+            return {"error": f"Unexpected error: {e}"}, 500
 file_handler = FileHandler()