yasirme committed on
Commit
9772c46
·
1 Parent(s): 44870e3
Files changed (4) hide show
  1. app.py +16 -7
  2. rag/RAG.py +3 -3
  3. src/index.html +3 -2
  4. utils/handle_file.py +102 -57
app.py CHANGED
@@ -9,17 +9,26 @@ app.config['MAX_CONTENT_LENGTH'] = 11 * 1024 * 1024
9
  def index():
10
  return send_file('src/index.html')
11
 
 
12
  @app.route('/upload', methods=['POST'])
13
  def upload():
14
  try:
15
- if 'file' not in request.files:
16
- return jsonify({"error": "No filet"}), 400
17
- file = request.files['file']
18
- if file.filename == '':
19
- return jsonify({"error": "No selected file"}), 400
20
- return file_handler.process_file(file)
 
 
 
 
 
 
 
 
21
  except Exception as e:
22
- return {"error": f"an error occured: {e}"}, 500
23
 
24
  def main():
25
  app.run(host='0.0.0.0', port=7860, debug=True)
 
9
  def index():
10
  return send_file('src/index.html')
11
 
12
+
13
  @app.route('/upload', methods=['POST'])
14
  def upload():
15
  try:
16
+ allowed_chars = request.args.get('allowed_size')
17
+ if 'file' not in request.files and 'files' not in request.files:
18
+ return jsonify({"error": "No files uploaded"}), 400
19
+ if 'files' in request.files:
20
+ files = request.files.getlist('files')
21
+ else:
22
+ files = request.files.getlist('file')
23
+ if not files or not files[0].filename:
24
+ return jsonify({"error": "No files selected"}), 400
25
+ if len(files) == 1:
26
+ return file_handler.process_file(files[0], allowed_chars)
27
+ else:
28
+ return file_handler.process_files(files, allowed_chars)
29
+
30
  except Exception as e:
31
+ return jsonify({"error": f"An error occurred: {e}"}), 500
32
 
33
  def main():
34
  app.run(host='0.0.0.0', port=7860, debug=True)
rag/RAG.py CHANGED
@@ -10,7 +10,7 @@ client = genai.Client(api_key=os.getenv("api_key"))
10
 
11
  class RAG:
12
  def __init__(self):
13
- self.CHUNK_SIZE = 800;
14
  self.CHUNK_OVERLAP = 75;
15
  self.MAX_BATCH_SIZE = 100;
16
  self.MODEL = "text-embedding-004";
@@ -19,8 +19,8 @@ class RAG:
19
  def split_text(self,text):
20
  try:
21
  return RecursiveCharacterTextSplitter(
22
- chunk_size=512,
23
- chunk_overlap=75,
24
  separators=["\n\n", "\n", ".", "!", "?", "。", " ", ""]
25
  ).split_text(text)
26
  except Exception as e:
 
10
 
11
  class RAG:
12
  def __init__(self):
13
+ self.CHUNK_SIZE = 1024;
14
  self.CHUNK_OVERLAP = 75;
15
  self.MAX_BATCH_SIZE = 100;
16
  self.MODEL = "text-embedding-004";
 
19
  def split_text(self,text):
20
  try:
21
  return RecursiveCharacterTextSplitter(
22
+ chunk_size=self.CHUNK_SIZE,
23
+ chunk_overlap=self.CHUNK_OVERLAP,
24
  separators=["\n\n", "\n", ".", "!", "?", "。", " ", ""]
25
  ).split_text(text)
26
  except Exception as e:
src/index.html CHANGED
@@ -3,10 +3,11 @@
3
  <head>
4
  <meta charset="UTF-8" />
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
- <title>Hello World</title>
7
  </head>
8
 
9
  <body>
10
- <h1>Hello World</h1>
 
11
  </body>
12
  </html>
 
3
  <head>
4
  <meta charset="UTF-8" />
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title></title>
7
  </head>
8
 
9
  <body>
10
+ <h1>API key is not set</h1>
11
+ <h1>Clone this space and use your own gemini api key</h1>
12
  </body>
13
  </html>
utils/handle_file.py CHANGED
@@ -7,61 +7,79 @@ from rag.RAG import rag
7
  from openpyxl import load_workbook
8
 
9
  ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
10
- MAX_CHARS = 5000000
11
 
12
  class FileHandler:
13
  def __init__(self):
14
- pass
 
 
 
 
 
 
 
 
15
 
16
- def allowed_file(self, filename):
17
- return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
18
-
19
- def check_char_limit(self, text):
20
- """Check if text exceeds the character limit"""
21
- if len(text.strip()) > MAX_CHARS:
22
- raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS} characters")
23
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- def read_pdf(self, file):
26
- text = ""
27
  try:
 
28
  with pdfplumber.open(file) as pdf:
29
  for page in pdf.pages:
30
  page_text = page.extract_text(layout=True)
31
  if page_text:
32
  text += page_text.strip()
33
- text = self.check_char_limit(text)
34
- return rag.generate_embedding(text.strip())
35
  except Exception as e:
36
- raise ValueError(f"An error occurred while reading the PDF: {e}")
37
 
38
- def read_txt(self, file):
39
  try:
40
  text = file.read().decode("utf-8")
41
- text = self.check_char_limit(text)
42
- return rag.generate_embedding(text.strip())
43
  except Exception as e:
44
- raise ValueError(f"An error occurred while reading the TXT file: {e}")
45
 
46
- def read_docx(self, file):
47
  try:
48
  doc = Document(file)
49
  text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
50
- text = self.check_char_limit(text)
51
- return rag.generate_embedding(text.strip())
52
  except Exception as e:
53
- raise ValueError(f"An error occurred while reading the DOCX file: {e}")
54
 
55
- def read_csv(self, file):
56
  try:
57
  df = pd.read_csv(file)
58
  text = df.to_string(index=False)
59
- text = self.check_char_limit(text)
60
- return rag.generate_embedding(text.strip())
61
  except Exception as e:
62
- raise ValueError(f"An error occurred while reading the CSV file: {e}")
63
 
64
- def read_excel(self, file):
65
  try:
66
  all_text = []
67
  workbook = load_workbook(filename=file)
@@ -77,44 +95,71 @@ class FileHandler:
77
  all_text.append(sheet_text)
78
 
79
  text = "\n\n".join(all_text)
80
- text = self.check_char_limit(text)
81
- return rag.generate_embedding(text.strip())
82
  except Exception as e:
83
- raise ValueError(f"An error occurred while reading the Excel file: {e}")
84
 
85
- def read_json(self, file):
86
  try:
87
  data = json.load(file)
88
  text = json.dumps(data, indent=2)
89
- text = self.check_char_limit(text)
90
- return rag.generate_embedding(text.strip())
91
  except Exception as e:
92
- raise ValueError(f"An error occurred while reading the JSON file: {e}")
93
 
94
- def handle_file(self, file):
95
- filename = file.filename.lower()
96
-
97
- if filename.endswith('.pdf'):
98
- return self.read_pdf(file)
99
- elif filename.endswith('.txt'):
100
- return self.read_txt(file)
101
- elif filename.endswith('.docx'):
102
- return self.read_docx(file)
103
- elif filename.endswith('.csv'):
104
- return self.read_csv(file)
105
- elif filename.endswith(('.xlsx', '.xls')):
106
- return self.read_excel(file)
107
- elif filename.endswith('.json'):
108
- return self.read_json(file)
109
- else:
110
- raise ValueError(f"Unsupported file type: {filename}")
111
 
112
- def process_file(self, file):
113
  try:
114
- if not self.allowed_file(file.filename):
115
- return {"error": f"File type not allowed. Supported types: {', '.join(ALLOWED_EXTENSIONS)}"}, 400
116
- return self.handle_file(file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  except Exception as e:
118
- return {"error": f"Error processing file: {e}"}, 400
119
 
120
  file_handler = FileHandler()
 
7
  from openpyxl import load_workbook
8
 
9
  ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
10
+ MAX_CHARS_PER_FILE = 5000000 # 5 million characters per file limit
11
 
12
  class FileHandler:
13
  def __init__(self):
14
+ self.file_handlers = {
15
+ 'pdf': self._read_pdf,
16
+ 'txt': self._read_txt,
17
+ 'docx': self._read_docx,
18
+ 'csv': self._read_csv,
19
+ 'xlsx': self._read_excel,
20
+ 'xls': self._read_excel,
21
+ 'json': self._read_json
22
+ }
23
 
24
+ def _validate_params(self, allowed_chars):
25
+ if not allowed_chars:
26
+ return None
27
+
28
+ try:
29
+ return int(allowed_chars)
30
+ except ValueError:
31
+ raise ValueError("allowed_size parameter must be an integer")
32
+
33
+ def _validate_file(self, file):
34
+ if not file or file.filename == '':
35
+ raise ValueError("No file selected")
36
+
37
+ extension = file.filename.rsplit('.', 1)[1].lower() if '.' in file.filename else ''
38
+ if extension not in ALLOWED_EXTENSIONS:
39
+ raise ValueError(f"File type not allowed. Supported types: {', '.join(ALLOWED_EXTENSIONS)}")
40
+ return extension
41
+
42
+ def _check_char_limit(self, text):
43
+ if len(text.strip()) > MAX_CHARS_PER_FILE:
44
+ raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS_PER_FILE} characters")
45
+ return text.strip()
46
 
47
+ def _read_pdf(self, file):
 
48
  try:
49
+ text = ""
50
  with pdfplumber.open(file) as pdf:
51
  for page in pdf.pages:
52
  page_text = page.extract_text(layout=True)
53
  if page_text:
54
  text += page_text.strip()
55
+ return self._check_char_limit(text)
 
56
  except Exception as e:
57
+ raise ValueError(f"Error reading PDF: {e}")
58
 
59
+ def _read_txt(self, file):
60
  try:
61
  text = file.read().decode("utf-8")
62
+ return self._check_char_limit(text)
 
63
  except Exception as e:
64
+ raise ValueError(f"Error reading TXT: {e}")
65
 
66
+ def _read_docx(self, file):
67
  try:
68
  doc = Document(file)
69
  text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
70
+ return self._check_char_limit(text)
 
71
  except Exception as e:
72
+ raise ValueError(f"Error reading DOCX: {e}")
73
 
74
+ def _read_csv(self, file):
75
  try:
76
  df = pd.read_csv(file)
77
  text = df.to_string(index=False)
78
+ return self._check_char_limit(text)
 
79
  except Exception as e:
80
+ raise ValueError(f"Error reading CSV: {e}")
81
 
82
+ def _read_excel(self, file):
83
  try:
84
  all_text = []
85
  workbook = load_workbook(filename=file)
 
95
  all_text.append(sheet_text)
96
 
97
  text = "\n\n".join(all_text)
98
+ return self._check_char_limit(text)
 
99
  except Exception as e:
100
+ raise ValueError(f"Error reading Excel: {e}")
101
 
102
+ def _read_json(self, file):
103
  try:
104
  data = json.load(file)
105
  text = json.dumps(data, indent=2)
106
+ return self._check_char_limit(text)
 
107
  except Exception as e:
108
+ raise ValueError(f"Error reading JSON: {e}")
109
 
110
+ def read_file(self, file):
111
+ extension = self._validate_file(file)
112
+ return self.file_handlers[extension](file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
+ def process_file(self, file, allowed_chars):
115
  try:
116
+ allowed_limit = self._validate_params(allowed_chars)
117
+ content = self.read_file(file)
118
+ if allowed_limit and len(content) > allowed_limit:
119
+ return {"error": f"Character count ({len(content)}) exceeds the allowed limit ({allowed_limit})"}, 400
120
+ return rag.generate_embedding(content)
121
+
122
+ except ValueError as e:
123
+ return {"error": str(e)}, 400
124
+ except Exception as e:
125
+ return {"error": f"Unexpected error: {e}"}, 500
126
+
127
+ def process_files(self, files, allowed_chars):
128
+ try:
129
+ allowed_limit = self._validate_params(allowed_chars)
130
+ file_contents = []
131
+ total_chars = 0
132
+ for file in files:
133
+ try:
134
+ content = self.read_file(file)
135
+ file_contents.append((file.filename, content))
136
+ total_chars += len(content)
137
+ except ValueError as e:
138
+ return {"error": f"Error with file '{file.filename}': {str(e)}"}, 400
139
+ if allowed_limit and total_chars > allowed_limit:
140
+ return {"error": f"Total character count ({total_chars}) exceeds the allowed limit ({allowed_limit})"}, 400
141
+
142
+ results = []
143
+ for filename, content in file_contents:
144
+ embedding_result, status_code = rag.generate_embedding(content)
145
+ if status_code != 200:
146
+ return embedding_result, status_code
147
+
148
+ results.append({
149
+ "filename": filename,
150
+ "char_count": len(content),
151
+ "embeddings": embedding_result
152
+ })
153
+
154
+ return {
155
+ "total_char_count": total_chars,
156
+ "file_count": len(files),
157
+ "results": results
158
+ }, 200
159
+
160
+ except ValueError as e:
161
+ return {"error": str(e)}, 400
162
  except Exception as e:
163
+ return {"error": f"Unexpected error: {e}"}, 500
164
 
165
  file_handler = FileHandler()