cutechicken committed on
Commit fcd720a · verified · 1 Parent(s): d6a3ccb

Update app.py

Files changed (1)
  1. app.py +60 -119
app.py CHANGED
@@ -128,6 +128,9 @@ def find_relevant_context(query, top_k=3):
 
     return relevant_contexts
 
+def init_msg():
+    return "Analyzing file..."
+
 def analyze_file_content(content, file_type):
     """Analyze file content and return structural summary"""
     if file_type in ['parquet', 'csv']:
@@ -136,9 +139,9 @@ def analyze_file_content(content, file_type):
             header = lines[0]
             columns = header.count('|') - 1
             rows = len(lines) - 3
-            return f"📊 데이터셋 구조: {columns} 컬럼, {rows} 데이터"
+            return f"📊 Dataset Structure: {columns} columns, {rows} rows"
         except:
-            return "❌ 데이터셋 구조 분석 실패"
+            return "❌ Failed to analyze dataset structure"
 
     lines = content.split('\n')
     total_lines = len(lines)
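Note on the dataset branch above: the arithmetic assumes a pipe-delimited preview with a fixed layout. `header.count('|') - 1` equals the column count only when the header has both leading and trailing pipes, and `len(lines) - 3` lands on the data-row count only when the preview carries a header line, a separator line, and a trailing newline. A minimal sketch of that assumption (the sample table is illustrative, not from app.py):

    sample = "| a | b |\n|---|---|\n| 1 | 2 |\n| 3 | 4 |\n"
    lines = sample.split('\n')           # 5 entries; the trailing newline yields a final ''
    columns = lines[0].count('|') - 1    # 3 pipes -> 2 columns
    rows = len(lines) - 3                # 5 - header - separator - trailing '' = 2 data rows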
@@ -148,51 +151,11 @@ def analyze_file_content(content, file_type):
         functions = len([line for line in lines if 'def ' in line])
         classes = len([line for line in lines if 'class ' in line])
         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
-        return f"💻 코드 구조: {total_lines} (함수: {functions}, 클래스: {classes}, 임포트: {imports})"
+        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"
 
     paragraphs = content.count('\n\n') + 1
     words = len(content.split())
-    return f"📝 문서 구조: {total_lines}줄, {paragraphs}단락, {words}단어"
+    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, approximately {words} words"
-
-
-def extract_pdf_text_with_ocr(file_path):
-    try:
-        # Set the Poppler path
-        if platform.system() == 'Windows':
-            poppler_path = r"C:\Program Files\poppler-0.68.0\bin"
-        else:
-            poppler_path = None  # use the default path on Linux
-
-        # Convert the PDF pages to images
-        images = convert_from_path(
-            file_path,
-            poppler_path=poppler_path,
-            fmt='jpeg',
-            grayscale=False,
-            size=(1700, None)  # higher resolution
-        )
-
-        # Accumulated text for the whole document
-        text = ""
-
-        # Run OCR on each page
-        for i, image in enumerate(images):
-            try:
-                # OCR configuration
-                custom_config = r'--oem 3 --psm 6 -l kor+eng'
-                # Perform OCR
-                page_text = pytesseract.image_to_string(
-                    image,
-                    config=custom_config
-                )
-                text += f"\n--- 페이지 {i+1} ---\n{page_text}\n"
-            except Exception as e:
-                print(f"페이지 {i+1} OCR 오류: {str(e)}")
-                continue
-
-        return text
-    except Exception as e:
-        return f"PDF 텍스트 추출 오류: {str(e)}"
 
 def read_uploaded_file(file):
     if file is None:
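Worth noting: read_uploaded_file still calls extract_pdf_text_with_ocr as an unchanged context line in the final hunk below, so the deletion above leaves a dangling reference unless the helper is defined elsewhere in app.py. A minimal stand-in sketch, assuming pdf2image and pytesseract remain installed; the English page-marker string is illustrative, not from the commit:

    from pdf2image import convert_from_path
    import pytesseract

    def extract_pdf_text_with_ocr(file_path):
        text = ""
        for i, image in enumerate(convert_from_path(file_path, fmt='jpeg')):
            # Same Tesseract settings as the deleted helper: LSTM engine, block segmentation, Korean+English
            page_text = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l kor+eng')
            text += f"\n--- Page {i+1} ---\n{page_text}\n"
        return text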
@@ -200,62 +163,56 @@ def read_uploaded_file(file):
     try:
         file_ext = os.path.splitext(file.name)[1].lower()
 
-
-
-        # Parquet 파일 처리
+        # Parquet file processing
         if file_ext == '.parquet':
             try:
                 table = pq.read_table(file.name)
                 df = table.to_pandas()
 
-                content = f"📊 Parquet 파일 분석:\n\n"
-                content += f"1. 기본 정보:\n"
-                content += f"- 전체 수: {len(df):,}개\n"
-                content += f"- 전체 수: {len(df.columns)}개\n"
-                content += f"- 메모리 사용량: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
+                content = f"📊 Parquet File Analysis:\n\n"
+                content += f"1. Basic Information:\n"
+                content += f"- Total Rows: {len(df):,}\n"
+                content += f"- Total Columns: {len(df.columns)}\n"
+                content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
 
-                content += f"2. 컬럼 정보:\n"
+                content += f"2. Column Information:\n"
                 for col in df.columns:
                     content += f"- {col} ({df[col].dtype})\n"
 
-                content += f"\n3. 데이터 미리보기:\n"
-                # Render as a pipe table via tabulate
+                content += f"\n3. Data Preview:\n"
                 content += tabulate(df.head(5), headers='keys', tablefmt='pipe', showindex=False)
 
-                content += f"\n\n4. 결측치 정보:\n"
+                content += f"\n\n4. Missing Values:\n"
                 null_counts = df.isnull().sum()
                 for col, count in null_counts[null_counts > 0].items():
-                    content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
+                    content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
 
-                # Basic statistics for numeric columns
                 numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
                 if len(numeric_cols) > 0:
-                    content += f"\n5. 수치형 컬럼 통계:\n"
+                    content += f"\n5. Numeric Column Statistics:\n"
                     stats_df = df[numeric_cols].describe()
                     content += tabulate(stats_df, headers='keys', tablefmt='pipe')
 
                 return content, "parquet"
             except Exception as e:
-                return f"Parquet 파일 읽기 오류: {str(e)}", "error"
+                return f"Error reading Parquet file: {str(e)}", "error"
 
-        # PDF 파일 처리
+        # PDF file processing
         if file_ext == '.pdf':
             try:
                 pdf_reader = pypdf.PdfReader(file.name)
                 total_pages = len(pdf_reader.pages)
 
-                content = f"📑 PDF 문서 분석:\n\n"
-                content += f"1. 기본 정보:\n"
-                content += f"- 페이지 수: {total_pages}페이지\n"
+                content = f"📑 PDF Document Analysis:\n\n"
+                content += f"1. Basic Information:\n"
+                content += f"- Total Pages: {total_pages}\n"
 
-                # Extract metadata
                 if pdf_reader.metadata:
-                    content += "\n2. 메타데이터:\n"
+                    content += "\n2. Metadata:\n"
                     for key, value in pdf_reader.metadata.items():
                         if value and str(key).startswith('/'):
                             content += f"- {key[1:]}: {value}\n"
 
-                # Try text extraction with pdfminer first
                 try:
                     text = extract_text(
                         file.name,
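An aside on the Parquet branch above: it materializes the whole table before summarizing it. Where only the summary is needed, a lighter sketch (a suggestion, not part of this commit) reads the row count and column names from the file footer via the same pyarrow.parquet module the diff imports as pq:

    pf = pq.ParquetFile(file.name)
    num_rows = pf.metadata.num_rows        # row count from the footer, no data loaded
    column_names = pf.schema_arrow.names   # column names from the Arrow schema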
@@ -269,117 +226,101 @@ def read_uploaded_file(file):
                 except:
                     text = ""
 
-                # Fall back to OCR if pdfminer extraction fails
                 if not text.strip():
                     text = extract_pdf_text_with_ocr(file.name)
 
-                # Text analysis
                 if text:
                     words = text.split()
                     lines = text.split('\n')
-                    content += f"\n3. 텍스트 분석:\n"
-                    content += f"- 단어 수: {len(words):,}개\n"
-                    content += f"- 고유 단어 수: {len(set(words)):,}개\n"
-                    content += f"- 라인 수: {len(lines):,}개\n"
+                    content += f"\n3. Text Analysis:\n"
+                    content += f"- Total Words: {len(words):,}\n"
+                    content += f"- Unique Words: {len(set(words)):,}\n"
+                    content += f"- Total Lines: {len(lines):,}\n"
 
-                    # Body content
-                    content += f"\n4. 본문 내용:\n"
-                    preview_length = min(2000, len(text))  # longer preview
-                    content += f"--- 처음 {preview_length}자 ---\n"
+                    content += f"\n4. Content Preview:\n"
+                    preview_length = min(2000, len(text))
+                    content += f"--- First {preview_length} characters ---\n"
                     content += text[:preview_length]
                     if len(text) > preview_length:
-                        content += f"\n... ( {len(text):,} 중 일부 표시)\n"
+                        content += f"\n... (Showing partial content of {len(text):,} characters)\n"
                 else:
-                    content += "\n⚠️ 텍스트 추출 실패"
+                    content += "\n⚠️ Text extraction failed"
 
                 return content, "pdf"
             except Exception as e:
-                return f"PDF 파일 읽기 오류: {str(e)}", "error"
-
-
 
-        # CSV 파일 처리
+        # CSV file processing
+                return f"Error reading PDF file: {str(e)}", "error"
         elif file_ext == '.csv':
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     df = pd.read_csv(file.name, encoding=encoding)
-                    content = f"📊 CSV 파일 분석:\n\n"
-                    content += f"1. 기본 정보:\n"
-                    content += f"- 전체 수: {len(df):,}개\n"
-                    content += f"- 전체 수: {len(df.columns)}개\n"
-                    content += f"- 메모리 사용량: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
+                    content = f"📊 CSV File Analysis:\n\n"
+                    content += f"1. Basic Information:\n"
+                    content += f"- Total Rows: {len(df):,}\n"
+                    content += f"- Total Columns: {len(df.columns)}\n"
+                    content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
 
-                    content += f"2. 컬럼 정보:\n"
+                    content += f"2. Column Information:\n"
                     for col in df.columns:
                         content += f"- {col} ({df[col].dtype})\n"
 
-                    content += f"\n3. 데이터 미리보기:\n"
+                    content += f"\n3. Data Preview:\n"
                     content += df.head(5).to_markdown(index=False)
 
-                    content += f"\n\n4. 결측치 정보:\n"
+                    content += f"\n\n4. Missing Values:\n"
                     null_counts = df.isnull().sum()
                     for col, count in null_counts[null_counts > 0].items():
-                        content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
+                        content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
 
                     return content, "csv"
                 except UnicodeDecodeError:
                     continue
-            raise UnicodeDecodeError(f"지원되는 인코딩으로 파일을 읽을 없습니다 ({', '.join(encodings)})")
+            raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})")
 
-        # 텍스트 파일 처리
+        # Text file processing
         else:
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     with open(file.name, 'r', encoding=encoding) as f:
                         content = f.read()
-
-                    # Analyze the file contents
+
                     lines = content.split('\n')
                     total_lines = len(lines)
                     non_empty_lines = len([line for line in lines if line.strip()])
 
-                    # Check whether this is a code file
                     is_code = any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function'])
 
-                    analysis = f"\n📝 파일 분석:\n"
+                    analysis = f"\n📝 File Analysis:\n"
                     if is_code:
-                        # Code file analysis
                         functions = len([line for line in lines if 'def ' in line])
                         classes = len([line for line in lines if 'class ' in line])
                         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
 
-                        analysis += f"- 파일 유형: 코드\n"
-                        analysis += f"- 전체 라인 수: {total_lines:,}줄\n"
-                        analysis += f"- 함수 수: {functions}개\n"
-                        analysis += f"- 클래스 수: {classes}개\n"
-                        analysis += f"- import 수: {imports}개\n"
+                        analysis += f"- File Type: Code\n"
+                        analysis += f"- Total Lines: {total_lines:,}\n"
+                        analysis += f"- Functions: {functions}\n"
+                        analysis += f"- Classes: {classes}\n"
+                        analysis += f"- Import Statements: {imports}\n"
                     else:
-                        # Plain text file analysis
                         words = len(content.split())
                         chars = len(content)
 
-                        analysis += f"- 파일 유형: 텍스트\n"
-                        analysis += f"- 전체 라인 수: {total_lines:,}줄\n"
-                        analysis += f"- 실제 내용이 있는 라인 수: {non_empty_lines:,}줄\n"
-                        analysis += f"- 단어 수: {words:,}개\n"
-                        analysis += f"- 문자 수: {chars:,}개\n"
+                        analysis += f"- File Type: Text\n"
+                        analysis += f"- Total Lines: {total_lines:,}\n"
+                        analysis += f"- Non-empty Lines: {non_empty_lines:,}\n"
+                        analysis += f"- Word Count: {words:,}\n"
+                        analysis += f"- Character Count: {chars:,}\n"
 
                     return content + analysis, "text"
                 except UnicodeDecodeError:
                     continue
-            raise UnicodeDecodeError(f"지원되는 인코딩으로 파일을 읽을 없습니다 ({', '.join(encodings)})")
+            raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})")
 
     except Exception as e:
-        return f"파일 읽기 오류: {str(e)}", "error"
+        return f"Error reading file: {str(e)}", "error"
-
-
-
-
-# Modified file upload event handling
-def init_msg():
-    return "파일을 분석하고 있습니다..."
 
 
 CSS = """
 
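For context, init_msg (moved to the top of the file by this commit) is the placeholder shown while an upload is being analyzed, per the deleted trailing comment about file upload event handling. A hypothetical wiring, assuming a Gradio Blocks app; the component names are illustrative, not taken from app.py:

    import gradio as gr

    with gr.Blocks(css=CSS) as demo:
        status = gr.Textbox(label="Status")
        file_box = gr.File()
        # Show the placeholder immediately, then swap in the analysis text.
        file_box.upload(init_msg, None, status).then(
            lambda f: read_uploaded_file(f)[0], file_box, status
        )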