awacke1 committed on
Commit
ad5d2a3
·
verified ·
1 Parent(s): 81a7060

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -88
app.py CHANGED
@@ -309,92 +309,80 @@ def parse_arxiv_refs(ref_text: str):
309
  Returns list of dicts with paper details, limited to 20 papers.
310
  Returns empty list if parsing fails.
311
  """
312
- if not ref_text:
313
- return []
314
-
315
- # Split on the paper header pattern
316
- papers = re.split(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
317
- headers = re.findall(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
318
-
319
- results = []
320
- for i, (header, content) in enumerate(zip(headers, papers[1:])):
321
- if i >= 20: # Limit to 20 papers
322
- break
323
-
324
- # Parse header parts
325
- header_parts = [p.strip() for p in header.strip('*').split('|')]
326
- if len(header_parts) >= 2:
327
- date_str = header_parts[0].strip()
328
- title = header_parts[1].strip()
329
-
330
- # Parse content into authors and summary
331
- content_parts = content.strip().split('\n', 1)
332
- authors = content_parts[0].strip('*') if content_parts else ""
333
- summary = content_parts[1].strip() if len(content_parts) > 1 else ""
334
-
335
- # Extract year from date
336
- year_match = re.search(r'20\d{2}', date_str)
337
- year = int(year_match.group(0)) if year_match else None
338
 
339
- results.append({
340
- 'title': title,
341
- 'summary': summary,
342
- 'authors': authors,
343
- 'year': year,
344
- 'date': date_str
345
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
348
  titles_summary=True, full_audio=False):
349
  """Perform Arxiv search and generate audio summaries."""
350
  start = time.time()
351
 
352
- # 🎯 1) Query the HF RAG pipeline
353
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
354
  refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
355
  r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
356
 
357
- # 🎯 2) Combine for final text output
358
  result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
359
  st.markdown(result)
360
 
361
- # 🎯 3) Generate "all at once" audio if requested
362
- if full_audio:
363
- complete_text = f"Complete response for query: {q}. {clean_for_speech(r2)} {clean_for_speech(refs)}"
364
- audio_file_full = speak_with_edge_tts(complete_text)
365
- st.write("### 📚 Full Audio")
366
- play_and_download_audio(audio_file_full)
367
-
368
- if vocal_summary:
369
- main_text = clean_for_speech(r2)
370
- audio_file_main = speak_with_edge_tts(main_text)
371
- st.write("### 🎙 Short Audio")
372
- play_and_download_audio(audio_file_main)
373
-
374
- if extended_refs:
375
- summaries_text = "Extended references: " + refs.replace('"','')
376
- summaries_text = clean_for_speech(summaries_text)
377
- audio_file_refs = speak_with_edge_tts(summaries_text)
378
- st.write("### 📜 Long Refs")
379
- play_and_download_audio(audio_file_refs)
380
-
381
- # --------------------------------------
382
- # NEW: Parse references, show sorted list
383
- # --------------------------------------
384
  parsed_refs = parse_arxiv_refs(refs)
 
 
 
 
385
 
386
- # Sort by year descending (put None at bottom)
387
- # If you want to skip older than 2022, you can filter them:
388
- # parsed_refs = [r for r in parsed_refs if (r["year"] is not None and r["year"] >= 2022)]
389
- parsed_refs.sort(key=lambda x: x["year"] if x["year"] else 0, reverse=True)
390
-
391
- st.write("## Individual Papers (Most Recent First)")
392
  for idx, paper in enumerate(parsed_refs):
393
- year_str = paper["year"] if paper["year"] else "Unknown Year"
394
- st.markdown(f"**{idx+1}. {paper['title']}** \n*Year:* {year_str}")
395
- st.markdown(f"*Summary:* {paper['summary']}")
396
 
397
- # Two new TTS buttons: Title only or Title+Summary
398
  colA, colB = st.columns(2)
399
  with colA:
400
  if st.button(f"🔊 Title", key=f"title_{idx}"):
@@ -403,34 +391,17 @@ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
403
  play_and_download_audio(audio_file_title)
404
 
405
  with colB:
406
- if st.button(f"🔊 Title+Summary", key=f"summary_{idx}"):
407
- text_tts = clean_for_speech(paper['title'] + ". " + paper['summary'])
408
  audio_file_title_summary = speak_with_edge_tts(text_tts)
409
  play_and_download_audio(audio_file_title_summary)
410
 
411
  st.write("---")
412
 
413
- # Keep your original block for "Titles Only" if you want:
414
- if titles_summary:
415
- # This is your existing code block
416
- titles = []
417
- for line in refs.split('\n'):
418
- m = re.search(r"\[([^\]]+)\]", line)
419
- if m:
420
- titles.append(m.group(1))
421
- if titles:
422
- titles_text = "Titles: " + ", ".join(titles)
423
- titles_text = clean_for_speech(titles_text)
424
- audio_file_titles = speak_with_edge_tts(titles_text)
425
- st.write("### 🔖 Titles (All-In-One)")
426
- play_and_download_audio(audio_file_titles)
427
-
428
  elapsed = time.time()-start
429
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
430
-
431
- # Always create a file with the result
432
  create_file(q, result, "md")
433
-
434
  return result
435
 
436
  def process_with_gpt(text):
 
309
  Returns list of dicts with paper details, limited to 20 papers.
310
  Returns empty list if parsing fails.
311
  """
312
+ try:
313
+ if not ref_text:
314
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
+ # Split on the paper header pattern
317
+ papers = re.split(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
318
+ headers = re.findall(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
319
+
320
+ results = []
321
+ for i, (header, content) in enumerate(zip(headers, papers[1:])):
322
+ if i >= 20: # Limit to 20 papers
323
+ break
324
+
325
+ try:
326
+ # Parse header parts
327
+ header_parts = [p.strip() for p in header.strip('*').split('|')]
328
+ if len(header_parts) >= 2:
329
+ date_str = header_parts[0].strip()
330
+ title = header_parts[1].strip()
331
+
332
+ # Parse content into authors and summary
333
+ content_parts = content.strip().split('\n', 1)
334
+ authors = content_parts[0].strip('*') if content_parts else ""
335
+ summary = content_parts[1].strip() if len(content_parts) > 1 else ""
336
+
337
+ # Extract year from date
338
+ year_match = re.search(r'20\d{2}', date_str)
339
+ year = int(year_match.group(0)) if year_match else None
340
+
341
+ results.append({
342
+ 'title': title,
343
+ 'summary': summary,
344
+ 'authors': authors,
345
+ 'year': year,
346
+ 'date': date_str
347
+ })
348
+ except Exception as e:
349
+ st.warning(f"Error parsing paper {i+1}: {str(e)}")
350
+ continue
351
+
352
+ return results
353
+ except Exception as e:
354
+ st.error(f"Error parsing papers: {str(e)}")
355
+ return []
356
 
357
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
358
  titles_summary=True, full_audio=False):
359
  """Perform Arxiv search and generate audio summaries."""
360
  start = time.time()
361
 
362
+ # Query the HF RAG pipeline
363
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
364
  refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
365
  r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
366
 
367
+ # Combine for final text output
368
  result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
369
  st.markdown(result)
370
 
371
+ # Parse references
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  parsed_refs = parse_arxiv_refs(refs)
373
+
374
+ # Sort only if we have results
375
+ if parsed_refs:
376
+ parsed_refs.sort(key=lambda x: x.get("year", 0) if x.get("year") else 0, reverse=True)
377
 
378
+ # Display papers
379
+ st.write("## Research Papers")
 
 
 
 
380
  for idx, paper in enumerate(parsed_refs):
381
+ st.markdown(f"**{paper['date']} | {paper['title']} | ⬇️**")
382
+ st.markdown(f"*{paper['authors']}*")
383
+ st.markdown(paper['summary'])
384
 
385
+ # Audio controls
386
  colA, colB = st.columns(2)
387
  with colA:
388
  if st.button(f"🔊 Title", key=f"title_{idx}"):
 
391
  play_and_download_audio(audio_file_title)
392
 
393
  with colB:
394
+ if st.button(f"🔊 Full Details", key=f"summary_{idx}"):
395
+ text_tts = clean_for_speech(f"{paper['title']} by {paper['authors']}. {paper['summary']}")
396
  audio_file_title_summary = speak_with_edge_tts(text_tts)
397
  play_and_download_audio(audio_file_title_summary)
398
 
399
  st.write("---")
400
 
401
+ # Rest of your existing function...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  elapsed = time.time()-start
403
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
 
 
404
  create_file(q, result, "md")
 
405
  return result
406
 
407
  def process_with_gpt(text):