# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
"""

# !pip install gradio requests transformers beautifulsoup4 python-docx torch

"""**Set Up the Environment:** Install the required libraries.

**Create the Gradio Frontend:** searching for articles, summarizing content, generating citations.
"""
import gradio as gr
import requests
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef API."""
    try:
        url = f"https://api.crossref.org/works?query={query}&rows={max_results}"
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            articles = []
            data = response.json()
            for item in data['message']['items']:
                title = item.get('title', ['No Title'])[0]
                doi = item.get('DOI', 'No DOI')
                link = f"https://doi.org/{doi}"
                articles.append({"title": title, "link": link})
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"
from bs4 import BeautifulSoup

def extract_text_from_html(url):
    """Extract text content from HTML page."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors
        soup = BeautifulSoup(response.text, 'html.parser')
        # This is a simplified example. You may need to adjust the selector based on the site structure.
        paragraphs = soup.find_all('p')
        text_content = "\n".join([para.get_text() for para in paragraphs])
        return text_content
    except Exception as e:
        return f"Error extracting text: {str(e)}"
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")

def summarize_article(article_text):
    """Summarize a given article's text."""
    try:
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."
        # Ensure the input text is not too long
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # Adjust max_length to control input size
            padding="max_length"
        )
        # Generate the summary
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,  # Limit the length of the output
            min_length=100,      # Set a minimum length for the output
            # length_penalty=1.0,        # Adjust length penalty to encourage longer output
            # no_repeat_ngram_size=3,    # Avoid repetition of phrases
            early_stopping=True
        )
        # Decode the output to get the summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"
# Load tokenizer and model
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")

def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation using the T5 or LED model."""
    try:
        # Prepare the input text with explicit and structured formatting
        input_text = (f"'{article_title}'\n"
                      f"{article_link}\n"
                      f"Include author names, publication date, title, journal name, and DOI if available.\n"
                      f"Generate a {citation_style} style citation for the article")
        # Tokenize the input
        inputs = tokenizer_t5(input_text, return_tensors="pt", truncation=True, padding=True)
        # Generate the citation
        outputs = model_t5.generate(**inputs, max_new_tokens=70)
        # Decode the output to text
        citation = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
        return citation, None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"
from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn

def create_thesis_document(title, summary, citations):
    """Create a Word document formatted like a PhD thesis."""
    # Initialize Document
    doc = Document()
    # Title Page
    doc.add_paragraph(title, style='Title').alignment = 1  # Center alignment
    doc.add_paragraph()  # Add empty line
    # Adding title page details
    doc.add_paragraph('Thesis', style='Heading 1').alignment = 1
    doc.add_paragraph('Author Name', style='Normal').alignment = 1
    doc.add_paragraph('University Name', style='Normal').alignment = 1
    doc.add_paragraph('Date', style='Normal').alignment = 1
    doc.add_page_break()
    # Summary Page
    doc.add_paragraph('Summary', style='Heading 1').alignment = 0  # Left alignment
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()
    # Citation Page
    doc.add_paragraph('Citations', style='Heading 1').alignment = 0
    for citation in citations:
        doc.add_paragraph(citation, style='Normal')
    file_path = "Research_Document.docx"
    doc.save(file_path)
    return file_path
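# Sketch of building the Word document on its own (the strings are placeholder content;
# note the function always writes Research_Document.docx to the working directory).
# path = create_thesis_document(
#     "Placeholder Topic",
#     "Placeholder summary paragraph.",
#     ["Placeholder citation 1.", "Placeholder citation 2."],
# )
# print("Saved to", path)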
def research_assistant(research_topic, citation_style):
    """Main function to search, summarize, and generate citations."""
    if not research_topic:
        return "Please enter a research topic.", ["No summaries generated."], ["No citations generated."], None
    # Character limit check
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", [], [], None
    # Search for related articles using CrossRef
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, [], [], None
    summaries = []
    citations = []
    article_content = ''
    for article in articles:
        try:
            # Fetching article content might not be feasible; consider using metadata
            article_content += f"{extract_text_from_html(article['link'])}.\n"  # Simplified; actual content may require other methods
            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")
    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)
    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, summaries, citations, file_path
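# End-to-end dry run outside Gradio (assumes network access and that both models loaded;
# the topic string is only an example).
# topic, summaries, citations, doc_path = research_assistant("transformer models", "APA")
# print(topic, doc_path)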
# Create Gradio Interface with download functionality
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:", placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)

gr_interface.launch(share=True)