# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
"""

# !pip install gradio requests transformers beautifulsoup4 python-docx torch

"""**Set Up the Environment:** Install the required libraries.

**Create the Gradio Frontend:** searching for articles, summarizing content, generating citations.
"""
import gradio as gr
import requests
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef API."""
    try:
        url = f"https://api.crossref.org/works?query={query}&rows={max_results}"
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            articles = []
            data = response.json()
            for item in data['message']['items']:
                title = item.get('title', ['No Title'])[0]
                doi = item.get('DOI', 'No DOI')
                link = f"https://doi.org/{doi}"
                articles.append({"title": title, "link": link})
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"
from bs4 import BeautifulSoup

def extract_text_from_html(url):
    """Extract text content from HTML page."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors
        soup = BeautifulSoup(response.text, 'html.parser')
        # This is a simplified example. You may need to adjust the selector based on the site structure.
        paragraphs = soup.find_all('p')
        text_content = "\n".join([para.get_text() for para in paragraphs])
        return text_content
    except Exception as e:
        return f"Error extracting text: {str(e)}"
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")

def summarize_article(article_text):
    """Summarize a given article's text."""
    try:
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."
        # Ensure the input text is not too long
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # Adjust max_length to control input size
            padding="max_length"
        )
        # Generate the summary
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,  # Limit the length of the output
            min_length=100,      # Set a minimum length for the output
            # length_penalty=1.0,        # Adjust length penalty to encourage longer output
            # no_repeat_ngram_size=3,    # Avoid repetition of phrases
            early_stopping=True
        )
        # Decode the output to get the summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"
# Load tokenizer and model
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")

def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation using the T5 or LED model."""
    try:
        # Prepare the input text with explicit and structured formatting
        input_text = (f"'{article_title}'\n"
                      f"{article_link}\n"
                      f"Include author names, publication date, title, journal name, and DOI if available.\n"
                      f"Generate a {citation_style} style citation for the article")
        # Tokenize the input
        inputs = tokenizer_t5(input_text, return_tensors="pt", truncation=True, padding=True)
        # Generate the citation
        outputs = model_t5.generate(**inputs, max_new_tokens=70)
        # Decode the output to text
        citation = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
        return citation, None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"
from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn

def create_thesis_document(title, summary, citations):
    """Create a Word document formatted like a PhD thesis."""
    # Initialize Document
    doc = Document()
    # Title Page
    doc.add_paragraph(title, style='Title').alignment = 1  # Center alignment
    doc.add_paragraph()  # Add empty line
    # Adding title page details
    doc.add_paragraph('Thesis', style='Heading 1').alignment = 1
    doc.add_paragraph('Author Name', style='Normal').alignment = 1
    doc.add_paragraph('University Name', style='Normal').alignment = 1
    doc.add_paragraph('Date', style='Normal').alignment = 1
    doc.add_page_break()
    # Summary Page
    doc.add_paragraph('Summary', style='Heading 1').alignment = 0  # Left alignment
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()
    # Citation Page
    doc.add_paragraph('Citations', style='Heading 1').alignment = 0
    for citation in citations:
        doc.add_paragraph(citation, style='Normal')
    file_path = "Research_Document.docx"
    doc.save(file_path)
    return file_path
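# Sketch of building the Word document on its own (the strings are placeholder content;
# note the function always writes Research_Document.docx to the working directory).
# path = create_thesis_document(
#     "Placeholder Topic",
#     "Placeholder summary paragraph.",
#     ["Placeholder citation 1.", "Placeholder citation 2."],
# )
# print("Saved to", path)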
def research_assistant(research_topic, citation_style):
    """Main function to search, summarize, and generate citations."""
    if not research_topic:
        return "Please enter a research topic.", ["No summaries generated."], ["No citations generated."], None
    # Character limit check
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", [], [], None
    # Search for related articles using CrossRef
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, [], [], None
    summaries = []
    citations = []
    article_content = ''
    for article in articles:
        try:
            # Fetching article content might not be feasible; consider using metadata
            article_content += f"{extract_text_from_html(article['link'])}.\n"  # Simplified; actual content may require other methods
            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")
    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)
    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, summaries, citations, file_path
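# End-to-end dry run outside Gradio (assumes network access and that both models loaded;
# the topic string is only an example).
# topic, summaries, citations, doc_path = research_assistant("transformer models", "APA")
# print(topic, doc_path)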
# Create Gradio Interface with download functionality
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:", placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)

gr_interface.launch(share=True)