Shirish15's picture
Update app.py
016f3e1 verified
import gradio as gr
from io import BytesIO
import docx
import re
# Regexes for in-text citations, each applied to the whole document text.
# Every pattern has exactly one capture group: the citation itself, without
# any surrounding parentheses. They are matched case-insensitively, so
# "[A-Z]" also accepts lowercase words; _NON_CITATION below removes the most
# common false positives this causes (e.g. "in 2020").
_CITATION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\(([A-Z][a-zA-Z]+ et al\.?,? \d{4}[a-zA-Z]?)\)',  # (Author et al., date)
        r'\(([A-Z][a-zA-Z]+ and [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)\)',  # (Author and Author, date)
        r'\(([A-Z][a-zA-Z]+ & [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)\)',  # (Author & Author, date)
        r'\(([A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)\)',  # (Author, date)
        r'([A-Z][a-zA-Z]+ et al\.?,? \d{4}[a-zA-Z]?)',  # Author et al., date (outside parentheses)
        r'([A-Z][a-zA-Z]+ and [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)',  # Author and Author, date (outside parentheses)
        r'([A-Z][a-zA-Z]+ & [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)',  # Author & Author, date (outside parentheses)
        r'([A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)',  # Author, date (general)
        r'\(([A-Z][a-zA-Z]+ and [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',  # (Author and Author date)
        r'\(([A-Z][a-zA-Z]+ & [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',  # (Author & Author date)
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?; [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',  # (Author date; Author date)
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?; [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?; [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',  # (Author date; Author date; Author date)
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',  # (Author date, Author date, Author date)
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',  # (Author date, Author date)
        r'\(([A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?)\)',  # (Author, date, Author, date, Author, date)
        r'\(([A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?)\)',  # (Author, date, Author, date)
    )
)

# Matches that are really "<ordinary word or month> <year>", not citations.
_NON_CITATION = re.compile(
    r"^(and|between|by|from|in|to|is|for|period|around|January|February|March|April|May|June|July|August|September|October|November|December) \d{4}[a-z]?$",
    re.IGNORECASE,
)

# A paragraph longer than this is treated as body text, not as a heading.
_MAX_HEADING_WORDS = 4


def _find_citations(text):
    """Return the sorted, de-duplicated citations found in *text*."""
    found = set()
    for pattern in _CITATION_PATTERNS:
        # One capture group per pattern, so findall() yields plain strings.
        found.update(pattern.findall(text))
    return sorted(c for c in found if c and not _NON_CITATION.match(c))


def _is_reference_heading(paragraph_text):
    """Return True when the paragraph looks like the "References" heading.

    Only short, standalone lines qualify, so a body sentence that merely
    mentions "references" does not start the reference section.
    """
    stripped = paragraph_text.strip()
    return (
        "references" in stripped.lower()
        and len(stripped.split()) <= _MAX_HEADING_WORDS
        and not stripped.endswith(".")
    )


def _find_references(paragraph_texts):
    """Return the sorted, de-duplicated entries that follow the heading."""
    entries = set()
    in_reference_section = False
    for paragraph_text in paragraph_texts:
        if _is_reference_heading(paragraph_text):
            in_reference_section = True
            continue  # The heading itself is not an entry.
        entry = paragraph_text.strip()
        if in_reference_section and entry:  # Skip blank spacer paragraphs.
            entries.add(entry)
    return sorted(entries)


def extract_citations_and_references(doc_file):
    """Extract in-text citations and reference-list entries from a DOCX file.

    Args:
        doc_file: Either a path to the ``.docx`` file (``str``) or an object
            whose ``read()`` method returns the file's bytes.

    Returns:
        A ``(citations, references, error)`` tuple. On success ``citations``
        and ``references`` are sorted lists of unique strings and ``error``
        is ``None``. If the file cannot be opened, both lists are empty and
        ``error`` is a human-readable message.
    """
    if doc_file is None:
        return [], [], "Error opening DOCX file: no file was provided"

    try:
        if isinstance(doc_file, str):  # Path to file
            document = docx.Document(doc_file)
        else:  # File-like object
            document = docx.Document(BytesIO(doc_file.read()))
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and any failure to
        # open the upload should be reported as a message, not a crash.
        return [], [], f"Error opening DOCX file: {e}"

    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    full_text = "".join(f"{paragraph_text}\n" for paragraph_text in paragraph_texts)
    return _find_citations(full_text), _find_references(paragraph_texts), None
def extract_author_from_citation(citation):
    """Return the first author's name from an in-text citation string.

    The citation is cut at the first delimiter that can follow the first
    author: a comma or semicolon, " et al.", " and ", " & ", or the year
    itself (for comma-less citations such as "Smith 2020"). Delimiters are
    matched case-insensitively.

    Args:
        citation: Citation text such as "Smith et al., 2020".

    Returns:
        The text before the first delimiter, stripped of surrounding
        whitespace. If no delimiter is present the whole citation is
        returned (stripped).
    """
    # re.split always returns at least one element, so [0] is safe even for
    # an empty string or a citation without any delimiter.
    first_part = re.split(
        r",|;| et al\.?| and | & | \d{4}[a-zA-Z]?",
        citation,
        maxsplit=1,
        flags=re.IGNORECASE,
    )[0]
    return first_part.strip()
def _author_tokens(name):
    """Return the lowercase alphabetic words of *name* as a frozenset."""
    return frozenset(re.findall(r"[a-z]+", name.lower()))


def _authors_match(left, right):
    """Return True when one non-empty word set is contained in the other.

    Whole words are compared, so "Dijk" matches "Van Dijk" but "Li" does
    not match "Oliver".
    """
    return bool(left) and bool(right) and (left <= right or right <= left)


def compare_citations_references(citations, references):
    """Cross-check in-text citations against reference-list entries by author.

    Args:
        citations: Citation strings, e.g. "Smith et al., 2020".
        references: Reference-list entries. The first author is taken from
            the start of each entry, up to the first comma.

    Returns:
        A ``(missing_citations, extra_references)`` tuple. The first list
        holds citations whose author matches no reference; the second holds
        references whose first author matches no citation. Both keep the
        order of their input. References whose first author cannot be
        parsed are never reported as extra, because they cannot be matched
        reliably.
    """
    # (reference, first-author words) for each reference with a parseable author.
    parsed_references = []
    for reference in references:
        match = re.match(
            r"([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)?),", reference.lstrip()
        )
        if match:
            parsed_references.append((reference, _author_tokens(match.group(1))))

    cited_authors = [
        _author_tokens(extract_author_from_citation(citation))
        for citation in citations
    ]

    missing_citations = [
        citation
        for citation, cited in zip(citations, cited_authors)
        if not any(_authors_match(cited, ref) for _, ref in parsed_references)
    ]
    extra_references = [
        reference
        for reference, ref in parsed_references
        if not any(_authors_match(cited, ref) for cited in cited_authors)
    ]
    return missing_citations, extra_references
def analyze_document(doc_file):
    """Analyze an uploaded DOCX file and format the results for display.

    Args:
        doc_file: The uploaded file, passed straight through to
            ``extract_citations_and_references``.

    Returns:
        A tuple of five strings, one per output text box of the interface:
        extracted citations, extracted references, missing citations,
        uncited references, and missing citations (author match). If the
        document cannot be read, the first string is the error message and
        the remaining four are empty.
    """
    citations, references, error = extract_citations_and_references(doc_file)
    if error:
        # The interface declares five outputs, so the error path must also
        # return five values.
        return error, "", "", "", ""

    missing, extra = compare_citations_references(citations, references)
    missing_text = "\n".join(missing)
    return (
        "Extracted Citations:\n" + "\n".join(citations),
        "Extracted References:\n" + "\n".join(references),
        "Missing Citations:\n" + missing_text,
        # Show only the uncited references, not the full reference list.
        "Extra References:\n" + "\n".join(extra),
        "Missing Citations (Author Match):\n" + missing_text,
    )
# Gradio interface: a single DOCX upload feeds analyze_document, and each of
# the five strings it returns is shown in its own labelled text box.
iface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(file_types=[".docx"]),
    outputs=[
        gr.Textbox(label=caption)
        for caption in (
            "Citations",
            "References",
            "Missing Citations",
            "References that aren't cited",
            "Missing Citations (Author Match)",
        )
    ],
    title="Citation Analysis Tool",
    description="Upload a DOCX file to extract and compare citations and references.",
)

# Start the web app as soon as this file is executed.
iface.launch()