|
import gradio as gr |
|
from io import BytesIO |
|
import docx |
|
import re |
|
|
|
# In-text citation shapes, tried one after another. Every pattern has exactly
# one capture group, so ``findall`` yields plain strings. They are compiled
# with IGNORECASE, which means the ``[A-Z]`` classes also accept lowercase
# letters; filler phrases such as "in 2020" therefore match as well and are
# removed afterwards by ``_UNWANTED_CITATION``.
_CITATION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # Parenthesised single-source citations (comma before year optional).
        r'\(([A-Z][a-zA-Z]+ et al\.?,? \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+ and [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+ & [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)\)',
        # The same shapes without surrounding parentheses.
        r'([A-Z][a-zA-Z]+ et al\.?,? \d{4}[a-zA-Z]?)',
        r'([A-Z][a-zA-Z]+ and [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)',
        r'([A-Z][a-zA-Z]+ & [A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)',
        r'([A-Z][a-zA-Z]+,? \d{4}[a-zA-Z]?)',
        # Parenthesised two-author citations with no comma before the year.
        r'\(([A-Z][a-zA-Z]+ and [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+ & [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',
        # Several sources inside one pair of parentheses.
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?; [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?; [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?; [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+ \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?)\)',
        r'\(([A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?, [A-Z][a-zA-Z]+, \d{4}[a-zA-Z]?)\)',
    )
)

# "<common word or month> <year>" matches that are not citations.
_UNWANTED_CITATION = re.compile(
    r"^(and|between|by|from|in|to|is|for|period|around|January|February|March|April|May|June|July|August|September|October|November|December) \d{4}[a-z]?$",
    re.IGNORECASE,
)

# Whole word only, so that e.g. "preferences" does not count as "references".
_REFERENCES_WORD = re.compile(r"\breferences\b", re.IGNORECASE)

# Heuristic: a heading is short; a body sentence that merely mentions
# "references" is longer than this many words.
_MAX_HEADING_WORDS = 5


def _is_reference_heading(text):
    """Return True when *text* looks like the heading of the reference list."""
    if len(text.split()) > _MAX_HEADING_WORDS:
        return False
    return _REFERENCES_WORD.search(text) is not None


def _find_citations(text):
    """Return the sorted, de-duplicated citation strings found in *text*."""
    found = set()
    for pattern in _CITATION_PATTERNS:
        found.update(pattern.findall(text))
    return sorted(c for c in found if not _UNWANTED_CITATION.match(c))


def _collect_references(paragraph_texts):
    """Return the sorted, unique non-blank paragraphs after the heading."""
    collected = set()
    in_section = False
    for text in paragraph_texts:
        if not in_section:
            # Only look for the heading until it has been found; afterwards
            # every paragraph is an entry, even one whose title happens to
            # contain the word "references".
            in_section = _is_reference_heading(text)
            continue
        if text.strip():
            collected.add(text)
    return sorted(collected)


def extract_citations_and_references(doc_file):
    """Extract in-text citations and the reference list from a DOCX file.

    Args:
        doc_file: Either a filesystem path (``str``) to the document or a
            binary file-like object exposing ``read()``.

    Returns:
        A ``(citations, references, error)`` tuple. On success ``citations``
        is a sorted list of unique citation strings, ``references`` is a
        sorted list of the unique non-blank paragraphs that follow the
        references heading, and ``error`` is ``None``. When the document
        cannot be opened both lists are empty and ``error`` is a message
        starting with ``"Error opening DOCX file:"``.
    """
    if doc_file is None:
        # Nothing was supplied; say so instead of surfacing the opaque
        # AttributeError that ``None.read()`` would produce below.
        return [], [], "Error opening DOCX file: no file was provided"

    try:
        if isinstance(doc_file, str):
            document = docx.Document(doc_file)
        else:
            document = docx.Document(BytesIO(doc_file.read()))
    except Exception as e:
        # Boundary with the UI: any failure to open is reported as text.
        return [], [], f"Error opening DOCX file: {e}"

    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]

    # Citations are searched in the whole document text, one paragraph per
    # line; the patterns contain no newline, so matches never span paragraphs.
    citations = _find_citations("\n".join(paragraph_texts))
    references = _collect_references(paragraph_texts)

    return citations, references, None
|
|
|
|
|
def extract_author_from_citation(citation):
    """Return the leading author name of an in-text citation string.

    The citation is cut at the first separator that ends the first author's
    name: a comma, " et al", " and ", " & ", or a four-digit year (optionally
    followed by a letter suffix such as "2020a"). The year separator handles
    citations written without a comma, e.g. "Smith 2020". Separators are
    matched case-insensitively.

    This is a simplified extraction and might need refinement.

    Args:
        citation: Citation text such as "Smith et al., 2020".

    Returns:
        The text before the first separator with surrounding whitespace
        removed, or the stripped input when no separator is present.
    """
    # maxsplit=1: only the part before the first separator is needed.
    leading_part = re.split(
        r",| et al\.?| and | & |\s+\d{4}[a-zA-Z]?\b",
        citation,
        maxsplit=1,
        flags=re.IGNORECASE,
    )[0]
    return leading_part.strip()
|
|
|
|
|
|
|
def compare_citations_references(citations, references):
    """Compare citations with references by author name.

    A citation and a reference are considered a match when, ignoring case,
    one author string contains the other.

    Args:
        citations: Iterable of in-text citation strings.
        references: Iterable of reference-list entries. Only entries that
            start with a capitalised name (one or two words) followed by a
            comma yield an author; other entries take no part in the
            comparison.

    Returns:
        A ``(missing_citations, extra_references)`` tuple of lists:
        ``missing_citations`` holds the citations whose author matches no
        reference author, and ``extra_references`` holds the references with
        a recognisable author that matches no citation author. Both keep the
        order of their input.
    """
    reference_author_pattern = re.compile(
        r"([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)?),"
    )

    def authors_match(first, second):
        # Substring test in both directions so "Smith" matches "Smith J".
        return first in second or second in first

    # (reference entry, lowercase author) for every entry with an author.
    reference_authors = []
    for reference in references:
        match = reference_author_pattern.match(reference)
        if match:
            reference_authors.append(
                (reference, match.group(1).strip().lower())
            )

    # (citation, lowercase author) for every citation.
    citation_authors = [
        (citation, extract_author_from_citation(citation).lower())
        for citation in citations
    ]

    missing_citations = [
        citation
        for citation, author in citation_authors
        if not any(
            authors_match(author, ref_author)
            for _, ref_author in reference_authors
        )
    ]

    extra_references = [
        reference
        for reference, ref_author in reference_authors
        if not any(
            authors_match(author, ref_author)
            for _, author in citation_authors
        )
    ]

    return missing_citations, extra_references
|
|
|
def analyze_document(doc_file):
    """Analyze a DOCX file and return the texts shown in the Gradio interface.

    Args:
        doc_file: The uploaded document, passed unchanged to
            ``extract_citations_and_references``.

    Returns:
        A 5-tuple of strings, one per output textbox of the interface:
        extracted citations, extracted references, missing citations,
        references that are not cited, and missing citations by author
        match. If the document cannot be opened, the first element is the
        error message and the remaining four are empty strings.
    """
    citations, references, error = extract_citations_and_references(doc_file)

    if error:
        # The interface declares five outputs, so five values must be
        # returned on the error path as well.
        return error, "", "", "", ""

    missing, extra = compare_citations_references(citations, references)
    missing_text = "\n".join(missing)

    return (
        "Extracted Citations:\n" + "\n".join(citations),
        "Extracted References:\n" + "\n".join(references),
        "Missing Citations:\n" + missing_text,
        # Show the uncited references computed by the comparison, not the
        # complete reference list.
        "Extra References:\n" + "\n".join(extra),
        "Missing Citations (Author Match):\n" + missing_text,
    )
|
|
|
|
|
# Web UI: one DOCX upload in, five text panes out (in the order that
# analyze_document returns its values).
iface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(file_types=[".docx"]),
    outputs=[
        gr.Textbox(label=caption)
        for caption in (
            "Citations",
            "References",
            "Missing Citations",
            "References that aren't cited",
            "Missing Citations (Author Match)",
        )
    ],
    title="Citation Analysis Tool",
    description="Upload a DOCX file to extract and compare citations and references.",
)

# Start serving the interface as soon as the module is executed.
iface.launch()