"""Gradio app that cross-checks in-text citations against a DOCX reference list.

The document is scanned for author-date citations (e.g. ``(Smith et al., 2020)``)
and for the entries that follow a "References" heading.  Citations whose first
author does not appear in the reference list, and references whose first author
is never cited, are reported in the UI.
"""

import re
from io import BytesIO

import docx
import gradio as gr

# Building blocks shared by every citation pattern.
_NAME = r"[A-Z][a-zA-Z]+"
_YEAR = r"\d{4}[a-zA-Z]?"

# Each pattern has exactly one capturing group, so ``findall`` yields strings.
# NOTE: the patterns are compiled with IGNORECASE, which means ``[A-Z]`` also
# matches lower-case letters; ordinary phrases such as "in 2020" are therefore
# matched as well and have to be removed afterwards (see _NON_CITATION_RE).
_CITATION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        rf"\(({_NAME} et al\.?,? {_YEAR})\)",  # (Author et al., date)
        rf"\(({_NAME} and {_NAME},? {_YEAR})\)",  # (Author and Author, date)
        rf"\(({_NAME} & {_NAME},? {_YEAR})\)",  # (Author & Author, date)
        rf"\(({_NAME},? {_YEAR})\)",  # (Author, date)
        rf"({_NAME} et al\.?,? {_YEAR})",  # Author et al., date
        rf"({_NAME} and {_NAME},? {_YEAR})",  # Author and Author, date
        rf"({_NAME} & {_NAME},? {_YEAR})",  # Author & Author, date
        rf"({_NAME},? {_YEAR})",  # Author, date (general)
        rf"\(({_NAME} {_YEAR}; {_NAME} {_YEAR})\)",  # (A date; B date)
        rf"\(({_NAME} {_YEAR}; {_NAME} {_YEAR}; {_NAME} {_YEAR})\)",
        rf"\(({_NAME} {_YEAR}, {_NAME} {_YEAR})\)",  # (A date, B date)
        rf"\(({_NAME} {_YEAR}, {_NAME} {_YEAR}, {_NAME} {_YEAR})\)",
        rf"\(({_NAME}, {_YEAR}, {_NAME}, {_YEAR})\)",  # (A, date, B, date)
        rf"\(({_NAME}, {_YEAR}, {_NAME}, {_YEAR}, {_NAME}, {_YEAR})\)",
    )
)

# Words that commonly precede a year in running text without being an author.
_NON_AUTHOR_WORDS = (
    "and", "between", "by", "from", "in", "to", "is", "for", "period", "around",
    "after", "before", "during", "of", "since", "the", "until", "year",
    "January", "February", "March", "April", "May", "June", "July", "August",
    "September", "October", "November", "December",
)
_NON_CITATION_RE = re.compile(
    r"^(?:" + "|".join(_NON_AUTHOR_WORDS) + r"),? \d{4}[a-z]?$",
    re.IGNORECASE,
)

# Reference-list heading detection.
_REFERENCE_HEADING_RE = re.compile(
    r"\b(?:references|bibliography|works cited)\b", re.IGNORECASE
)
_MAX_HEADING_WORDS = 5

# Author extraction helpers.
_AUTHOR_SPLIT_RE = re.compile(r",|;| et al\.?| and | & ", re.IGNORECASE)
_TRAILING_YEAR_RE = re.compile(r"\s+\d{4}[a-zA-Z]?$")
_REFERENCE_AUTHOR_RE = re.compile(r"([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)?),")


def _is_reference_heading(text):
    """Return True if *text* looks like the heading of a reference list.

    A heading is a short paragraph containing "references", "bibliography" or
    "works cited" as whole words.  Paragraphs ending in a digit are rejected
    so that table-of-contents entries ("References    25") do not count.
    """
    words = text.split()
    if not words or len(words) > _MAX_HEADING_WORDS:
        return False
    if words[-1][-1].isdigit():
        return False
    return _REFERENCE_HEADING_RE.search(text) is not None


def _extract_citations(text):
    """Return the sorted, de-duplicated author-date citations found in *text*."""
    found = set()
    for pattern in _CITATION_PATTERNS:
        found.update(pattern.findall(text))
    return sorted(
        citation
        for citation in found
        if citation and not _NON_CITATION_RE.match(citation)
    )


def _extract_references(paragraph_texts):
    """Return the sorted, de-duplicated paragraphs after the reference heading."""
    references = set()
    in_reference_section = False
    for text in paragraph_texts:
        if _is_reference_heading(text):
            # The heading itself (and any repeated heading) is not an entry.
            in_reference_section = True
            continue
        entry = text.strip()
        if in_reference_section and entry:
            references.add(entry)
    return sorted(references)


def extract_citations_and_references(doc_file):
    """Extract citations and references from a DOCX file.

    Args:
        doc_file: Either a path string or an object with a ``read()`` method
            returning the DOCX bytes.

    Returns:
        A tuple ``(citations, references, error)``.  On success ``error`` is
        ``None`` and both lists are sorted and free of duplicates; on failure
        both lists are empty and ``error`` is a human-readable message.
    """
    if doc_file is None:
        return [], [], "Error opening DOCX file: no file was provided"

    try:
        if isinstance(doc_file, str):  # Path to file
            document = docx.Document(doc_file)
        else:  # File-like object
            document = docx.Document(BytesIO(doc_file.read()))
    except Exception as e:  # UI boundary: report any failure as a message.
        return [], [], f"Error opening DOCX file: {e}"

    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    citations = _extract_citations("\n".join(paragraph_texts))
    references = _extract_references(paragraph_texts)
    return citations, references, None


def extract_author_from_citation(citation):
    """Return the first author's name from a citation string.

    The citation is cut at the first comma, semicolon, "et al.", "and" or "&",
    and a trailing year (as in "Smith 2020") is removed.  This is a simplified
    extraction and might need refinement.
    """
    first_part = _AUTHOR_SPLIT_RE.split(citation, maxsplit=1)[0].strip()
    return _TRAILING_YEAR_RE.sub("", first_part).strip()


def _contains_name(needle, haystack):
    """Return True if *needle* occurs in *haystack* as whole word(s)."""
    pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
    return re.search(pattern, haystack) is not None


def _authors_match(first, second):
    """Return True if either name contains the other, ignoring case."""
    first, second = first.lower(), second.lower()
    if not first or not second:
        return False
    return _contains_name(first, second) or _contains_name(second, first)


def compare_citations_references(citations, references):
    """Compare citations and references using first-author name matching.

    Args:
        citations: Citation strings such as ``"Smith et al., 2020"``.
        references: Reference-list entries such as ``"Smith, J. (2020). ..."``.

    Returns:
        A tuple ``(missing_citations, extra_references)``:
        ``missing_citations`` are citations whose author matches no reference
        author; ``extra_references`` are references whose author matches no
        citation author.  References from which no leading "Name," author can
        be parsed are never reported as extra, because they cannot be judged.
    """
    # Pair each reference with its first author (None when unparseable).
    reference_authors = []
    for reference in references:
        match = _REFERENCE_AUTHOR_RE.match(reference.lstrip())
        reference_authors.append(match.group(1).strip() if match else None)
    known_authors = {author for author in reference_authors if author}

    cited_authors = {extract_author_from_citation(c) for c in citations}

    missing_citations = [
        citation
        for citation in citations
        if not any(
            _authors_match(extract_author_from_citation(citation), ref_author)
            for ref_author in known_authors
        )
    ]
    extra_references = [
        reference
        for reference, ref_author in zip(references, reference_authors)
        if ref_author
        and not any(_authors_match(cited, ref_author) for cited in cited_authors)
    ]
    return missing_citations, extra_references


def analyze_document(doc_file):
    """Analyze the document and return the five text outputs for the UI.

    On error the message is placed in the first output and the remaining four
    are empty, so the number of returned values always matches the interface.
    """
    citations, references, error = extract_citations_and_references(doc_file)
    if error:
        return error, "", "", "", ""

    missing, extra = compare_citations_references(citations, references)

    return (
        "Extracted Citations:\n" + "\n".join(citations),
        "Extracted References:\n" + "\n".join(references),
        "Missing Citations:\n" + "\n".join(missing),
        "Extra References:\n" + "\n".join(extra),
        "Missing Citations (Author Match):\n" + "\n".join(missing),
    )


# Gradio Interface Definition
iface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(file_types=[".docx"]),
    outputs=[
        gr.Textbox(label="Citations"),
        gr.Textbox(label="References"),
        gr.Textbox(label="Missing Citations"),
        gr.Textbox(label="References that aren't cited"),
        gr.Textbox(label="Missing Citations (Author Match)"),
    ],
    title="Citation Analysis Tool",
    description="Upload a DOCX file to extract and compare citations and references.",
)

if __name__ == "__main__":
    iface.launch()