import os import glob import PyPDF2 from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity import zipfile import shutil def read_pdf(file_path): """ Read text content from a PDF file. Args: file_path (str): Path to the PDF file. Returns: str: Text content of the PDF. """ text = "" with open(file_path, "rb") as file: reader = PyPDF2.PdfReader(file) for page_num in range(len(reader.pages)): text += reader.pages[page_num].extract_text() return text def text_similarity(text1, text2): """ Compute the cosine similarity between two texts. Args: text1 (str): The first text. text2 (str): The second text. Returns: float: The cosine similarity between the two texts. """ # Create a CountVectorizer instance vectorizer = CountVectorizer().fit_transform([text1, text2]) # Calculate cosine similarity similarity = cosine_similarity(vectorizer) # Since there are only 2 texts, similarity[0, 1] or similarity[1, 0] gives the similarity return similarity[0, 1] def compare_pdfs(pdf_file1, pdf_file2): """ Compare two PDF files for similarity. Args: pdf_file1 (str): Path to the first PDF file. pdf_file2 (str): Path to the second PDF file. """ text1 = read_pdf(pdf_file1) text2 = read_pdf(pdf_file2) file1 = pdf_file1.split('/')[-1] file2 = pdf_file2.split('/')[-1] similarity_score = text_similarity(text1, text2) if similarity_score > 0.75: print(f"Similarity between '{file1}' and '{file2}': {similarity_score}") if similarity_score > 0.9: print(f"Complete plagiarism detected between '{file1}' and '{file2}'!") else: print(f"Potential plagiarism detected between '{file1}' and '{file2}'!") def main(folder_or_zip_path): """ Main function to compare PDF files either in a folder or within a zip file. Args: folder_or_zip_path (str): Path to the folder containing PDF files or to the zip file. """ if folder_or_zip_path.endswith('.zip'): # Unzip the file output_folder = '/Users/rishit/Documents/innovate_you/plagiarism_detection/plagiarism_rishit/zip_outputs' unzipped_folder = unzip_file(folder_or_zip_path, output_folder) folder_path = os.path.join(unzipped_folder, 'pdfs') else: folder_path = folder_or_zip_path # Get all PDF files in the folder pdf_files = glob.glob(os.path.join(folder_path, "*.pdf")) num_files = len(pdf_files) print(f"Found {num_files} PDF files in the folder.") if num_files == 0: print("No PDF files found in the specified folder.") return # Compare similarity for all pairs of PDF files for i in range(num_files): for j in range(i+1, num_files): compare_pdfs(pdf_files[i], pdf_files[j]) def unzip_file(zip_file, output_folder): """ Unzip a zip file to the specified output folder. Args: zip_file (str): Path to the zip file. output_folder (str): Path to the output folder where the contents will be extracted. Returns: str: Path to the folder containing the extracted files. """ # Create the output folder if it doesn't exist os.makedirs(output_folder, exist_ok=True) # Empty the output folder if it already exists if os.path.exists(output_folder): shutil.rmtree(output_folder) # Extract the zip file with zipfile.ZipFile(zip_file, 'r') as zip_ref: zip_ref.extractall(output_folder) return output_folder # Example usage: input_path = input("Enter the path to the folder containing PDF files or to the zip file: ").strip() main(input_path)