Spaces:

Eemansleepdeprived
/

okulary

Build error

App Files Files Community

okulary / plagiarism_detection /plagapp.py.py

Eemansleepdeprived

Upload 310 files

36eb7b3 verified 5 months ago

raw

history blame contribute delete

7.25 kB

	import os
	import glob
	import PyPDF2
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import zipfile
	import shutil
	import streamlit as st
	from zipfile import ZipFile
	from PyPDF2 import PdfReader
	from difflib import SequenceMatcher

	# Color Scheme
	PAGE_BG_COLOR = "#8CB9BD"
	CONTENT_BG_COLOR = "#ECB159"
	TEXT_COLOR = "#ECB159"

	def calculate_similarity(text1, text2):
	return SequenceMatcher(None, text1, text2).ratio()

	def extract_text_from_pdf(file):
	pdf_reader = PdfReader(file)
	text = ""
	for page in pdf_reader.pages:
	text += page.extract_text()
	return text

	def process_zip(zip_file):
	with ZipFile(zip_file, 'r') as zip_ref:
	texts = []
	for file_name in zip_ref.namelist():
	if file_name.endswith('.pdf'):
	with zip_ref.open(file_name) as file:
	text = extract_text_from_pdf(file)
	texts.append(text)
	return texts

	def read_pdf(file_path):
	"""
	Read text content from a PDF file.

	Args:
	file_path (str): Path to the PDF file.

	Returns:
	str: Text content of the PDF.
	"""
	text = ""
	with open(file_path, "rb") as file:
	reader = PyPDF2.PdfReader(file)
	for page_num in range(len(reader.pages)):
	text += reader.pages[page_num].extract_text()
	return text

	def text_similarity(text1, text2):
	"""
	Compute the cosine similarity between two texts.

	Args:
	text1 (str): The first text.
	text2 (str): The second text.

	Returns:
	float: The cosine similarity between the two texts.
	"""
	# Create a CountVectorizer instance
	vectorizer = CountVectorizer().fit_transform([text1, text2])

	# Calculate cosine similarity
	similarity = cosine_similarity(vectorizer)

	# Since there are only 2 texts, similarity[0, 1] or similarity[1, 0] gives the similarity
	return similarity[0, 1]

	def compare_pdfs(pdf_file1, pdf_file2):
	"""
	Compare two PDF files for similarity.

	Args:
	pdf_file1 (str): Path to the first PDF file.
	pdf_file2 (str): Path to the second PDF file.
	"""
	text1 = read_pdf(pdf_file1)
	text2 = read_pdf(pdf_file2)
	file1 = pdf_file1.split('/')[-1]
	file2 = pdf_file2.split('/')[-1]
	similarity_score = text_similarity(text1, text2)
	if similarity_score > 0.75:
	st.write(f"Similarity between '{file1}' and '{file2}': {similarity_score}")
	if similarity_score > 0.9:
	st.write(f"Complete plagiarism detected between '{file1}' and '{file2}'!")
	else:
	st.write(f"Potential plagiarism detected between '{file1}' and '{file2}'!")

	def main(folder_or_zip_path):
	"""
	Main function to compare PDF files either in a folder or within a zip file.

	Args:
	folder_or_zip_path (str): Path to the folder containing PDF files or to the zip file.
	"""
	if folder_or_zip_path.endswith('.zip'):
	# Unzip the file
	output_folder = './zip_outputs'
	unzipped_folder = unzip_file(folder_or_zip_path, output_folder)
	folder_path = os.path.join(unzipped_folder, 'pdfs')
	else:
	folder_path = folder_or_zip_path

	# Get all PDF files in the folder
	pdf_files = glob.glob(os.path.join(folder_path, "*.pdf"))
	num_files = len(pdf_files)

	st.write(f"Found {num_files} PDF files in the folder.")

	if num_files == 0:
	st.write("No PDF files found in the specified folder.")
	return

	# Compare similarity for all pairs of PDF files
	for i in range(num_files):
	for j in range(i+1, num_files):
	compare_pdfs(pdf_files[i], pdf_files[j])

	def unzip_file(zip_file, output_folder):
	"""
	Unzip a zip file to the specified output folder.

	Args:
	zip_file (str): Path to the zip file.
	output_folder (str): Path to the output folder where the contents will be extracted.

	Returns:
	str: Path to the folder containing the extracted files.
	"""
	# Create the output folder if it doesn't exist
	os.makedirs(output_folder, exist_ok=True)

	# Empty the output folder if it already exists
	if os.path.exists(output_folder):
	shutil.rmtree(output_folder)

	# Extract the zip file
	with zipfile.ZipFile(zip_file, 'r') as zip_ref:
	zip_ref.extractall(output_folder)

	return output_folder

	def main():
	st.title("Plagiarism Detector")

	# Custom CSS to apply background color and color scheme
	st.markdown(f"""
	<style>
	body {{
	background-color: {PAGE_BG_COLOR};
	color: {TEXT_COLOR};
	}}
	.stApp {{
	background-color: {PAGE_BG_COLOR};
	}}
	.stContent {{
	background-color: {CONTENT_BG_COLOR};
	}}
	.stBlockContainer {{
	background-color: {CONTENT_BG_COLOR};
	padding: 10px;
	border-radius: 10px;
	}}
	.stButton:focus {{
	background-color: {CONTENT_BG_COLOR};
	}}
	.stButton:hover {{
	background-color: {CONTENT_BG_COLOR};
	}}
	</style>
	""", unsafe_allow_html=True)

	st.markdown("---")

	st.header("Upload Documents or Zip File")

	col1, col2, col3 = st.columns([2, 1, 2])

	with col1:
	st.subheader("Upload Individual PDF Documents")
	file1 = st.file_uploader("Upload first document", type=['pdf'], key='file1')
	file2 = st.file_uploader("Upload second document", type=['pdf'], key='file2')

	with col2:
	st.markdown("<h2 style='text-align: center; color: #0080ff;'>OR</h2>", unsafe_allow_html=True)

	with col3:
	st.subheader("Upload Zip File with PDF Documents")
	zip_file = st.file_uploader("Upload zip file with documents", type=['zip'])

	st.markdown("---")

	plagiarism_button = st.button("Calculate Plagiarism", key='calculate_button', help="Click to check for plagiarism")

	if plagiarism_button:
	if (file1 and file2) or zip_file:
	if file1 and file2:
	text1 = extract_text_from_pdf(file1)
	text2 = extract_text_from_pdf(file2)
	similarity_score = calculate_similarity(text1, text2)
	st.success("Plagiarism Percentage: {}%".format(round(similarity_score * 100, 2)))
	elif zip_file:
	texts = process_zip(zip_file)
	if texts:
	similarity_score = calculate_similarity(texts[0], texts[1])
	st.success("Plagiarism Percentage: {}%".format(round(similarity_score * 100, 2)))
	else:
	st.warning("No .pdf files found in the uploaded zip file or no files uploaded.")
	else:
	st.warning("Please upload at least two PDF documents or one zip file.")

	if __name__ == "__main__":
	main()