Spaces:

delta-praticle
/

KDSH_Task_1

Sleeping

App Files Files Community

KDSH_Task_1 / app.py

delta-praticle

Update app.py

34a0db0 verified 11 months ago

raw

history blame contribute delete

21.7 kB

	# # import gradio as gr
	# # import pdfplumber
	# # import networkx as nx
	# # import pandas as pd
	# # import matplotlib.pyplot as plt
	# # import plotly.graph_objects as go
	# # from transformers import AutoTokenizer
	# # from langchain_core.documents import Document
	# # from langchain_experimental.graph_transformers import LLMGraphTransformer
	# # from langchain_groq import ChatGroq
	# # import os

	# # # Initialize components
	# # scibert_model = "allenai/scibert_scivocab_uncased"
	# # tokenizer = AutoTokenizer.from_pretrained(scibert_model)
	# # groq_api_key = os.environ["GROQ_API_KEY"]  # secret redacted; supply it via the environment
	# # llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
	# # llm_transformer = LLMGraphTransformer(llm=llm)


	# # def extract_text_from_pdf(pdf_path):
	# # with pdfplumber.open(pdf_path) as pdf:
	# # extracted_text = "".join([page.extract_text() for page in pdf.pages])
	# # return extracted_text


	# # def scibert_chunking(text, chunk_size=256, max_chunks=6):
	# # tokens = tokenizer.tokenize(text)
	# # chunks = [
	# # tokenizer.convert_tokens_to_string(tokens[i : i + chunk_size])
	# # for i in range(0, min(len(tokens), chunk_size * max_chunks), chunk_size)
	# # ]
	# # return chunks


	# # def process_text_with_llm(text):
	# # chunks = scibert_chunking(text)
	# # documents = [Document(page_content=chunk) for chunk in chunks]
	# # graph_documents = [
	# # llm_transformer.convert_to_graph_documents([doc])[0] for doc in documents
	# # ]
	# # return graph_documents


	# # def build_graph(graph_documents):
	# # graph = nx.DiGraph()
	# # for graph_doc in graph_documents:
	# # for node in graph_doc.nodes:
	# # label = node.properties.get("name", node.id)
	# # graph.add_node(node.id, label=label)
	# # for rel in graph_doc.relationships:
	# # graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
	# # return graph


	# # def calculate_average_top_5_pagerank(graph):
	# # pagerank = nx.pagerank(graph)
	# # top_5 = sorted(pagerank.values(), reverse=True)[:5]
	# # return sum(top_5) / len(top_5) if top_5 else 0, pagerank


	# # def draw_static_graph(graph, output_path="graph.png"):
	# # plt.figure(figsize=(10, 8))
	# # pos = nx.spring_layout(graph, seed=42)
	# # nx.draw(
	# # graph,
	# # pos,
	# # with_labels=True,
	# # node_size=500,
	# # node_color="lightblue",
	# # font_size=8,
	# # font_weight="bold",
	# # edge_color="gray",
	# # )
	# # plt.title("Static Knowledge Graph")
	# # plt.savefig(output_path)
	# # plt.close()


	# # def generate_interactive_plotly_graph(graph, pagerank):
	# # pos = nx.spring_layout(graph, seed=42) # Generate positions for nodes

	# # edge_x = []
	# # edge_y = []
	# # for edge in graph.edges():
	# # x0, y0 = pos[edge[0]]
	# # x1, y1 = pos[edge[1]]
	# # edge_x.extend([x0, x1, None])
	# # edge_y.extend([y0, y1, None])

	# # edge_trace = go.Scatter(
	# # x=edge_x,
	# # y=edge_y,
	# # line=dict(width=0.5, color="#888"),
	# # hoverinfo="none",
	# # mode="lines",
	# # )

	# # node_x = []
	# # node_y = []
	# # node_text = []
	# # for node in graph.nodes():
	# # x, y = pos[node]
	# # node_x.append(x)
	# # node_y.append(y)

	# # label = graph.nodes[node].get("label", str(node)) # Default to node ID if label is missing
	# # pagerank_score = pagerank.get(node, 0)
	# # node_text.append(f"{label}<br>{pagerank_score:.4f}")

	# # node_trace = go.Scatter(
	# # x=node_x,
	# # y=node_y,
	# # mode="markers+text",
	# # text=node_text,
	# # hoverinfo="text",
	# # marker=dict(
	# # showscale=True,
	# # colorscale="YlGnBu",
	# # size=10,
	# # color=list(pagerank.values()),
	# # colorbar=dict(
	# # thickness=15,
	# # title="PageRank",
	# # xanchor="left",
	# # titleside="right"
	# # ),
	# # ),
	# # )

	# # fig = go.Figure(data=[edge_trace, node_trace])
	# # fig.update_layout(
	# # showlegend=False,
	# # hovermode="closest",
	# # margin=dict(b=0, l=0, r=0, t=0),
	# # xaxis=dict(showgrid=False, zeroline=False),
	# # yaxis=dict(showgrid=False, zeroline=False),
	# # )

	# # return fig


	# # def classify_and_visualize_pdf(pdf_path):
	# # try:
	# # # Step 1: Extract text from the PDF
	# # text = extract_text_from_pdf(pdf_path)

	# # # Step 2: Process text to generate a knowledge graph
	# # graph_documents = process_text_with_llm(text)
	# # graph = build_graph(graph_documents)

	# # # Step 3: Calculate PageRank and classify
	# # avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
	# # classification = "Publishable" if avg_top_5_pagerank >= 0.0526 else "Non-Publishable"

	# # # Step 4: Draw the static graph and save as image
	# # static_graph_path = "knowledge_graph.png"
	# # draw_static_graph(graph, static_graph_path)

	# # # Step 5: Generate the interactive Plotly graph
	# # interactive_fig = generate_interactive_plotly_graph(graph, pagerank)

	# # # Step 6: Prepare formatted result
	# # result_html = f"""
	# # <h3>Classification Result</h3>
	# # <p><strong>Classification:</strong> {classification}</p>
	# # <p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
	# # """

	# # return result_html, static_graph_path, interactive_fig
	# # except Exception as e:
	# # return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None


	# # # Gradio app instance
	# # with gr.Blocks() as demo:
	# # gr.Markdown(
	# # """
	# # # 📄 Research Paper Classifier with Knowledge Graphs
	# # Upload a PDF research paper, and the app will:
	# # 1. Generate a Static Knowledge Graph
	# # 2. Generate an Interactive Knowledge Graph (hover over nodes to see details)
	# # 3. Calculate PageRank and classify the paper as Publishable or Non-Publishable
	# # """
	# # )

	# # with gr.Row():
	# # pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
	# # submit_btn = gr.Button("Classify Paper")

	# # with gr.Row():
	# # result_output = gr.HTML(label="Classification Result")

	# # with gr.Row():
	# # static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
	# # interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")

	# # submit_btn.click(
	# # fn=classify_and_visualize_pdf,
	# # inputs=pdf_input,
	# # outputs=[result_output, static_graph_output, interactive_graph_output],
	# # )

	# # demo.launch()
	# import gradio as gr
	# import pdfplumber
	# import networkx as nx
	# import pandas as pd
	# import matplotlib.pyplot as plt
	# import plotly.graph_objects as go
	# from transformers import AutoTokenizer
	# from langchain_core.documents import Document
	# from langchain_experimental.graph_transformers import LLMGraphTransformer
	# from langchain_groq import ChatGroq
	# import os

	# # Initialize components
	# scibert_model = "allenai/scibert_scivocab_uncased"
	# tokenizer = AutoTokenizer.from_pretrained(scibert_model)
	# groq_api_key = os.environ["GROQ_API_KEY"]  # secret redacted; supply it via the environment
	# llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
	# llm_transformer = LLMGraphTransformer(llm=llm)

	# def extract_text_from_pdf(pdf_path):
	# with pdfplumber.open(pdf_path) as pdf:
	# extracted_text = "".join([page.extract_text() for page in pdf.pages])
	# return extracted_text

	# def scibert_chunking(text, chunk_size=256, max_chunks=6):
	# tokens = tokenizer.tokenize(text)
	# chunks = [
	# tokenizer.convert_tokens_to_string(tokens[i : i + chunk_size])
	# for i in range(0, min(len(tokens), chunk_size * max_chunks), chunk_size)
	# ]
	# return chunks

	# def process_text_with_llm(text):
	# chunks = scibert_chunking(text)
	# documents = [Document(page_content=chunk) for chunk in chunks]
	# graph_documents = [
	# llm_transformer.convert_to_graph_documents([doc])[0] for doc in documents
	# ]
	# return graph_documents

	# def build_graph(graph_documents):
	# graph = nx.DiGraph()
	# for graph_doc in graph_documents:
	# for node in graph_doc.nodes:
	# label = node.properties.get("name", node.id)
	# graph.add_node(node.id, label=label)
	# for rel in graph_doc.relationships:
	# graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
	# return graph

	# def calculate_average_top_5_pagerank(graph):
	# pagerank = nx.pagerank(graph)
	# top_5 = sorted(pagerank.values(), reverse=True)[:5]
	# return sum(top_5) / len(top_5) if top_5 else 0, pagerank

	# def draw_static_graph(graph, output_path="graph.png"):
	# plt.figure(figsize=(10, 8))
	# pos = nx.spring_layout(graph, seed=42)
	# nx.draw(
	# graph,
	# pos,
	# with_labels=True,
	# node_size=500,
	# node_color="lightblue",
	# font_size=8,
	# font_weight="bold",
	# edge_color="gray",
	# )
	# plt.title("Static Knowledge Graph")
	# plt.savefig(output_path)
	# plt.close()

	# def generate_interactive_plotly_graph(graph, pagerank):
	# pos = nx.spring_layout(graph, seed=42) # Generate positions for nodes

	# edge_x = []
	# edge_y = []
	# for edge in graph.edges():
	# x0, y0 = pos[edge[0]]
	# x1, y1 = pos[edge[1]]
	# edge_x.extend([x0, x1, None])
	# edge_y.extend([y0, y1, None])

	# edge_trace = go.Scatter(
	# x=edge_x,
	# y=edge_y,
	# line=dict(width=0.5, color="#888"),
	# hoverinfo="none",
	# mode="lines",
	# )

	# node_x = []
	# node_y = []
	# node_text = []
	# for node in graph.nodes():
	# x, y = pos[node]
	# node_x.append(x)
	# node_y.append(y)

	# label = graph.nodes[node].get("label", str(node)) # Default to node ID if label is missing
	# pagerank_score = pagerank.get(node, 0)
	# node_text.append(f"{label}<br>{pagerank_score:.4f}")

	# node_trace = go.Scatter(
	# x=node_x,
	# y=node_y,
	# mode="markers+text",
	# text=node_text,
	# hoverinfo="text",
	# marker=dict(
	# showscale=True,
	# colorscale="YlGnBu",
	# size=10,
	# color=list(pagerank.values()),
	# colorbar=dict(
	# thickness=15,
	# title="PageRank",
	# xanchor="left",
	# titleside="right"
	# ),
	# ),
	# )

	# fig = go.Figure(data=[edge_trace, node_trace])
	# fig.update_layout(
	# showlegend=False,
	# hovermode="closest",
	# margin=dict(b=0, l=0, r=0, t=0),
	# xaxis=dict(showgrid=False, zeroline=False),
	# yaxis=dict(showgrid=False, zeroline=False),
	# )

	# return fig

	# def classify_and_visualize_pdf(pdf_path):
	# try:
	# # Step 1: Extract text from the PDF
	# text = extract_text_from_pdf(pdf_path)

	# # Step 2: Process text to generate a knowledge graph
	# graph_documents = process_text_with_llm(text)
	# graph = build_graph(graph_documents)

	# # Step 3: Calculate PageRank and classify
	# avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
	# classification = "Publishable" if avg_top_5_pagerank >= 0.0526 else "Non-Publishable"

	# # Step 4: Draw the static graph and save as image
	# static_graph_path = "knowledge_graph.png"
	# draw_static_graph(graph, static_graph_path)

	# # Step 5: Generate the interactive Plotly graph
	# interactive_fig = generate_interactive_plotly_graph(graph, pagerank)

	# # Step 6: Prepare formatted result
	# result_html = f"""
	# <h3>Classification Result</h3>
	# <p><strong>Classification:</strong> {classification}</p>
	# <p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
	# """

	# return result_html, static_graph_path, interactive_fig
	# except Exception as e:
	# return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None

	# # Gradio app instance
	# with gr.Blocks(css="""
	# body {background: linear-gradient(to right, #6A11CB, #2575FC); color: white;}
	# .gr-button {background: #34A853; color: white; border-radius: 8px;}
	# .gr-button:hover {background: #2F8A43;}
	# .gr-markdown {font-family: 'Roboto', sans-serif; text-align: center;}
	# .gr-file-upload {border: 2px dashed #fff;}
	# .gr-row {padding: 10px; justify-content: center; align-items: center;}
	# """) as demo:
	# gr.Markdown(
	# """
	# # 📄 Research Paper Classifier with Knowledge Graphs
	# Upload a PDF research paper, and the app will:
	# 1. Generate a Static Knowledge Graph
	# 2. Generate an Interactive Knowledge Graph (hover over nodes to see details)
	# 3. Calculate PageRank and classify the paper as Publishable or Non-Publishable
	# """
	# )

	# with gr.Row():
	# pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
	# submit_btn = gr.Button("Classify Paper", elem_id="submit")

	# with gr.Row():
	# result_output = gr.HTML(label="Classification Result")

	# with gr.Row():
	# static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
	# interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")

	# submit_btn.click(
	# fn=classify_and_visualize_pdf,
	# inputs=pdf_input,
	# outputs=[result_output, static_graph_output, interactive_graph_output],
	# )

	# demo.launch()
	import gradio as gr
	import pdfplumber
	import networkx as nx
	import pandas as pd
	import matplotlib.pyplot as plt
	import plotly.graph_objects as go
	from transformers import AutoTokenizer
	from langchain_core.documents import Document
	from langchain_experimental.graph_transformers import LLMGraphTransformer
	from langchain_groq import ChatGroq
	import os

	# Initialize components
	# The Groq API key is a secret and must never be committed to source control.
	# It is read from the GROQ_API_KEY environment variable instead; any key that
	# was previously hard-coded in this file should be revoked and rotated.
	groq_api_key = os.environ.get("GROQ_API_KEY")
	if not groq_api_key:
	    raise RuntimeError(
	        "GROQ_API_KEY environment variable is not set; "
	        "configure it before starting the app."
	    )

	scibert_model = "allenai/scibert_scivocab_uncased"
	tokenizer = AutoTokenizer.from_pretrained(scibert_model)
	llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
	llm_transformer = LLMGraphTransformer(llm=llm)

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF concatenated into one string.

	    Args:
	        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

	    Returns:
	        The extracted page texts joined in page order with no separator.
	    """
	    with pdfplumber.open(pdf_path) as pdf:
	        # extract_text() may return None for a page without extractable text
	        # (older pdfplumber releases do this); coerce it to "" so the join
	        # cannot raise TypeError.
	        return "".join(page.extract_text() or "" for page in pdf.pages)

	def scibert_chunking(text, chunk_size=256, max_chunks=6):
	    """Split ``text`` into string chunks of ``chunk_size`` tokens each.

	    The text is tokenized with the module-level ``tokenizer``; every window of
	    tokens is converted back into a string. Only the first
	    ``chunk_size * max_chunks`` tokens are considered, the rest are dropped.

	    Returns:
	        A list of chunk strings (the last one may hold fewer tokens).
	    """
	    all_tokens = tokenizer.tokenize(text)
	    # Upper bound on the token positions that are allowed to start a chunk.
	    limit = min(len(all_tokens), chunk_size * max_chunks)

	    pieces = []
	    for start in range(0, limit, chunk_size):
	        window = all_tokens[start : start + chunk_size]
	        pieces.append(tokenizer.convert_tokens_to_string(window))
	    return pieces

	def process_text_with_llm(text):
	    """Convert ``text`` into a list of graph documents, one per chunk.

	    The text is chunked with ``scibert_chunking``, each chunk is wrapped in a
	    ``Document`` and sent on its own to ``llm_transformer``; the first graph
	    document of each response is kept.
	    """
	    documents = []
	    for chunk in scibert_chunking(text):
	        documents.append(Document(page_content=chunk))

	    graph_documents = []
	    for document in documents:
	        # One transformer call per chunk; each call receives a single document.
	        converted = llm_transformer.convert_to_graph_documents([document])
	        graph_documents.append(converted[0])
	    return graph_documents

	def build_graph(graph_documents):
	    """Merge all graph documents into a single ``networkx.DiGraph``.

	    Every node is added under its ``id`` with a ``label`` attribute taken from
	    its ``"name"`` property (falling back to the id). Every relationship
	    becomes a directed edge from source id to target id carrying its ``type``.
	    """
	    knowledge_graph = nx.DiGraph()
	    for document in graph_documents:
	        for entity in document.nodes:
	            display_name = entity.properties.get("name", entity.id)
	            knowledge_graph.add_node(entity.id, label=display_name)
	        for relation in document.relationships:
	            source_id = relation.source.id
	            target_id = relation.target.id
	            knowledge_graph.add_edge(source_id, target_id, type=relation.type)
	    return knowledge_graph

	def calculate_average_top_5_pagerank(graph):
	    """Compute PageRank and the mean of its five largest scores.

	    Returns:
	        A ``(average, pagerank)`` tuple where ``pagerank`` is the mapping
	        returned by ``nx.pagerank`` and ``average`` is the mean of up to five
	        of its highest values, or ``0`` when there are no scores at all.
	    """
	    scores = nx.pagerank(graph)

	    ranked = list(scores.values())
	    ranked.sort(reverse=True)
	    best = ranked[:5]

	    if not best:
	        return 0, scores
	    return sum(best) / len(best), scores

	def draw_static_graph(graph, output_path="graph.png"):
	    """Draw ``graph`` with matplotlib and save the picture to ``output_path``.

	    Args:
	        graph: The networkx graph to render.
	        output_path: Destination file handed to ``savefig``.

	    The figure is always closed, even when layout, drawing or saving fails,
	    so a failed request does not leave an open pyplot figure behind.
	    """
	    figure = plt.figure(figsize=(10, 8))
	    try:
	        # Fixed seed keeps the layout reproducible between runs.
	        pos = nx.spring_layout(graph, seed=42)
	        nx.draw(
	            graph,
	            pos,
	            with_labels=True,
	            node_size=500,
	            node_color="lightblue",
	            font_size=8,
	            font_weight="bold",
	            edge_color="gray",
	        )
	        plt.title("Static Knowledge Graph")
	        figure.savefig(output_path)
	    finally:
	        # Close this specific figure rather than whichever one is "current".
	        plt.close(figure)

	def generate_interactive_plotly_graph(graph, pagerank):
	    """Build an interactive Plotly figure of ``graph`` coloured by PageRank.

	    Args:
	        graph: The networkx graph to display; a node's ``label`` attribute is
	            used as its caption when present, otherwise ``str(node)``.
	        pagerank: Mapping of node -> PageRank score. Nodes missing from the
	            mapping are shown with a score of 0.

	    Returns:
	        A ``plotly.graph_objects.Figure`` with one trace for the edges and one
	        for the nodes.
	    """
	    pos = nx.spring_layout(graph, seed=42)  # Generate positions for nodes

	    # Each edge contributes its two endpoints followed by None, which
	    # separates one edge's line segment from the next in a single trace.
	    edge_x = []
	    edge_y = []
	    for source, target in graph.edges():
	        x0, y0 = pos[source]
	        x1, y1 = pos[target]
	        edge_x.extend([x0, x1, None])
	        edge_y.extend([y0, y1, None])

	    edge_trace = go.Scatter(
	        x=edge_x,
	        y=edge_y,
	        line=dict(width=0.5, color="#888"),
	        hoverinfo="none",
	        mode="lines",
	    )

	    node_x = []
	    node_y = []
	    node_text = []
	    node_color = []
	    for node, attrs in graph.nodes(data=True):
	        x, y = pos[node]
	        node_x.append(x)
	        node_y.append(y)

	        label = attrs.get("label", str(node))  # Default to node ID if label is missing
	        pagerank_score = pagerank.get(node, 0)
	        node_text.append(f"{label}<br>{pagerank_score:.4f}")
	        # Collect the colour per node so it always lines up with node_x/node_y,
	        # even if ``pagerank`` is ordered differently or lacks some nodes.
	        node_color.append(pagerank_score)

	    node_trace = go.Scatter(
	        x=node_x,
	        y=node_y,
	        mode="markers+text",
	        text=node_text,
	        hoverinfo="text",
	        marker=dict(
	            showscale=True,
	            colorscale="YlGnBu",
	            size=10,
	            color=node_color,
	            colorbar=dict(
	                thickness=15,
	                # ``titleside`` is a deprecated colorbar property that newer
	                # Plotly releases reject; the nested title form is the
	                # supported way to set the title text and its side.
	                title=dict(text="PageRank", side="right"),
	                xanchor="left",
	            ),
	        ),
	    )

	    fig = go.Figure(data=[edge_trace, node_trace])
	    fig.update_layout(
	        showlegend=False,
	        hovermode="closest",
	        margin=dict(b=0, l=0, r=0, t=0),
	        xaxis=dict(showgrid=False, zeroline=False),
	        yaxis=dict(showgrid=False, zeroline=False),
	    )

	    return fig

	def classify_and_visualize_pdf(pdf_path):
	    """Classify an uploaded PDF and produce its knowledge-graph visualisations.

	    Args:
	        pdf_path: File path of the uploaded PDF, or a falsy value when nothing
	            was uploaded.

	    Returns:
	        A ``(result_html, static_graph_path, interactive_fig)`` tuple. On
	        failure (or when no file was supplied) ``result_html`` holds an error
	        message and the other two items are ``None``.
	    """
	    import html
	    import logging

	    # Nothing uploaded: answer with a clear message instead of letting the
	    # PDF reader fail on a missing path.
	    if not pdf_path:
	        return "<p style='color:red;'><strong>Error:</strong> Please upload a PDF file.</p>", None, None

	    try:
	        # Step 1: Extract text from the PDF
	        text = extract_text_from_pdf(pdf_path)

	        # Step 2: Process text to generate a knowledge graph
	        graph_documents = process_text_with_llm(text)
	        graph = build_graph(graph_documents)

	        # Step 3: Calculate PageRank and classify
	        avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
	        # NOTE(review): 0.0526 is an unexplained decision threshold - confirm its origin.
	        classification = "Publishable" if avg_top_5_pagerank >= 0.0526 else "Non-Publishable"

	        # Step 4: Draw the static graph and save as image
	        static_graph_path = "knowledge_graph.png"
	        draw_static_graph(graph, static_graph_path)

	        # Step 5: Generate the interactive Plotly graph
	        interactive_fig = generate_interactive_plotly_graph(graph, pagerank)

	        # Step 6: Prepare formatted result
	        result_html = f"""
	<h3>Classification Result</h3>
	<p><strong>Classification:</strong> {classification}</p>
	<p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
	"""

	        return result_html, static_graph_path, interactive_fig
	    except Exception as e:
	        # UI boundary: keep the app responsive, but record the traceback and
	        # escape the message so exception text cannot inject markup.
	        logging.getLogger(__name__).exception("Failed to classify %s", pdf_path)
	        return f"<p style='color:red;'><strong>Error:</strong> {html.escape(str(e))}</p>", None, None

	# Gradio app instance
	# Custom stylesheet handed to gr.Blocks.
	_APP_CSS = """
	body {
	background: linear-gradient(to right, #6A11CB, #2575FC);
	color: white;
	font-family: 'Poppins', sans-serif;
	}
	.gr-button {
	background: #34A853;
	color: white;
	border-radius: 8px;
	padding: 10px 20px;
	font-size: 16px;
	transition: background 0.3s ease;
	}
	.gr-button:hover {
	background: #2F8A43;
	transform: scale(1.05);
	}
	.gr-markdown {
	font-family: 'Poppins', sans-serif;
	text-align: center;
	font-size: 18px;
	padding: 10px;
	background: rgba(255, 255, 255, 0.2);
	border-radius: 10px;
	}
	.gr-file-upload {
	border: 2px dashed #fff;
	padding: 20px;
	border-radius: 10px;
	transition: border-color 0.3s ease;
	}
	.gr-file-upload:hover {
	border-color: #34A853;
	}
	.gr-row {
	padding: 10px;
	justify-content: center;
	align-items: center;
	}
	"""

	# Introductory text shown at the top of the page.
	_INTRO_MARKDOWN = """
	# 📄 Research Paper Classifier with Knowledge Graphs
	Upload a PDF research paper, and the app will:
	1. Generate a Static Knowledge Graph
	2. Generate an Interactive Knowledge Graph (hover over nodes to see details)
	3. Calculate PageRank and classify the paper as Publishable or Non-Publishable
	"""

	with gr.Blocks(css=_APP_CSS) as demo:
	    gr.Markdown(_INTRO_MARKDOWN)

	    # Row 1: the PDF upload and the button that starts classification.
	    with gr.Row():
	        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
	        submit_btn = gr.Button("Classify Paper", elem_id="submit")

	    # Row 2: the HTML summary of the classification.
	    with gr.Row():
	        result_output = gr.HTML(label="Classification Result")

	    # Row 3: the static image next to the interactive plot.
	    with gr.Row():
	        static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
	        interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")

	    # The handler returns (html, image path, figure) in this same order.
	    _click_outputs = [result_output, static_graph_output, interactive_graph_output]
	    submit_btn.click(
	        fn=classify_and_visualize_pdf,
	        inputs=pdf_input,
	        outputs=_click_outputs,
	    )

	demo.launch()