KDSH_Task_1 / app.py
delta-praticle's picture
Update app.py
34a0db0 verified
# # import gradio as gr
# # import pdfplumber
# # import networkx as nx
# # import pandas as pd
# # import matplotlib.pyplot as plt
# # import plotly.graph_objects as go
# # from transformers import AutoTokenizer
# # from langchain_core.documents import Document
# # from langchain_experimental.graph_transformers import LLMGraphTransformer
# # from langchain_groq import ChatGroq
# # import os
# # # Initialize components
# # scibert_model = "allenai/scibert_scivocab_uncased"
# # tokenizer = AutoTokenizer.from_pretrained(scibert_model)
# # groq_api_key = os.environ["GROQ_API_KEY"]  # secret redacted: never commit API keys
# # llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
# # llm_transformer = LLMGraphTransformer(llm=llm)
# # def extract_text_from_pdf(pdf_path):
# # with pdfplumber.open(pdf_path) as pdf:
# # extracted_text = "".join([page.extract_text() for page in pdf.pages])
# # return extracted_text
# # def scibert_chunking(text, chunk_size=256, max_chunks=6):
# # tokens = tokenizer.tokenize(text)
# # chunks = [
# # tokenizer.convert_tokens_to_string(tokens[i : i + chunk_size])
# # for i in range(0, min(len(tokens), chunk_size * max_chunks), chunk_size)
# # ]
# # return chunks
# # def process_text_with_llm(text):
# # chunks = scibert_chunking(text)
# # documents = [Document(page_content=chunk) for chunk in chunks]
# # graph_documents = [
# # llm_transformer.convert_to_graph_documents([doc])[0] for doc in documents
# # ]
# # return graph_documents
# # def build_graph(graph_documents):
# # graph = nx.DiGraph()
# # for graph_doc in graph_documents:
# # for node in graph_doc.nodes:
# # label = node.properties.get("name", node.id)
# # graph.add_node(node.id, label=label)
# # for rel in graph_doc.relationships:
# # graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
# # return graph
# # def calculate_average_top_5_pagerank(graph):
# # pagerank = nx.pagerank(graph)
# # top_5 = sorted(pagerank.values(), reverse=True)[:5]
# # return sum(top_5) / len(top_5) if top_5 else 0, pagerank
# # def draw_static_graph(graph, output_path="graph.png"):
# # plt.figure(figsize=(10, 8))
# # pos = nx.spring_layout(graph, seed=42)
# # nx.draw(
# # graph,
# # pos,
# # with_labels=True,
# # node_size=500,
# # node_color="lightblue",
# # font_size=8,
# # font_weight="bold",
# # edge_color="gray",
# # )
# # plt.title("Static Knowledge Graph")
# # plt.savefig(output_path)
# # plt.close()
# # def generate_interactive_plotly_graph(graph, pagerank):
# # pos = nx.spring_layout(graph, seed=42) # Generate positions for nodes
# # edge_x = []
# # edge_y = []
# # for edge in graph.edges():
# # x0, y0 = pos[edge[0]]
# # x1, y1 = pos[edge[1]]
# # edge_x.extend([x0, x1, None])
# # edge_y.extend([y0, y1, None])
# # edge_trace = go.Scatter(
# # x=edge_x,
# # y=edge_y,
# # line=dict(width=0.5, color="#888"),
# # hoverinfo="none",
# # mode="lines",
# # )
# # node_x = []
# # node_y = []
# # node_text = []
# # for node in graph.nodes():
# # x, y = pos[node]
# # node_x.append(x)
# # node_y.append(y)
# # label = graph.nodes[node].get("label", str(node)) # Default to node ID if label is missing
# # pagerank_score = pagerank.get(node, 0)
# # node_text.append(f"{label}<br>{pagerank_score:.4f}")
# # node_trace = go.Scatter(
# # x=node_x,
# # y=node_y,
# # mode="markers+text",
# # text=node_text,
# # hoverinfo="text",
# # marker=dict(
# # showscale=True,
# # colorscale="YlGnBu",
# # size=10,
# # color=list(pagerank.values()),
# # colorbar=dict(
# # thickness=15,
# # title="PageRank",
# # xanchor="left",
# # titleside="right"
# # ),
# # ),
# # )
# # fig = go.Figure(data=[edge_trace, node_trace])
# # fig.update_layout(
# # showlegend=False,
# # hovermode="closest",
# # margin=dict(b=0, l=0, r=0, t=0),
# # xaxis=dict(showgrid=False, zeroline=False),
# # yaxis=dict(showgrid=False, zeroline=False),
# # )
# # return fig
# # def classify_and_visualize_pdf(pdf_path):
# # try:
# # # Step 1: Extract text from the PDF
# # text = extract_text_from_pdf(pdf_path)
# # # Step 2: Process text to generate a knowledge graph
# # graph_documents = process_text_with_llm(text)
# # graph = build_graph(graph_documents)
# # # Step 3: Calculate PageRank and classify
# # avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
# # classification = "Publishable" if avg_top_5_pagerank >= 0.0526 else "Non-Publishable"
# # # Step 4: Draw the static graph and save as image
# # static_graph_path = "knowledge_graph.png"
# # draw_static_graph(graph, static_graph_path)
# # # Step 5: Generate the interactive Plotly graph
# # interactive_fig = generate_interactive_plotly_graph(graph, pagerank)
# # # Step 6: Prepare formatted result
# # result_html = f"""
# # <h3>Classification Result</h3>
# # <p><strong>Classification:</strong> {classification}</p>
# # <p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
# # """
# # return result_html, static_graph_path, interactive_fig
# # except Exception as e:
# # return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None
# # # Gradio app instance
# # with gr.Blocks() as demo:
# # gr.Markdown(
# # """
# # # 📄 Research Paper Classifier with Knowledge Graphs
# # Upload a PDF research paper, and the app will:
# # 1. **Generate a Static Knowledge Graph**
# # 2. **Generate an Interactive Knowledge Graph** (hover over nodes to see details)
# # 3. **Calculate PageRank and classify the paper as Publishable or Non-Publishable**
# # """
# # )
# # with gr.Row():
# # pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
# # submit_btn = gr.Button("Classify Paper")
# # with gr.Row():
# # result_output = gr.HTML(label="Classification Result")
# # with gr.Row():
# # static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
# # interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")
# # submit_btn.click(
# # fn=classify_and_visualize_pdf,
# # inputs=pdf_input,
# # outputs=[result_output, static_graph_output, interactive_graph_output],
# # )
# # demo.launch()
# import gradio as gr
# import pdfplumber
# import networkx as nx
# import pandas as pd
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# from transformers import AutoTokenizer
# from langchain_core.documents import Document
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_groq import ChatGroq
# import os
# # Initialize components
# scibert_model = "allenai/scibert_scivocab_uncased"
# tokenizer = AutoTokenizer.from_pretrained(scibert_model)
# groq_api_key = os.environ["GROQ_API_KEY"]  # secret redacted: never commit API keys
# llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
# llm_transformer = LLMGraphTransformer(llm=llm)
# def extract_text_from_pdf(pdf_path):
# with pdfplumber.open(pdf_path) as pdf:
# extracted_text = "".join([page.extract_text() for page in pdf.pages])
# return extracted_text
# def scibert_chunking(text, chunk_size=256, max_chunks=6):
# tokens = tokenizer.tokenize(text)
# chunks = [
# tokenizer.convert_tokens_to_string(tokens[i : i + chunk_size])
# for i in range(0, min(len(tokens), chunk_size * max_chunks), chunk_size)
# ]
# return chunks
# def process_text_with_llm(text):
# chunks = scibert_chunking(text)
# documents = [Document(page_content=chunk) for chunk in chunks]
# graph_documents = [
# llm_transformer.convert_to_graph_documents([doc])[0] for doc in documents
# ]
# return graph_documents
# def build_graph(graph_documents):
# graph = nx.DiGraph()
# for graph_doc in graph_documents:
# for node in graph_doc.nodes:
# label = node.properties.get("name", node.id)
# graph.add_node(node.id, label=label)
# for rel in graph_doc.relationships:
# graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
# return graph
# def calculate_average_top_5_pagerank(graph):
# pagerank = nx.pagerank(graph)
# top_5 = sorted(pagerank.values(), reverse=True)[:5]
# return sum(top_5) / len(top_5) if top_5 else 0, pagerank
# def draw_static_graph(graph, output_path="graph.png"):
# plt.figure(figsize=(10, 8))
# pos = nx.spring_layout(graph, seed=42)
# nx.draw(
# graph,
# pos,
# with_labels=True,
# node_size=500,
# node_color="lightblue",
# font_size=8,
# font_weight="bold",
# edge_color="gray",
# )
# plt.title("Static Knowledge Graph")
# plt.savefig(output_path)
# plt.close()
# def generate_interactive_plotly_graph(graph, pagerank):
# pos = nx.spring_layout(graph, seed=42) # Generate positions for nodes
# edge_x = []
# edge_y = []
# for edge in graph.edges():
# x0, y0 = pos[edge[0]]
# x1, y1 = pos[edge[1]]
# edge_x.extend([x0, x1, None])
# edge_y.extend([y0, y1, None])
# edge_trace = go.Scatter(
# x=edge_x,
# y=edge_y,
# line=dict(width=0.5, color="#888"),
# hoverinfo="none",
# mode="lines",
# )
# node_x = []
# node_y = []
# node_text = []
# for node in graph.nodes():
# x, y = pos[node]
# node_x.append(x)
# node_y.append(y)
# label = graph.nodes[node].get("label", str(node)) # Default to node ID if label is missing
# pagerank_score = pagerank.get(node, 0)
# node_text.append(f"{label}<br>{pagerank_score:.4f}")
# node_trace = go.Scatter(
# x=node_x,
# y=node_y,
# mode="markers+text",
# text=node_text,
# hoverinfo="text",
# marker=dict(
# showscale=True,
# colorscale="YlGnBu",
# size=10,
# color=list(pagerank.values()),
# colorbar=dict(
# thickness=15,
# title="PageRank",
# xanchor="left",
# titleside="right"
# ),
# ),
# )
# fig = go.Figure(data=[edge_trace, node_trace])
# fig.update_layout(
# showlegend=False,
# hovermode="closest",
# margin=dict(b=0, l=0, r=0, t=0),
# xaxis=dict(showgrid=False, zeroline=False),
# yaxis=dict(showgrid=False, zeroline=False),
# )
# return fig
# def classify_and_visualize_pdf(pdf_path):
# try:
# # Step 1: Extract text from the PDF
# text = extract_text_from_pdf(pdf_path)
# # Step 2: Process text to generate a knowledge graph
# graph_documents = process_text_with_llm(text)
# graph = build_graph(graph_documents)
# # Step 3: Calculate PageRank and classify
# avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
# classification = "Publishable" if avg_top_5_pagerank >= 0.0526 else "Non-Publishable"
# # Step 4: Draw the static graph and save as image
# static_graph_path = "knowledge_graph.png"
# draw_static_graph(graph, static_graph_path)
# # Step 5: Generate the interactive Plotly graph
# interactive_fig = generate_interactive_plotly_graph(graph, pagerank)
# # Step 6: Prepare formatted result
# result_html = f"""
# <h3>Classification Result</h3>
# <p><strong>Classification:</strong> {classification}</p>
# <p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
# """
# return result_html, static_graph_path, interactive_fig
# except Exception as e:
# return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None
# # Gradio app instance
# with gr.Blocks(css="""
# body {background: linear-gradient(to right, #6A11CB, #2575FC); color: white;}
# .gr-button {background: #34A853; color: white; border-radius: 8px;}
# .gr-button:hover {background: #2F8A43;}
# .gr-markdown {font-family: 'Roboto', sans-serif; text-align: center;}
# .gr-file-upload {border: 2px dashed #fff;}
# .gr-row {padding: 10px; justify-content: center; align-items: center;}
# """) as demo:
# gr.Markdown(
# """
# # 📄 Research Paper Classifier with Knowledge Graphs
# Upload a PDF research paper, and the app will:
# 1. **Generate a Static Knowledge Graph**
# 2. **Generate an Interactive Knowledge Graph** (hover over nodes to see details)
# 3. **Calculate PageRank and classify the paper as Publishable or Non-Publishable**
# """
# )
# with gr.Row():
# pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
# submit_btn = gr.Button("Classify Paper", elem_id="submit")
# with gr.Row():
# result_output = gr.HTML(label="Classification Result")
# with gr.Row():
# static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
# interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")
# submit_btn.click(
# fn=classify_and_visualize_pdf,
# inputs=pdf_input,
# outputs=[result_output, static_graph_output, interactive_graph_output],
# )
# demo.launch()
import html
import os

import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import pdfplumber
import plotly.graph_objects as go
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_groq import ChatGroq
from transformers import AutoTokenizer
# Initialize components shared by every request.
# The SciBERT tokenizer is only used to cut the extracted text into
# token-bounded chunks (see scibert_chunking).
scibert_model = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(scibert_model)

# The Groq API key is a secret and must not live in source control: read it
# from the environment (for example a Hugging Face Space secret) instead.
# NOTE(review): the key that was previously committed here should be revoked.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "The GROQ_API_KEY environment variable is not set; "
        "it is required to call the Groq LLM."
    )

# LLM that extracts entities/relations, wrapped in the graph transformer.
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
llm_transformer = LLMGraphTransformer(llm=llm)
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Pages are separated by a newline so that the last word of one page is not
    fused with the first word of the next. A page for which the extractor
    yields nothing (``None`` or an empty string, e.g. an image-only page)
    contributes an empty string instead of breaking the join.

    Args:
        pdf_path: Path (or file-like object) accepted by ``pdfplumber.open``.

    Returns:
        The concatenated page texts; an empty string if no page has text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against extract_text() returning None for a page.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)
def scibert_chunking(text, chunk_size=256, max_chunks=6):
    """Split *text* into at most *max_chunks* strings of *chunk_size* tokens.

    The text is tokenized with the module-level ``tokenizer``; consecutive
    windows of ``chunk_size`` tokens are converted back to strings. Tokens
    beyond ``chunk_size * max_chunks`` are dropped.

    Args:
        text: The text to split.
        chunk_size: Number of tokens per chunk.
        max_chunks: Upper bound on the number of chunks produced.

    Returns:
        A list of chunk strings, in document order.
    """
    tokens = tokenizer.tokenize(text)
    # Stop at whichever comes first: the end of the text or the chunk budget.
    limit = min(len(tokens), chunk_size * max_chunks)

    chunks = []
    for start in range(0, limit, chunk_size):
        window = tokens[start : start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
    return chunks
def process_text_with_llm(text):
    """Turn *text* into a list of graph documents, one per text chunk.

    The text is chunked with ``scibert_chunking``; each chunk is wrapped in a
    ``Document`` and sent on its own through ``llm_transformer``, keeping the
    first graph document returned for it.

    Args:
        text: The full text to analyse.

    Returns:
        A list with one graph document per chunk, in chunk order.
    """
    graph_documents = []
    for chunk in scibert_chunking(text):
        document = Document(page_content=chunk)
        # One document per call; the transformer returns a list, keep its head.
        converted = llm_transformer.convert_to_graph_documents([document])
        graph_documents.append(converted[0])
    return graph_documents
def build_graph(graph_documents):
    """Merge the nodes and relationships of all graph documents into one graph.

    Each node is added under its ``id`` with a ``label`` attribute taken from
    the node's ``"name"`` property, falling back to the id. Each relationship
    becomes a directed edge from its source id to its target id carrying the
    relationship type in the ``type`` attribute.

    Args:
        graph_documents: Iterable of objects exposing ``nodes`` and
            ``relationships``.

    Returns:
        A ``networkx.DiGraph`` combining every document.
    """
    graph = nx.DiGraph()
    for document in graph_documents:
        graph.add_nodes_from(
            (entity.id, {"label": entity.properties.get("name", entity.id)})
            for entity in document.nodes
        )
        graph.add_edges_from(
            (relation.source.id, relation.target.id, {"type": relation.type})
            for relation in document.relationships
        )
    return graph
def calculate_average_top_5_pagerank(graph):
    """Compute PageRank for *graph* and average its five highest scores.

    Args:
        graph: A networkx graph.

    Returns:
        A ``(average, scores)`` tuple: the mean of the (up to) five largest
        PageRank values, or ``0`` when there are no scores, together with the
        full node-to-score mapping returned by ``nx.pagerank``.
    """
    scores = nx.pagerank(graph)
    leading = sorted(scores.values(), reverse=True)[:5]
    if not leading:
        # No nodes: avoid dividing by zero.
        return 0, scores
    return sum(leading) / len(leading), scores
def draw_static_graph(graph, output_path="graph.png"):
    """Render *graph* with matplotlib and save the picture to *output_path*.

    The layout is a spring layout with a fixed seed, so the same graph is
    drawn the same way on every call.

    Args:
        graph: The networkx graph to draw.
        output_path: Destination image file.
    """
    figure = plt.figure(figsize=(10, 8))
    try:
        pos = nx.spring_layout(graph, seed=42)
        nx.draw(
            graph,
            pos,
            with_labels=True,
            node_size=500,
            node_color="lightblue",
            font_size=8,
            font_weight="bold",
            edge_color="gray",
        )
        plt.title("Static Knowledge Graph")
        figure.savefig(output_path)
    finally:
        # Always release the figure, even if layout/drawing/saving failed;
        # otherwise figures accumulate in pyplot's registry.
        plt.close(figure)
def generate_interactive_plotly_graph(graph, pagerank):
    """Build an interactive Plotly figure of *graph* coloured by PageRank.

    Edges are drawn as one line trace; nodes as one marker+text trace whose
    text shows the node label and its PageRank score, and whose colour encodes
    the score.

    Args:
        graph: The networkx graph to display. Nodes may carry a ``label``
            attribute; the node id is shown when it is missing.
        pagerank: Mapping from node to PageRank score. Nodes absent from the
            mapping are treated as having score 0.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    pos = nx.spring_layout(graph, seed=42)  # Generate positions for nodes

    # Each edge contributes (start, end, None); the None breaks the line so
    # that separate edges are not connected to each other.
    edge_x = []
    edge_y = []
    for source, target in graph.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color="#888"),
        hoverinfo="none",
        mode="lines",
    )

    node_x = []
    node_y = []
    node_text = []
    node_color = []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        label = graph.nodes[node].get("label", str(node))  # Default to node ID if label is missing
        pagerank_score = pagerank.get(node, 0)
        node_text.append(f"{label}<br>{pagerank_score:.4f}")
        # Collect colours in node order so each marker gets its own score,
        # regardless of the ordering or completeness of `pagerank`.
        node_color.append(pagerank_score)
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode="markers+text",
        text=node_text,
        hoverinfo="text",
        marker=dict(
            showscale=True,
            colorscale="YlGnBu",
            size=10,
            color=node_color,
            colorbar=dict(
                thickness=15,
                # Nested title form; the flat `titleside` property is
                # deprecated and rejected by newer Plotly releases.
                title=dict(text="PageRank", side="right"),
                xanchor="left",
            ),
        ),
    )

    fig = go.Figure(data=[edge_trace, node_trace])
    fig.update_layout(
        showlegend=False,
        hovermode="closest",
        margin=dict(b=0, l=0, r=0, t=0),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
    )
    return fig
def classify_and_visualize_pdf(pdf_path, threshold=0.0526):
    """Classify a PDF as publishable or not and build its graph visualisations.

    Pipeline: extract the text, build a knowledge graph from it, compute
    PageRank, compare the average of the top five scores with *threshold*,
    then render a static image and an interactive figure of the graph.

    Args:
        pdf_path: Path of the uploaded PDF. A falsy value (e.g. ``None`` when
            no file was provided) yields an error message instead of a result.
        threshold: Minimum average top-5 PageRank for the paper to be
            labelled "Publishable".

    Returns:
        A ``(result_html, static_graph_path, interactive_fig)`` tuple. On any
        failure the first element is an HTML error message and the other two
        are ``None``.
    """
    if not pdf_path:
        # Fail early with a readable message rather than a library traceback.
        return (
            "<p style='color:red;'><strong>Error:</strong> No PDF file was provided.</p>",
            None,
            None,
        )
    try:
        # Step 1: Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        # Step 2: Process text to generate a knowledge graph
        graph_documents = process_text_with_llm(text)
        graph = build_graph(graph_documents)
        # Step 3: Calculate PageRank and classify
        avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
        classification = "Publishable" if avg_top_5_pagerank >= threshold else "Non-Publishable"
        # Step 4: Draw the static graph and save as image
        static_graph_path = "knowledge_graph.png"
        draw_static_graph(graph, static_graph_path)
        # Step 5: Generate the interactive Plotly graph
        interactive_fig = generate_interactive_plotly_graph(graph, pagerank)
        # Step 6: Prepare formatted result
        result_html = f"""
<h3>Classification Result</h3>
<p><strong>Classification:</strong> {classification}</p>
<p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
"""
        return result_html, static_graph_path, interactive_fig
    except Exception as e:
        # Top-level UI boundary: report the failure in the page. The message
        # is escaped because exception text may contain markup characters.
        error_text = html.escape(str(e))
        return f"<p style='color:red;'><strong>Error:</strong> {error_text}</p>", None, None
# Gradio app instance: a single page with an upload row, a result row and a
# row holding the static and the interactive knowledge-graph views.
with gr.Blocks(css="""
body {
background: linear-gradient(to right, #6A11CB, #2575FC);
color: white;
font-family: 'Poppins', sans-serif;
}
.gr-button {
background: #34A853;
color: white;
border-radius: 8px;
padding: 10px 20px;
font-size: 16px;
transition: background 0.3s ease;
}
.gr-button:hover {
background: #2F8A43;
transform: scale(1.05);
}
.gr-markdown {
font-family: 'Poppins', sans-serif;
text-align: center;
font-size: 18px;
padding: 10px;
background: rgba(255, 255, 255, 0.2);
border-radius: 10px;
}
.gr-file-upload {
border: 2px dashed #fff;
padding: 20px;
border-radius: 10px;
transition: border-color 0.3s ease;
}
.gr-file-upload:hover {
border-color: #34A853;
}
.gr-row {
padding: 10px;
justify-content: center;
align-items: center;
}
""") as demo:
    # Page header. The Markdown text stays at column 0 so that it is not
    # rendered as an indented code block.
    gr.Markdown(
        """
# 📄 Research Paper Classifier with Knowledge Graphs
Upload a PDF research paper, and the app will:
1. **Generate a Static Knowledge Graph**
2. **Generate an Interactive Knowledge Graph** (hover over nodes to see details)
3. **Calculate PageRank and classify the paper as Publishable or Non-Publishable**
"""
    )
    # Input row: the PDF upload and the button that starts the analysis.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
        submit_btn = gr.Button("Classify Paper", elem_id="submit")
    # Output row: the textual classification result.
    with gr.Row():
        result_output = gr.HTML(label="Classification Result")
    # Output row: both renderings of the knowledge graph.
    with gr.Row():
        static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
        interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")
    # The three return values of classify_and_visualize_pdf map, in order,
    # onto the three output components.
    submit_btn.click(
        fn=classify_and_visualize_pdf,
        inputs=pdf_input,
        outputs=[result_output, static_graph_output, interactive_graph_output],
    )
demo.launch()