Update app.py
app.py
CHANGED
@@ -1,3 +1,207 @@
+# import gradio as gr
+# import pdfplumber
+# import networkx as nx
+# import pandas as pd
+# import matplotlib.pyplot as plt
+# import plotly.graph_objects as go
+# from transformers import AutoTokenizer
+# from langchain_core.documents import Document
+# from langchain_experimental.graph_transformers import LLMGraphTransformer
+# from langchain_groq import ChatGroq
+# import os
+
+# # Initialize components
+# scibert_model = "allenai/scibert_scivocab_uncased"
+# tokenizer = AutoTokenizer.from_pretrained(scibert_model)
+# groq_api_key = "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"
+# llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
+# llm_transformer = LLMGraphTransformer(llm=llm)
+
+
+# def extract_text_from_pdf(pdf_path):
+#     with pdfplumber.open(pdf_path) as pdf:
+#         extracted_text = "".join([page.extract_text() for page in pdf.pages])
+#     return extracted_text
+
+
+# def scibert_chunking(text, chunk_size=256, max_chunks=6):
+#     tokens = tokenizer.tokenize(text)
+#     chunks = [
+#         tokenizer.convert_tokens_to_string(tokens[i : i + chunk_size])
+#         for i in range(0, min(len(tokens), chunk_size * max_chunks), chunk_size)
+#     ]
+#     return chunks
+
+
+# def process_text_with_llm(text):
+#     chunks = scibert_chunking(text)
+#     documents = [Document(page_content=chunk) for chunk in chunks]
+#     graph_documents = [
+#         llm_transformer.convert_to_graph_documents([doc])[0] for doc in documents
+#     ]
+#     return graph_documents
+
+
+# def build_graph(graph_documents):
+#     graph = nx.DiGraph()
+#     for graph_doc in graph_documents:
+#         for node in graph_doc.nodes:
+#             label = node.properties.get("name", node.id)
+#             graph.add_node(node.id, label=label)
+#         for rel in graph_doc.relationships:
+#             graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
+#     return graph
+
+
+# def calculate_average_top_5_pagerank(graph):
+#     pagerank = nx.pagerank(graph)
+#     top_5 = sorted(pagerank.values(), reverse=True)[:5]
+#     return sum(top_5) / len(top_5) if top_5 else 0, pagerank
+
+
+# def draw_static_graph(graph, output_path="graph.png"):
+#     plt.figure(figsize=(10, 8))
+#     pos = nx.spring_layout(graph, seed=42)
+#     nx.draw(
+#         graph,
+#         pos,
+#         with_labels=True,
+#         node_size=500,
+#         node_color="lightblue",
+#         font_size=8,
+#         font_weight="bold",
+#         edge_color="gray",
+#     )
+#     plt.title("Static Knowledge Graph")
+#     plt.savefig(output_path)
+#     plt.close()
+
+
+# def generate_interactive_plotly_graph(graph, pagerank):
+#     pos = nx.spring_layout(graph, seed=42)  # Generate positions for nodes
+
+#     edge_x = []
+#     edge_y = []
+#     for edge in graph.edges():
+#         x0, y0 = pos[edge[0]]
+#         x1, y1 = pos[edge[1]]
+#         edge_x.extend([x0, x1, None])
+#         edge_y.extend([y0, y1, None])
+
+#     edge_trace = go.Scatter(
+#         x=edge_x,
+#         y=edge_y,
+#         line=dict(width=0.5, color="#888"),
+#         hoverinfo="none",
+#         mode="lines",
+#     )
+
+#     node_x = []
+#     node_y = []
+#     node_text = []
+#     for node in graph.nodes():
+#         x, y = pos[node]
+#         node_x.append(x)
+#         node_y.append(y)
+
+#         label = graph.nodes[node].get("label", str(node))  # Default to node ID if label is missing
+#         pagerank_score = pagerank.get(node, 0)
+#         node_text.append(f"{label}<br>{pagerank_score:.4f}")
+
+#     node_trace = go.Scatter(
+#         x=node_x,
+#         y=node_y,
+#         mode="markers+text",
+#         text=node_text,
+#         hoverinfo="text",
+#         marker=dict(
+#             showscale=True,
+#             colorscale="YlGnBu",
+#             size=10,
+#             color=list(pagerank.values()),
+#             colorbar=dict(
+#                 thickness=15,
+#                 title="PageRank",
+#                 xanchor="left",
+#                 titleside="right"
+#             ),
+#         ),
+#     )
+
+#     fig = go.Figure(data=[edge_trace, node_trace])
+#     fig.update_layout(
+#         showlegend=False,
+#         hovermode="closest",
+#         margin=dict(b=0, l=0, r=0, t=0),
+#         xaxis=dict(showgrid=False, zeroline=False),
+#         yaxis=dict(showgrid=False, zeroline=False),
+#     )
+
+#     return fig
+
+
+# def classify_and_visualize_pdf(pdf_path):
+#     try:
+#         # Step 1: Extract text from the PDF
+#         text = extract_text_from_pdf(pdf_path)
+
+#         # Step 2: Process text to generate a knowledge graph
+#         graph_documents = process_text_with_llm(text)
+#         graph = build_graph(graph_documents)
+
+#         # Step 3: Calculate PageRank and classify
+#         avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
+#         classification = "Publishable" if avg_top_5_pagerank >= 0.0526 else "Non-Publishable"
+
+#         # Step 4: Draw the static graph and save as image
+#         static_graph_path = "knowledge_graph.png"
+#         draw_static_graph(graph, static_graph_path)
+
+#         # Step 5: Generate the interactive Plotly graph
+#         interactive_fig = generate_interactive_plotly_graph(graph, pagerank)
+
+#         # Step 6: Prepare formatted result
+#         result_html = f"""
+#         <h3>Classification Result</h3>
+#         <p><strong>Classification:</strong> {classification}</p>
+#         <p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
+#         """
+
+#         return result_html, static_graph_path, interactive_fig
+#     except Exception as e:
+#         return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None
+
+
+# # Gradio app instance
+# with gr.Blocks() as demo:
+#     gr.Markdown(
+#         """
+#         # π Research Paper Classifier with Knowledge Graphs
+#         Upload a PDF research paper, and the app will:
+#         1. **Generate a Static Knowledge Graph**
+#         2. **Generate an Interactive Knowledge Graph** (hover over nodes to see details)
+#         3. **Calculate PageRank and classify the paper as Publishable or Non-Publishable**
+#         """
+#     )
+
+#     with gr.Row():
+#         pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
+#         submit_btn = gr.Button("Classify Paper")
+
+#     with gr.Row():
+#         result_output = gr.HTML(label="Classification Result")
+
+#     with gr.Row():
+#         static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
+#         interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")
+
+#     submit_btn.click(
+#         fn=classify_and_visualize_pdf,
+#         inputs=pdf_input,
+#         outputs=[result_output, static_graph_output, interactive_graph_output],
+#     )
+
+# demo.launch()
 import gradio as gr
 import pdfplumber
 import networkx as nx
@@ -17,13 +221,11 @@ groq_api_key = "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"
 llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
 llm_transformer = LLMGraphTransformer(llm=llm)
 
-
 def extract_text_from_pdf(pdf_path):
     with pdfplumber.open(pdf_path) as pdf:
         extracted_text = "".join([page.extract_text() for page in pdf.pages])
     return extracted_text
 
-
 def scibert_chunking(text, chunk_size=256, max_chunks=6):
     tokens = tokenizer.tokenize(text)
     chunks = [
@@ -32,7 +234,6 @@ def scibert_chunking(text, chunk_size=256, max_chunks=6):
     ]
     return chunks
 
-
 def process_text_with_llm(text):
     chunks = scibert_chunking(text)
     documents = [Document(page_content=chunk) for chunk in chunks]
@@ -41,7 +242,6 @@ def process_text_with_llm(text):
     ]
     return graph_documents
 
-
 def build_graph(graph_documents):
     graph = nx.DiGraph()
     for graph_doc in graph_documents:
@@ -52,13 +252,11 @@ def build_graph(graph_documents):
             graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
     return graph
 
-
 def calculate_average_top_5_pagerank(graph):
     pagerank = nx.pagerank(graph)
     top_5 = sorted(pagerank.values(), reverse=True)[:5]
     return sum(top_5) / len(top_5) if top_5 else 0, pagerank
 
-
 def draw_static_graph(graph, output_path="graph.png"):
     plt.figure(figsize=(10, 8))
     pos = nx.spring_layout(graph, seed=42)
@@ -76,7 +274,6 @@ def draw_static_graph(graph, output_path="graph.png"):
     plt.savefig(output_path)
     plt.close()
 
-
 def generate_interactive_plotly_graph(graph, pagerank):
     pos = nx.spring_layout(graph, seed=42)  # Generate positions for nodes
 
@@ -139,7 +336,6 @@ def generate_interactive_plotly_graph(graph, pagerank):
 
     return fig
 
-
 def classify_and_visualize_pdf(pdf_path):
     try:
         # Step 1: Extract text from the PDF
@@ -171,9 +367,15 @@ def classify_and_visualize_pdf(pdf_path):
     except Exception as e:
         return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None
 
-
 # Gradio app instance
-with gr.Blocks() as demo:
+with gr.Blocks(css="""
+    body {background: linear-gradient(to right, #6A11CB, #2575FC); color: white;}
+    .gr-button {background: #34A853; color: white; border-radius: 8px;}
+    .gr-button:hover {background: #2F8A43;}
+    .gr-markdown {font-family: 'Roboto', sans-serif; text-align: center;}
+    .gr-file-upload {border: 2px dashed #fff;}
+    .gr-row {padding: 10px; justify-content: center; align-items: center;}
+""") as demo:
     gr.Markdown(
         """
         # π Research Paper Classifier with Knowledge Graphs
@@ -185,8 +387,8 @@ with gr.Blocks() as demo:
     )
 
     with gr.Row():
-        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
-        submit_btn = gr.Button("Classify Paper")
+        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
+        submit_btn = gr.Button("Classify Paper", elem_id="submit")
 
     with gr.Row():
         result_output = gr.HTML(label="Classification Result")
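
A few notes on this change. First, the diff leaves a live Groq API key hardcoded in app.py (it is even visible in a hunk header above). A key committed to a public Space should be treated as compromised and rotated; the usual pattern is to read it from the environment instead. A minimal sketch, assuming a GROQ_API_KEY repository secret is configured on the Space (the variable name is illustrative, not part of this commit):

    import os

    # Hypothetical replacement for the hardcoded key: read it from the
    # environment (on Spaces, set GROQ_API_KEY as a repository secret).
    groq_api_key = os.environ.get("GROQ_API_KEY")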
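
Second, extract_text_from_pdf (unchanged by this commit) joins page.extract_text() results directly, but pdfplumber returns None for pages with no extractable text (e.g. scanned images), which would make "".join(...) raise a TypeError. A None-safe variant of the same function might look like:

    import pdfplumber

    def extract_text_from_pdf(pdf_path):
        # page.extract_text() returns None for pages with no text layer
        # (e.g. image-only pages); fall back to "" so join() cannot fail.
        with pdfplumber.open(pdf_path) as pdf:
            return "".join(page.extract_text() or "" for page in pdf.pages)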
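
Third, for context on the classification rule: calculate_average_top_5_pagerank averages the five largest PageRank scores, and classify_and_visualize_pdf labels the paper "Publishable" when that average is at least 0.0526. PageRank scores sum to 1 over the graph and 0.0526 is roughly 1/19, so the rule effectively asks whether centrality is at least as concentrated as in a uniform 19-node graph; any graph with five or fewer nodes clears the threshold automatically. A self-contained sketch of the commit's own computation on a toy graph (the threshold constant is the commit's):

    import networkx as nx

    # Toy stand-in for an extracted knowledge graph.
    graph = nx.DiGraph([("paper", "model"), ("model", "dataset"), ("model", "metric")])

    pagerank = nx.pagerank(graph)  # scores over all nodes sum to 1.0
    top_5 = sorted(pagerank.values(), reverse=True)[:5]
    avg_top_5 = sum(top_5) / len(top_5) if top_5 else 0
    print("Publishable" if avg_top_5 >= 0.0526 else "Non-Publishable")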
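
Finally, the Gradio changes pair css= on gr.Blocks with elem_id hooks on the file input and button, yet the stylesheet still targets .gr-* class names, which are not stable across Gradio versions. Styling through the ids this commit already assigns is the more robust route. An illustrative sketch (the #upload/#submit ids come from this commit; the CSS rules themselves are placeholders, not the commit's styling):

    import gradio as gr

    # Target the commit's elem_ids instead of version-dependent .gr-* classes.
    css = """
    #submit {background: #34A853; color: white; border-radius: 8px;}
    #upload {border: 2px dashed #fff;}
    """

    with gr.Blocks(css=css) as demo:
        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
        submit_btn = gr.Button("Classify Paper", elem_id="submit")

    demo.launch()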