Update app.py
app.py
CHANGED
@@ -1,3 +1,207 @@
+# # import gradio as gr
+# # import pdfplumber
+# # import networkx as nx
+# # import pandas as pd
+# # import matplotlib.pyplot as plt
+# # import plotly.graph_objects as go
+# # from transformers import AutoTokenizer
+# # from langchain_core.documents import Document
+# # from langchain_experimental.graph_transformers import LLMGraphTransformer
+# # from langchain_groq import ChatGroq
+# # import os
+
+# # # Initialize components
+# # scibert_model = "allenai/scibert_scivocab_uncased"
+# # tokenizer = AutoTokenizer.from_pretrained(scibert_model)
+# # groq_api_key = "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"
+# # llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
+# # llm_transformer = LLMGraphTransformer(llm=llm)
+
+
+# # def extract_text_from_pdf(pdf_path):
+# #     with pdfplumber.open(pdf_path) as pdf:
+# #         extracted_text = "".join([page.extract_text() for page in pdf.pages])
+# #     return extracted_text
+
+
+# # def scibert_chunking(text, chunk_size=256, max_chunks=6):
+# #     tokens = tokenizer.tokenize(text)
+# #     chunks = [
+# #         tokenizer.convert_tokens_to_string(tokens[i : i + chunk_size])
+# #         for i in range(0, min(len(tokens), chunk_size * max_chunks), chunk_size)
+# #     ]
+# #     return chunks
+
+
+# # def process_text_with_llm(text):
+# #     chunks = scibert_chunking(text)
+# #     documents = [Document(page_content=chunk) for chunk in chunks]
+# #     graph_documents = [
+# #         llm_transformer.convert_to_graph_documents([doc])[0] for doc in documents
+# #     ]
+# #     return graph_documents
+
+
+# # def build_graph(graph_documents):
+# #     graph = nx.DiGraph()
+# #     for graph_doc in graph_documents:
+# #         for node in graph_doc.nodes:
+# #             label = node.properties.get("name", node.id)
+# #             graph.add_node(node.id, label=label)
+# #         for rel in graph_doc.relationships:
+# #             graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
+# #     return graph
+
+
+# # def calculate_average_top_5_pagerank(graph):
+# #     pagerank = nx.pagerank(graph)
+# #     top_5 = sorted(pagerank.values(), reverse=True)[:5]
+# #     return sum(top_5) / len(top_5) if top_5 else 0, pagerank
+
+
+# # def draw_static_graph(graph, output_path="graph.png"):
+# #     plt.figure(figsize=(10, 8))
+# #     pos = nx.spring_layout(graph, seed=42)
+# #     nx.draw(
+# #         graph,
+# #         pos,
+# #         with_labels=True,
+# #         node_size=500,
+# #         node_color="lightblue",
+# #         font_size=8,
+# #         font_weight="bold",
+# #         edge_color="gray",
+# #     )
+# #     plt.title("Static Knowledge Graph")
+# #     plt.savefig(output_path)
+# #     plt.close()
+
+
+# # def generate_interactive_plotly_graph(graph, pagerank):
+# #     pos = nx.spring_layout(graph, seed=42)  # Generate positions for nodes
+
+# #     edge_x = []
+# #     edge_y = []
+# #     for edge in graph.edges():
+# #         x0, y0 = pos[edge[0]]
+# #         x1, y1 = pos[edge[1]]
+# #         edge_x.extend([x0, x1, None])
+# #         edge_y.extend([y0, y1, None])
+
+# #     edge_trace = go.Scatter(
+# #         x=edge_x,
+# #         y=edge_y,
+# #         line=dict(width=0.5, color="#888"),
+# #         hoverinfo="none",
+# #         mode="lines",
+# #     )
+
+# #     node_x = []
+# #     node_y = []
+# #     node_text = []
+# #     for node in graph.nodes():
+# #         x, y = pos[node]
+# #         node_x.append(x)
+# #         node_y.append(y)
+
+# #         label = graph.nodes[node].get("label", str(node))  # Default to node ID if label is missing
+# #         pagerank_score = pagerank.get(node, 0)
+# #         node_text.append(f"{label}<br>{pagerank_score:.4f}")
+
+# #     node_trace = go.Scatter(
+# #         x=node_x,
+# #         y=node_y,
+# #         mode="markers+text",
+# #         text=node_text,
+# #         hoverinfo="text",
+# #         marker=dict(
+# #             showscale=True,
+# #             colorscale="YlGnBu",
+# #             size=10,
+# #             color=list(pagerank.values()),
+# #             colorbar=dict(
+# #                 thickness=15,
+# #                 title="PageRank",
+# #                 xanchor="left",
+# #                 titleside="right"
+# #             ),
+# #         ),
+# #     )
+
+# #     fig = go.Figure(data=[edge_trace, node_trace])
+# #     fig.update_layout(
+# #         showlegend=False,
+# #         hovermode="closest",
+# #         margin=dict(b=0, l=0, r=0, t=0),
+# #         xaxis=dict(showgrid=False, zeroline=False),
+# #         yaxis=dict(showgrid=False, zeroline=False),
+# #     )
+
+# #     return fig
+
+
+# # def classify_and_visualize_pdf(pdf_path):
+# #     try:
+# #         # Step 1: Extract text from the PDF
+# #         text = extract_text_from_pdf(pdf_path)
+
+# #         # Step 2: Process text to generate a knowledge graph
+# #         graph_documents = process_text_with_llm(text)
+# #         graph = build_graph(graph_documents)
+
+# #         # Step 3: Calculate PageRank and classify
+# #         avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
+# #         classification = "Publishable" if avg_top_5_pagerank >= 0.0526 else "Non-Publishable"
+
+# #         # Step 4: Draw the static graph and save as image
+# #         static_graph_path = "knowledge_graph.png"
+# #         draw_static_graph(graph, static_graph_path)
+
+# #         # Step 5: Generate the interactive Plotly graph
+# #         interactive_fig = generate_interactive_plotly_graph(graph, pagerank)
+
+# #         # Step 6: Prepare formatted result
+# #         result_html = f"""
+# #         <h3>Classification Result</h3>
+# #         <p><strong>Classification:</strong> {classification}</p>
+# #         <p><strong>Average Top 5 PageRank:</strong> {avg_top_5_pagerank:.4f}</p>
+# #         """
+
+# #         return result_html, static_graph_path, interactive_fig
+# #     except Exception as e:
+# #         return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None
+
+
+# # # Gradio app instance
+# # with gr.Blocks() as demo:
+# #     gr.Markdown(
+# #         """
+# #         # π Research Paper Classifier with Knowledge Graphs
+# #         Upload a PDF research paper, and the app will:
+# #         1. **Generate a Static Knowledge Graph**
+# #         2. **Generate an Interactive Knowledge Graph** (hover over nodes to see details)
+# #         3. **Calculate PageRank and classify the paper as Publishable or Non-Publishable**
+# #         """
+# #     )
+
+# #     with gr.Row():
+# #         pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
+# #         submit_btn = gr.Button("Classify Paper")
+
+# #     with gr.Row():
+# #         result_output = gr.HTML(label="Classification Result")
+
+# #     with gr.Row():
+# #         static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
+# #         interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")
+
+# #     submit_btn.click(
+# #         fn=classify_and_visualize_pdf,
+# #         inputs=pdf_input,
+# #         outputs=[result_output, static_graph_output, interactive_graph_output],
+# #     )
+
+# # demo.launch()
 # import gradio as gr
 # import pdfplumber
 # import networkx as nx
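A note on the initialization block in the hunk above: it commits a literal Groq API key to source, and a key published in a Space like this should be treated as leaked and rotated. A minimal sketch of reading it from the environment instead (the variable name GROQ_API_KEY is an assumption, not part of this commit; on Spaces it would be set as a repository secret):

import os
from langchain_groq import ChatGroq

# assumed env var name; set outside the repo, e.g. as a Space secret
groq_api_key = os.environ.get("GROQ_API_KEY")
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")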
@@ -17,13 +221,11 @@
 # llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
 # llm_transformer = LLMGraphTransformer(llm=llm)
 
-
 # def extract_text_from_pdf(pdf_path):
 #     with pdfplumber.open(pdf_path) as pdf:
 #         extracted_text = "".join([page.extract_text() for page in pdf.pages])
 #     return extracted_text
 
-
 # def scibert_chunking(text, chunk_size=256, max_chunks=6):
 #     tokens = tokenizer.tokenize(text)
 #     chunks = [
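One caveat with extract_text_from_pdf as written: pdfplumber's page.extract_text() can return None for pages with no extractable text (scanned or image-only pages, depending on the pdfplumber version), which would make the "".join(...) raise a TypeError. A defensive variant, as a sketch rather than a change this commit makes:

import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # fall back to "" so image-only pages do not break the join
        return "".join(page.extract_text() or "" for page in pdf.pages)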
@@ -32,7 +234,6 @@
 #     ]
 #     return chunks
 
-
 # def process_text_with_llm(text):
 #     chunks = scibert_chunking(text)
 #     documents = [Document(page_content=chunk) for chunk in chunks]
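For reference, scibert_chunking caps the text it keeps at chunk_size * max_chunks = 256 * 6 = 1536 tokens; anything past that is silently dropped. A worked example of the window arithmetic (illustrative token counts only):

chunk_size, max_chunks = 256, 6
for n_tokens in (100, 1000, 5000):
    starts = list(range(0, min(n_tokens, chunk_size * max_chunks), chunk_size))
    print(n_tokens, starts)
# 100  -> [0]                   one short chunk
# 1000 -> [0, 256, 512, 768]    four chunks, the last only 232 tokens long
# 5000 -> [0, 256, ..., 1280]   capped at six chunks; tokens from 1536 on are discarded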
@@ -41,7 +242,6 @@
 #     ]
 #     return graph_documents
 
-
 # def build_graph(graph_documents):
 #     graph = nx.DiGraph()
 #     for graph_doc in graph_documents:
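process_text_with_llm calls the transformer once per single-document list and unwraps the one-element result. LLMGraphTransformer.convert_to_graph_documents also accepts the whole list of documents, so the comprehension could likely be collapsed to one call (a sketch; equivalence of the outputs is an assumption):

graph_documents = llm_transformer.convert_to_graph_documents(documents)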
@@ -52,13 +252,11 @@
 #         graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
 #     return graph
 
-
 # def calculate_average_top_5_pagerank(graph):
 #     pagerank = nx.pagerank(graph)
 #     top_5 = sorted(pagerank.values(), reverse=True)[:5]
 #     return sum(top_5) / len(top_5) if top_5 else 0, pagerank
 
-
 # def draw_static_graph(graph, output_path="graph.png"):
 #     plt.figure(figsize=(10, 8))
 #     pos = nx.spring_layout(graph, seed=42)
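On the classification rule this feeds: PageRank scores over a graph sum to 1, and the mean of the top five scores is at least the overall mean 1/n, so any graph with at most 19 nodes clears the 0.0526 cutoff (roughly 1/19) automatically; the threshold only discriminates once the extracted graph is reasonably large. A toy check of the same computation (networkx only; the graph here is made up):

import networkx as nx

g = nx.DiGraph()
g.add_edges_from([("a", "b"), ("b", "c"), ("c", "a"), ("d", "a")])
pagerank = nx.pagerank(g)
top_5 = sorted(pagerank.values(), reverse=True)[:5]
avg = sum(top_5) / len(top_5) if top_5 else 0
print(avg, "Publishable" if avg >= 0.0526 else "Non-Publishable")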
@@ -76,7 +274,6 @@
 #     plt.savefig(output_path)
 #     plt.close()
 
-
 # def generate_interactive_plotly_graph(graph, pagerank):
 #     pos = nx.spring_layout(graph, seed=42)  # Generate positions for nodes
 
@@ -139,7 +336,6 @@
 
 #     return fig
 
-
 # def classify_and_visualize_pdf(pdf_path):
 #     try:
 #         # Step 1: Extract text from the PDF
@@ -171,9 +367,15 @@
 #     except Exception as e:
 #         return f"<p style='color:red;'><strong>Error:</strong> {str(e)}</p>", None, None
 
-
 # # Gradio app instance
-# with gr.Blocks(
+# with gr.Blocks(css="""
+# body {background: linear-gradient(to right, #6A11CB, #2575FC); color: white;}
+# .gr-button {background: #34A853; color: white; border-radius: 8px;}
+# .gr-button:hover {background: #2F8A43;}
+# .gr-markdown {font-family: 'Roboto', sans-serif; text-align: center;}
+# .gr-file-upload {border: 2px dashed #fff;}
+# .gr-row {padding: 10px; justify-content: center; align-items: center;}
+# """) as demo:
 #     gr.Markdown(
 #         """
 #         # π Research Paper Classifier with Knowledge Graphs
@@ -185,8 +387,8 @@
 #     )
 
 #     with gr.Row():
-#         pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
-#         submit_btn = gr.Button("Classify Paper")
+#         pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
+#         submit_btn = gr.Button("Classify Paper", elem_id="submit")
 
 #     with gr.Row():
 #         result_output = gr.HTML(label="Classification Result")
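The two added lines above attach elem_id values, which Gradio emits as the HTML id of the rendered component, so the css string passed to gr.Blocks can target those components directly. A minimal sketch of that hookup (the selector bodies are illustrative; only elem_id="upload" and elem_id="submit" come from the diff):

import gradio as gr

css = """
#upload {border: 2px dashed #fff;}
#submit {background: #34A853; color: white;}
"""

with gr.Blocks(css=css) as demo:
    pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"], elem_id="upload")
    submit_btn = gr.Button("Classify Paper", elem_id="submit")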
@@ -369,12 +571,45 @@ def classify_and_visualize_pdf(pdf_path):
 
 # Gradio app instance
 with gr.Blocks(css="""
-body {background: linear-gradient(to right, #6A11CB, #2575FC); color: white;}
-.gr-button {background: #34A853; color: white; border-radius: 8px;}
-.gr-button:hover {background: #2F8A43;}
-.gr-markdown {font-family: 'Roboto', sans-serif; text-align: center;}
-.gr-file-upload {border: 2px dashed #fff;}
-.gr-row {padding: 10px; justify-content: center; align-items: center;}
+body {
+    background: linear-gradient(to right, #6A11CB, #2575FC);
+    color: white;
+    font-family: 'Poppins', sans-serif;
+}
+.gr-button {
+    background: #34A853;
+    color: white;
+    border-radius: 8px;
+    padding: 10px 20px;
+    font-size: 16px;
+    transition: background 0.3s ease;
+}
+.gr-button:hover {
+    background: #2F8A43;
+    transform: scale(1.05);
+}
+.gr-markdown {
+    font-family: 'Poppins', sans-serif;
+    text-align: center;
+    font-size: 18px;
+    padding: 10px;
+    background: rgba(255, 255, 255, 0.2);
+    border-radius: 10px;
+}
+.gr-file-upload {
+    border: 2px dashed #fff;
+    padding: 20px;
+    border-radius: 10px;
+    transition: border-color 0.3s ease;
+}
+.gr-file-upload:hover {
+    border-color: #34A853;
+}
+.gr-row {
+    padding: 10px;
+    justify-content: center;
+    align-items: center;
+}
 """) as demo:
     gr.Markdown(
         """
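Taken together, the commit comments out the previous compact theme (hunks at -171 and -185 above) and replaces the active css with the expanded Poppins-styled version, leaving the UI wiring unchanged. A condensed, runnable skeleton of that wiring, with the pipeline stubbed out (everything except the stub body mirrors the diff; note that class selectors like .gr-button depend on Gradio's internal DOM and may not match across major Gradio versions):

import gradio as gr

def classify_and_visualize_pdf(pdf_path):
    # stub standing in for the extract -> graph -> PageRank -> figures pipeline
    return "<p>result</p>", None, None

with gr.Blocks(css=".gr-button {background: #34A853; color: white;}") as demo:
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        submit_btn = gr.Button("Classify Paper")
    with gr.Row():
        result_output = gr.HTML(label="Classification Result")
    with gr.Row():
        static_graph_output = gr.Image(label="Static Knowledge Graph", type="filepath")
        interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")
    submit_btn.click(
        fn=classify_and_visualize_pdf,
        inputs=pdf_input,
        outputs=[result_output, static_graph_output, interactive_graph_output],
    )

demo.launch()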