"""Gradio app that classifies a research-paper PDF via a knowledge graph.

Pipeline: extract the PDF text, split it into token chunks with the SciBERT
tokenizer, ask a Groq-hosted LLM to turn each chunk into graph nodes and
relationships, merge them into one directed graph, run PageRank on it, and
label the paper from the mean of the five highest PageRank scores.

Configuration:
    GROQ_API_KEY: environment variable holding the Groq API key. It must be
        set before the module is imported.
"""

import html
import logging
import os
import tempfile

import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import pdfplumber
import plotly.graph_objects as go
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_groq import ChatGroq
from transformers import AutoTokenizer

logger = logging.getLogger(__name__)

# A paper is labelled "Publishable" when the mean of its top-5 PageRank
# scores is at least this value.
PUBLISHABLE_THRESHOLD = 0.0526

# Initialize components
scibert_model = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(scibert_model)

# The key is read from the environment; secrets must never be committed.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "it is required to call the Groq API."
    )
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
llm_transformer = LLMGraphTransformer(llm=llm)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Pages are joined without a separator. A page for which pdfplumber yields
    no text contributes an empty string instead of breaking the join.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return "".join(page.extract_text() or "" for page in pdf.pages)


def scibert_chunking(
    text: str, chunk_size: int = 256, max_chunks: int = 6
) -> list[str]:
    """Split *text* into at most *max_chunks* strings of *chunk_size* tokens.

    Tokens come from the module-level SciBERT tokenizer. Anything beyond the
    first ``chunk_size * max_chunks`` tokens is ignored. Empty text yields an
    empty list.
    """
    tokens = tokenizer.tokenize(text)
    limit = min(len(tokens), chunk_size * max_chunks)
    return [
        tokenizer.convert_tokens_to_string(tokens[start : start + chunk_size])
        for start in range(0, limit, chunk_size)
    ]


def process_text_with_llm(text: str) -> list:
    """Convert *text* into a list of LLM-generated graph documents.

    The text is chunked with :func:`scibert_chunking`; each chunk becomes one
    ``Document`` and the transformer returns one graph document per input.
    """
    documents = [
        Document(page_content=chunk) for chunk in scibert_chunking(text)
    ]
    if not documents:
        return []
    return list(llm_transformer.convert_to_graph_documents(documents))


def build_graph(graph_documents) -> nx.DiGraph:
    """Merge the nodes and relationships of *graph_documents* into a DiGraph.

    Each node is keyed by its ``id`` and carries a ``label`` attribute taken
    from its ``name`` property, falling back to the id. Each relationship
    becomes an edge with a ``type`` attribute.
    """
    graph = nx.DiGraph()
    for graph_doc in graph_documents:
        for node in graph_doc.nodes:
            graph.add_node(
                node.id, label=node.properties.get("name", node.id)
            )
        for rel in graph_doc.relationships:
            graph.add_edge(rel.source.id, rel.target.id, type=rel.type)
    return graph


def calculate_average_top_5_pagerank(graph: nx.DiGraph):
    """Return ``(average, pagerank)`` for *graph*.

    ``pagerank`` is the mapping produced by ``nx.pagerank``; ``average`` is
    the mean of its five largest values (fewer if the graph is smaller), or
    ``0`` when there are no scores at all.
    """
    pagerank = nx.pagerank(graph)
    top_5 = sorted(pagerank.values(), reverse=True)[:5]
    average = sum(top_5) / len(top_5) if top_5 else 0
    return average, pagerank


def draw_static_graph(graph: nx.DiGraph, output_path: str = "graph.png") -> None:
    """Render *graph* with matplotlib and save the image to *output_path*."""
    fig = plt.figure(figsize=(10, 8))
    try:
        pos = nx.spring_layout(graph, seed=42)
        nx.draw(
            graph,
            pos,
            with_labels=True,
            node_size=500,
            node_color="lightblue",
            font_size=8,
            font_weight="bold",
            edge_color="gray",
        )
        plt.title("Static Knowledge Graph")
        fig.savefig(output_path)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)


def generate_interactive_plotly_graph(graph: nx.DiGraph, pagerank: dict):
    """Build a Plotly figure of *graph* with nodes coloured by PageRank.

    Node text shows the node label and its PageRank score; nodes missing from
    *pagerank* are treated as having a score of 0.
    """
    pos = nx.spring_layout(graph, seed=42)  # Generate positions for nodes

    edge_x = []
    edge_y = []
    for source, target in graph.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        # ``None`` separates the individual line segments within one trace.
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color="#888"),
        hoverinfo="none",
        mode="lines",
    )

    node_x = []
    node_y = []
    node_text = []
    node_color = []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        # Default to node ID if label is missing
        label = graph.nodes[node].get("label", str(node))
        score = pagerank.get(node, 0)
        node_text.append(f"{label}<br>{score:.4f}")
        # Collected per node so colours stay aligned with the coordinates.
        node_color.append(score)

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode="markers+text",
        text=node_text,
        hoverinfo="text",
        marker=dict(
            showscale=True,
            colorscale="YlGnBu",
            size=10,
            color=node_color,
            colorbar=dict(
                thickness=15,
                # Nested title form; the flat ``titleside`` key is rejected
                # by newer Plotly releases.
                title=dict(text="PageRank", side="right"),
                xanchor="left",
            ),
        ),
    )

    fig = go.Figure(data=[edge_trace, node_trace])
    fig.update_layout(
        showlegend=False,
        hovermode="closest",
        margin=dict(b=0, l=0, r=0, t=0),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
    )
    return fig


def _error_html(message: str) -> str:
    """Return the HTML snippet shown in the result panel for an error."""
    return f"<div><p><strong>Error:</strong> {html.escape(message)}</p></div>"


def classify_and_visualize_pdf(pdf_path):
    """Classify the uploaded PDF and build both graph visualisations.

    Returns a ``(result_html, static_graph_path, interactive_fig)`` tuple
    matching the three Gradio outputs. On any failure the first element is an
    error message and the other two are ``None``.
    """
    if not pdf_path:
        return _error_html("Please upload a PDF file."), None, None

    try:
        # Step 1: Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if not text.strip():
            return (
                _error_html("No extractable text was found in the PDF."),
                None,
                None,
            )

        # Step 2: Process text to generate a knowledge graph
        graph = build_graph(process_text_with_llm(text))

        # Step 3: Calculate PageRank and classify
        avg_top_5_pagerank, pagerank = calculate_average_top_5_pagerank(graph)
        classification = (
            "Publishable"
            if avg_top_5_pagerank >= PUBLISHABLE_THRESHOLD
            else "Non-Publishable"
        )

        # Step 4: Draw the static graph and save as image. Each request gets
        # its own file so concurrent requests cannot overwrite one another.
        with tempfile.NamedTemporaryFile(
            prefix="knowledge_graph_", suffix=".png", delete=False
        ) as tmp:
            static_graph_path = tmp.name
        draw_static_graph(graph, static_graph_path)

        # Step 5: Generate the interactive Plotly graph
        interactive_fig = generate_interactive_plotly_graph(graph, pagerank)

        # Step 6: Prepare formatted result
        # NOTE(review): the original markup was not recoverable from the
        # mangled source; this is a minimal structural reconstruction.
        result_html = (
            "<div>"
            "<h2>Classification Result</h2>"
            f"<p><strong>Classification:</strong> {classification}</p>"
            "<p><strong>Average Top 5 PageRank:</strong> "
            f"{avg_top_5_pagerank:.4f}</p>"
            "</div>"
        )
        return result_html, static_graph_path, interactive_fig
    except Exception as e:
        # UI boundary: keep the traceback in the logs and show a safe,
        # escaped message to the user instead of crashing the handler.
        logger.exception("Failed to classify %s", pdf_path)
        return _error_html(str(e)), None, None


APP_CSS = """
body {
    background: linear-gradient(to right, #6A11CB, #2575FC);
    color: white;
    font-family: 'Poppins', sans-serif;
}
.gr-button {
    background: #34A853;
    color: white;
    border-radius: 8px;
    padding: 10px 20px;
    font-size: 16px;
    transition: background 0.3s ease;
}
.gr-button:hover {
    background: #2F8A43;
    transform: scale(1.05);
}
.gr-markdown {
    font-family: 'Poppins', sans-serif;
    text-align: center;
    font-size: 18px;
    padding: 10px;
    background: rgba(255, 255, 255, 0.2);
    border-radius: 10px;
}
.gr-file-upload {
    border: 2px dashed #fff;
    padding: 20px;
    border-radius: 10px;
    transition: border-color 0.3s ease;
}
.gr-file-upload:hover {
    border-color: #34A853;
}
.gr-row {
    padding: 10px;
    justify-content: center;
    align-items: center;
}
"""

# Gradio app instance
with gr.Blocks(css=APP_CSS) as demo:
    gr.Markdown(
        "# 📄 Research Paper Classifier with Knowledge Graphs\n"
        "Upload a PDF research paper, and the app will:\n"
        "1. **Generate a Static Knowledge Graph**\n"
        "2. **Generate an Interactive Knowledge Graph** "
        "(hover over nodes to see details)\n"
        "3. **Calculate PageRank and classify the paper as "
        "Publishable or Non-Publishable**\n"
    )

    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            type="filepath",
            file_types=[".pdf"],
            elem_id="upload",
        )
        submit_btn = gr.Button("Classify Paper", elem_id="submit")

    with gr.Row():
        result_output = gr.HTML(label="Classification Result")

    with gr.Row():
        static_graph_output = gr.Image(
            label="Static Knowledge Graph", type="filepath"
        )
        interactive_graph_output = gr.Plot(label="Interactive Knowledge Graph")

    submit_btn.click(
        fn=classify_and_visualize_pdf,
        inputs=pdf_input,
        outputs=[result_output, static_graph_output, interactive_graph_output],
    )

demo.launch()