AD2000X committed on
Commit
dbf1e8d
·
verified ·
1 Parent(s): df261af

Update src/knowledge_graph.py

Browse files
Files changed (1) hide show
  1. src/knowledge_graph.py +919 -919
src/knowledge_graph.py CHANGED
@@ -1,920 +1,920 @@
1
- # src/knowledge_graph.py
2
-
3
- import networkx as nx
4
- from pyvis.network import Network
5
- import json
6
- from typing import Dict, List, Any, Optional, Set, Tuple
7
- import matplotlib.pyplot as plt
8
- import matplotlib.colors as mcolors
9
- from collections import defaultdict
10
-
11
- class KnowledgeGraph:
12
- """
13
- Handles the construction and visualization of knowledge graphs
14
- based on the ontology data.
15
- """
16
-
17
- def __init__(self, ontology_manager=None):
18
- """
19
- Initialize the knowledge graph handler.
20
-
21
- Args:
22
- ontology_manager: Optional ontology manager instance
23
- """
24
- self.ontology_manager = ontology_manager
25
- self.graph = None
26
-
27
- if ontology_manager:
28
- self.graph = ontology_manager.graph
29
-
30
def build_visualization_graph(
    self,
    include_classes: bool = True,
    include_instances: bool = True,
    central_entity: Optional[str] = None,
    max_distance: int = 2,
    include_properties: bool = False
) -> nx.Graph:
    """
    Build a simplified graph for visualization purposes.

    Args:
        include_classes: Whether to include class nodes
        include_instances: Whether to include instance nodes
        central_entity: Optional central entity to focus the graph on
        max_distance: Maximum distance from central entity to include
        include_properties: Whether to include property nodes

    Returns:
        A NetworkX graph suitable for visualization. It is undirected, so
        the direction of the source edges is not preserved.
    """
    if not self.graph:
        return nx.Graph()

    # Create an undirected graph for visualization
    viz_graph = nx.Graph()

    # If we have a central entity, extract a subgraph around it
    if central_entity and central_entity in self.graph:
        nodes_to_include = {central_entity}
        frontier = {central_entity}
        current_distance = 0

        while current_distance < max_distance and frontier:
            discovered = set()
            for node in frontier:
                # Edge direction is ignored when measuring distance
                discovered.update(self.graph.successors(node))
                discovered.update(self.graph.predecessors(node))

            # Only nodes seen for the first time need expanding next round;
            # re-expanding visited nodes cannot reach anything new.
            frontier = discovered - nodes_to_include
            nodes_to_include |= frontier
            current_distance += 1

        subgraph = self.graph.subgraph(nodes_to_include)
    else:
        subgraph = self.graph

    # Node pairs joined by a synthetic "has_property" edge created below
    synthetic_pairs = set()

    # Add nodes to the visualization graph
    for node, data in subgraph.nodes(data=True):
        node_type = data.get("type")

        # Skip nodes based on configuration
        if node_type == "class" and not include_classes:
            continue
        if node_type == "instance" and not include_instances:
            continue

        has_properties = node_type == "instance" and "properties" in data

        # Prefer the human-readable name of an instance over its id
        label = data["properties"].get("name", node) if has_properties else node

        viz_attrs = {
            "id": node,
            "label": label,
            "title": self._get_node_tooltip(node, data),
            "group": data.get("class_type", node_type),
            "shape": "dot" if node_type == "instance" else "diamond"
        }

        # Highlight central entity if specified
        if central_entity and node == central_entity:
            viz_attrs["color"] = "#ff7f0e"  # Orange for central entity
            viz_attrs["size"] = 25  # Larger size for central entity

        viz_graph.add_node(node, **viz_attrs)

        # Add property nodes if configured
        if include_properties and has_properties:
            for prop_name, prop_value in data["properties"].items():
                prop_node_id = f"{node}_{prop_name}"
                prop_value_str = str(prop_value)
                if len(prop_value_str) > 20:
                    # Keep labels short; the full value stays in the tooltip
                    prop_value_str = prop_value_str[:17] + "..."

                viz_graph.add_node(
                    prop_node_id,
                    id=prop_node_id,
                    label=f"{prop_name}: {prop_value_str}",
                    title=f"{prop_name}: {prop_value}",
                    group="property",
                    shape="ellipse",
                    size=5
                )

                # Connect instance to property
                viz_graph.add_edge(node, prop_node_id, label="has_property", dashes=True)
                synthetic_pairs.add(frozenset((node, prop_node_id)))

    # Add edges to the visualization graph
    for source, target, data in subgraph.edges(data=True):
        # Only include edges between nodes that are in the viz_graph
        if source not in viz_graph or target not in viz_graph:
            continue

        # Do not let a source edge overwrite a synthetic has_property edge.
        # Matching on the exact pairs created above (rather than on an
        # "<id>_" string prefix) keeps genuine edges between nodes whose ids
        # merely share a prefix and also works for non-string node ids.
        if frozenset((source, target)) in synthetic_pairs:
            continue

        edge_type = data.get("type", "unknown")

        # Don't show subClassOf and instanceOf relationships if not explicitly requested
        if edge_type in ("subClassOf", "instanceOf") and not include_classes:
            continue

        viz_graph.add_edge(source, target, label=edge_type, title=edge_type)

    return viz_graph
155
-
156
- def _get_node_tooltip(self, node_id: str, data: Dict) -> str:
157
- """Generate a tooltip for a node."""
158
- tooltip = f"<strong>ID:</strong> {node_id}<br>"
159
-
160
- node_type = data.get("type")
161
- if node_type:
162
- tooltip += f"<strong>Type:</strong> {node_type}<br>"
163
-
164
- if node_type == "instance":
165
- tooltip += f"<strong>Class:</strong> {data.get('class_type', 'unknown')}<br>"
166
-
167
- # Add properties
168
- if "properties" in data:
169
- tooltip += "<strong>Properties:</strong><br>"
170
- for key, value in data["properties"].items():
171
- tooltip += f"- {key}: {value}<br>"
172
-
173
- elif node_type == "class":
174
- tooltip += f"<strong>Description:</strong> {data.get('description', '')}<br>"
175
-
176
- # Add properties if available
177
- if "properties" in data:
178
- tooltip += "<strong>Properties:</strong> " + ", ".join(data["properties"]) + "<br>"
179
-
180
- return tooltip
181
-
182
def generate_html_visualization(
    self,
    include_classes: bool = True,
    include_instances: bool = True,
    central_entity: Optional[str] = None,
    max_distance: int = 2,
    include_properties: bool = False,
    height: str = "600px",
    width: str = "100%",
    bgcolor: str = "#ffffff",
    font_color: str = "#000000",
    layout_algorithm: str = "force-directed"
) -> str:
    """
    Generate an HTML visualization of the knowledge graph.

    Args:
        include_classes: Whether to include class nodes
        include_instances: Whether to include instance nodes
        central_entity: Optional central entity to focus the graph on
        max_distance: Maximum distance from central entity to include
        include_properties: Whether to include property nodes
        height: Height of the visualization
        width: Width of the visualization
        bgcolor: Background color
        font_color: Font color
        layout_algorithm: Algorithm for layout ('force-directed', 'hierarchical',
            'radial', 'circular'). Any other value falls back to 'force-directed'.

    Returns:
        HTML string containing the visualization
    """
    # Build the visualization graph
    viz_graph = self.build_visualization_graph(
        include_classes=include_classes,
        include_instances=include_instances,
        central_entity=central_entity,
        max_distance=max_distance,
        include_properties=include_properties
    )

    # Create a PyVis network
    net = Network(height=height, width=width, bgcolor=bgcolor, font_color=font_color, directed=True)

    # Configure physics (and, for the hierarchical case, layout) options
    layout_options = None
    if layout_algorithm == "hierarchical":
        physics_options = {
            "enabled": True,
            "hierarchicalRepulsion": {
                "centralGravity": 0.0,
                "springLength": 100,
                "springConstant": 0.01,
                "nodeDistance": 120
            },
            "solver": "hierarchicalRepulsion",
            "stabilization": {"enabled": True, "iterations": 100}
        }
        # Kept in the same dict as every other option: assigning
        # net.options below would discard anything configured earlier
        # through net.set_options().
        layout_options = {
            "hierarchical": {
                "enabled": True,
                "direction": "UD",
                "sortMethod": "directed",
                "nodeSpacing": 150,
                "treeSpacing": 200
            }
        }
    elif layout_algorithm == "radial":
        physics_options = {
            "enabled": True,
            "solver": "repulsion",
            "repulsion": {
                "nodeDistance": 120,
                "centralGravity": 0.2,
                "springLength": 200,
                "springConstant": 0.05
            },
            "stabilization": {"enabled": True, "iterations": 100}
        }
    elif layout_algorithm == "circular":
        physics_options = {"enabled": False}

        # Compute circular layout and pin every node to its position
        for node_id, (x, y) in nx.circular_layout(viz_graph).items():
            node_attrs = viz_graph.nodes[node_id]
            node_attrs["x"] = float(x) * 500
            node_attrs["y"] = float(y) * 500
            node_attrs["physics"] = False
    else:
        # "force-directed", and the fallback for unrecognised names so that
        # physics_options is always defined.
        physics_options = {
            "enabled": True,
            "solver": "forceAtlas2Based",
            "forceAtlas2Based": {
                "gravitationalConstant": -50,
                "centralGravity": 0.01,
                "springLength": 100,
                "springConstant": 0.08
            },
            "stabilization": {"enabled": True, "iterations": 100}
        }

    # Configure other options
    options = {
        "nodes": {
            "font": {"size": 12},
            "scaling": {"min": 10, "max": 30}
        },
        "edges": {
            "color": {"inherit": True},
            "smooth": {"enabled": True, "type": "dynamic"},
            "arrows": {"to": {"enabled": True, "scaleFactor": 0.5}},
            "font": {"size": 10, "align": "middle"}
        },
        "physics": physics_options,
        "interaction": {
            "hover": True,
            "navigationButtons": True,
            "keyboard": True,
            "tooltipDelay": 100
        }
    }
    if layout_options is not None:
        options["layout"] = layout_options

    # Set options and create the network
    net.options = options
    net.from_nx(viz_graph)

    # Custom CSS for better visualization
    custom_css = """
    <style>
    .vis-network {
        border: 1px solid #ddd;
        border-radius: 5px;
    }
    .vis-tooltip {
        position: absolute;
        background-color: #f5f5f5;
        border: 1px solid #ccc;
        border-radius: 4px;
        padding: 10px;
        font-family: Arial, sans-serif;
        font-size: 12px;
        color: #333;
        max-width: 300px;
        z-index: 9999;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    </style>
    """

    # Generate the HTML and add custom CSS
    page = net.generate_html()
    if "<style>" in page:
        page = page.replace("<style>", custom_css + "<style>", 1)
    elif "</head>" in page:
        # No bare <style> tag to anchor on: fall back to the end of <head>
        # so the CSS is not silently dropped.
        page = page.replace("</head>", custom_css + "</head>", 1)

    # Add legend
    legend_html = self._generate_legend_html(viz_graph)
    page = page.replace("</body>", legend_html + "</body>")

    return page
357
-
358
def _generate_legend_html(self, graph: nx.Graph) -> str:
    """
    Generate a legend for the visualization.

    Args:
        graph: Visualization graph whose nodes may carry a "group" attribute

    Returns:
        HTML fragment with one legend entry per distinct group
    """
    from html import escape  # local import keeps this method self-contained

    # Collect unique groups. Nodes without a usable group (None) are left
    # out: they would render as "None" and break sorting next to strings.
    groups = {
        attrs["group"]
        for _, attrs in graph.nodes(data=True)
        if attrs.get("group") is not None
    }

    group_colors = {"property": "#ffcc99", "class": "#a1d3a2"}
    default_color = "#97c2fc"

    pieces = ["""
    <div id="graph-legend" style="position: absolute; top: 10px; right: 10px; background-color: rgba(255,255,255,0.8);
    padding: 10px; border-radius: 5px; border: 1px solid #ddd; max-width: 200px;">
    <strong>Legend:</strong>
    <ul style="list-style-type: none; padding-left: 0; margin-top: 5px;">
    """]

    # key=str gives a stable order even if group values are not all strings
    for group in sorted(groups, key=str):
        color = group_colors.get(group, default_color)
        pieces.append(f"""
        <li style="margin-bottom: 5px;">
        <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
        background-color: {color}; margin-right: 5px;"></span>
        {escape(str(group), quote=False)}
        </li>
        """)

    # Close the legend container
    pieces.append("""
    </ul>
    <div style="font-size: 10px; margin-top: 5px; color: #666;">
    Double-click to zoom, drag to pan, scroll to zoom in/out
    </div>
    </div>
    """)

    return "".join(pieces)
400
-
401
def get_graph_statistics(self) -> Dict[str, Any]:
    """
    Calculate statistics about the knowledge graph.

    Returns:
        A dictionary containing graph statistics, or an empty dictionary
        when there is no graph (or the graph has no nodes). Metrics that
        cannot be computed are reported as 0 / an empty dict.
    """
    if not self.graph:
        return {}

    # Count nodes by type and instances per class in a single pass
    class_count = 0
    instance_count = 0
    property_count = 0
    class_counts = defaultdict(int)

    for _, data in self.graph.nodes(data=True):
        node_type = data.get("type")
        if node_type == "class":
            class_count += 1
        elif node_type == "instance":
            instance_count += 1
            class_counts[data.get("class_type", "unknown")] += 1
            # NOTE(review): nesting reconstructed from a whitespace-stripped
            # source; assumed that only instance properties are counted.
            if "properties" in data:
                property_count += len(data["properties"])

    # Count edges by type
    relationship_counts = defaultdict(int)
    for _, _, data in self.graph.edges(data=True):
        relationship_counts[data.get("type", "unknown")] += 1

    # Calculate graph metrics (best effort: any failure zeroes the group)
    try:
        # Some metrics only work on undirected graphs
        undirected = nx.Graph(self.graph)
        avg_degree = sum(dict(undirected.degree()).values()) / undirected.number_of_nodes()

        # Path-based metrics need a connected graph, so fall back to the
        # largest connected component when the graph is disconnected.
        if nx.is_connected(undirected):
            component = undirected
        else:
            largest_cc = max(nx.connected_components(undirected), key=len)
            component = undirected.subgraph(largest_cc)

        avg_path_length = nx.average_shortest_path_length(component)
        diameter = nx.diameter(component)

        density = nx.density(self.graph)
        clustering = nx.average_clustering(undirected)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate out of these potentially slow calls.
        avg_degree = 0
        avg_path_length = 0
        diameter = 0
        density = 0
        clustering = 0

    # Get nodes with highest centrality
    try:
        betweenness = nx.betweenness_centrality(self.graph)
        degree = nx.degree_centrality(self.graph)

        # Top 5 nodes for each centrality measure
        top_betweenness = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]
        top_degree = sorted(degree.items(), key=lambda x: x[1], reverse=True)[:5]

        central_nodes = {
            "betweenness": [{"node": node, "centrality": round(cent, 3)} for node, cent in top_betweenness],
            "degree": [{"node": node, "centrality": round(cent, 3)} for node, cent in top_degree]
        }
    except Exception:
        central_nodes = {}

    return {
        "node_count": self.graph.number_of_nodes(),
        "edge_count": self.graph.number_of_edges(),
        "class_count": class_count,
        "instance_count": instance_count,
        "property_count": property_count,
        "relationship_counts": dict(relationship_counts),
        "class_instance_counts": dict(class_counts),
        "average_degree": avg_degree,
        "average_path_length": avg_path_length,
        "diameter": diameter,
        "density": density,
        "clustering_coefficient": clustering,
        "central_nodes": central_nodes
    }
499
-
500
def find_paths_between_entities(
    self,
    source_entity: str,
    target_entity: str,
    max_length: int = 3
) -> List[List[Dict]]:
    """
    Find all paths between two entities up to a maximum length.

    Args:
        source_entity: Starting entity ID
        target_entity: Target entity ID
        max_length: Maximum path length

    Returns:
        A list of paths, where each path is a list of edge dictionaries
        with "source", "target" and "type" keys. Paths made up solely of
        subClassOf / instanceOf edges are omitted. Shorter paths come first.
    """
    if not self.graph or source_entity not in self.graph or target_entity not in self.graph:
        return []

    # Use networkx to find simple paths
    try:
        node_paths = list(nx.all_simple_paths(
            self.graph, source_entity, target_entity, cutoff=max_length
        ))
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return []

    structural_types = ("subClassOf", "instanceOf")

    paths = []
    seen_node_paths = set()
    for node_path in node_paths:
        # With parallel edges the same node sequence may be reported more
        # than once; all parallel edges are expanded below anyway, so each
        # sequence is processed a single time.
        path_key = tuple(node_path)
        if path_key in seen_node_paths:
            continue
        seen_node_paths.add(path_key)

        # Convert the node sequence to an edge sequence
        edge_sequence = []
        for source, target in zip(node_path, node_path[1:]):
            # presumably self.graph is a multigraph (edge data keyed by
            # edge key); verify against the ontology manager
            edges = self.graph.get_edge_data(source, target)
            if edges:
                for data in edges.values():
                    edge_sequence.append({
                        "source": source,
                        "target": target,
                        "type": data.get("type", "unknown")
                    })

        # Only include the path if it has at least one meaningful
        # (non-structural) relationship
        if any(edge["type"] not in structural_types for edge in edge_sequence):
            paths.append(edge_sequence)

    # Sort paths by length (shorter paths first)
    paths.sort(key=len)

    return paths
558
-
559
def get_entity_neighborhood(
    self,
    entity_id: str,
    max_distance: int = 1,
    include_classes: bool = True
) -> Dict[str, Any]:
    """
    Get the neighborhood of an entity.

    Args:
        entity_id: The central entity ID
        max_distance: Maximum distance from the central entity
        include_classes: Whether to include class relationships

    Returns:
        A dictionary containing the neighborhood information
        ("central_entity", "neighbors", "neighbors_by_distance",
        "total_neighbors"), or an empty dictionary when the graph or the
        entity is missing.
    """
    if not self.graph or entity_id not in self.graph:
        return {}

    structural_types = ("subClassOf", "instanceOf")

    # Breadth-first search ignoring edge direction; distance_of records the
    # hop count at which each node was first reached (insertion-ordered).
    distance_of = {entity_id: 0}
    frontier = [entity_id]
    for distance in range(1, max_distance + 1):
        next_frontier = []
        for node in frontier:
            adjacent = list(self.graph.successors(node)) + list(self.graph.predecessors(node))
            for neighbor in adjacent:
                # Skip class nodes if not including classes
                neighbor_data = self.graph.nodes.get(neighbor, {})
                if not include_classes and neighbor_data.get("type") == "class":
                    continue

                if neighbor not in distance_of:
                    distance_of[neighbor] = distance
                    next_frontier.append(neighbor)
        frontier = next_frontier

    # Build neighbor information
    neighbors = []
    for node, distance in distance_of.items():
        if node == entity_id:
            continue

        node_data = self.graph.nodes[node]
        relations = []

        # Direct relationships, first with the central entity as source,
        # then as target.
        direct_edges = (
            ("outgoing", self.graph.get_edge_data(entity_id, node)),
            ("incoming", self.graph.get_edge_data(node, entity_id)),
        )
        for direction, edges in direct_edges:
            if not edges:
                continue
            # presumably a multigraph (edge data keyed by edge key);
            # verify against the ontology manager
            for data in edges.values():
                rel_type = data.get("type", "unknown")

                # Skip structural relationships if not including classes
                if not include_classes and rel_type in structural_types:
                    continue

                relations.append({
                    "type": rel_type,
                    "direction": direction
                })

        # Only look for an indirect connection if there is no direct one
        if not relations and max_distance >= 2:
            # all_simple_paths has no "exact length" option, so filter for
            # paths with at least two edges and keep the shortest.
            # NOTE(review): only directed paths leaving the central entity
            # are considered, so a node reached by the BFS purely through
            # incoming edges ends up without relations and is omitted.
            try:
                candidates = [
                    path for path in nx.all_simple_paths(
                        self.graph, entity_id, node, cutoff=max_distance
                    )
                    if len(path) > 2
                ]
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                candidates = []

            if candidates:
                # Only need one example of an indirect path
                shortest = min(candidates, key=len)
                relations.append({
                    "type": "indirect_connection",
                    "direction": "outgoing",
                    "path_length": len(shortest) - 1,
                    "intermediates": shortest[1:-1]  # Skip source and target
                })

        # Only include neighbors with relations
        if relations:
            neighbors.append({
                "id": node,
                "type": node_data.get("type"),
                "class_type": node_data.get("class_type"),
                "properties": node_data.get("properties", {}),
                "relations": relations,
                "distance": distance
            })

    # Group neighbors by distance
    neighbors_by_distance = defaultdict(list)
    for neighbor in neighbors:
        neighbors_by_distance[neighbor["distance"]].append(neighbor)

    # Get central entity info
    central_data = self.graph.nodes[entity_id]

    return {
        "central_entity": {
            "id": entity_id,
            "type": central_data.get("type"),
            "class_type": central_data.get("class_type", ""),
            "properties": central_data.get("properties", {})
        },
        "neighbors": neighbors,
        "neighbors_by_distance": dict(neighbors_by_distance),
        "total_neighbors": len(neighbors)
    }
706
-
707
def find_common_patterns(self) -> List[Dict[str, Any]]:
    """
    Collect the recurring patterns and structures found in the knowledge graph.

    Relationship patterns come first, followed by a single "hub_entities"
    entry (when any hubs exist), followed by property patterns.

    Returns:
        A list of pattern dictionaries; empty when there is no graph
    """
    if not self.graph:
        return []

    collected = []

    # Recurring (source type, relationship, target type) combinations
    collected += self._find_relationship_patterns() or []

    # Highly connected entities are reported together as one pattern
    hubs = self._find_hub_entities()
    if hubs:
        hub_pattern = {
            "type": "hub_entities",
            "description": "Entities with high connectivity serving as knowledge hubs",
            "entities": hubs
        }
        collected.append(hub_pattern)

    # Properties shared by instances of the same class
    collected += self._find_property_patterns() or []

    return collected
739
-
740
- def _find_relationship_patterns(self) -> List[Dict[str, Any]]:
741
- """Find common relationship patterns in the graph."""
742
- # Count relationship triplets (source_type, relation, target_type)
743
- triplet_counts = defaultdict(int)
744
-
745
- for source, target, data in self.graph.edges(data=True):
746
- rel_type = data.get("type", "unknown")
747
-
748
- # Skip structural relationships
749
- if rel_type in ["subClassOf", "instanceOf"]:
750
- continue
751
-
752
- # Get node types
753
- source_data = self.graph.nodes[source]
754
- target_data = self.graph.nodes[target]
755
-
756
- source_type = (
757
- source_data.get("class_type")
758
- if source_data.get("type") == "instance"
759
- else source_data.get("type")
760
- )
761
-
762
- target_type = (
763
- target_data.get("class_type")
764
- if target_data.get("type") == "instance"
765
- else target_data.get("type")
766
- )
767
-
768
- if source_type and target_type:
769
- triplet = (source_type, rel_type, target_type)
770
- triplet_counts[triplet] += 1
771
-
772
- # Get patterns with significant frequency (more than 1 occurrence)
773
- patterns = []
774
- for triplet, count in triplet_counts.items():
775
- if count > 1:
776
- source_type, rel_type, target_type = triplet
777
-
778
- # Find examples of this pattern
779
- examples = []
780
- for source, target, data in self.graph.edges(data=True):
781
- if len(examples) >= 3: # Limit to 3 examples
782
- break
783
-
784
- rel = data.get("type", "unknown")
785
- if rel != rel_type:
786
- continue
787
-
788
- source_data = self.graph.nodes[source]
789
- target_data = self.graph.nodes[target]
790
-
791
- current_source_type = (
792
- source_data.get("class_type")
793
- if source_data.get("type") == "instance"
794
- else source_data.get("type")
795
- )
796
-
797
- current_target_type = (
798
- target_data.get("class_type")
799
- if target_data.get("type") == "instance"
800
- else target_data.get("type")
801
- )
802
-
803
- if current_source_type == source_type and current_target_type == target_type:
804
- # Get readable names if available
805
- source_name = source
806
- if source_data.get("type") == "instance" and "properties" in source_data:
807
- properties = source_data["properties"]
808
- if "name" in properties:
809
- source_name = properties["name"]
810
-
811
- target_name = target
812
- if target_data.get("type") == "instance" and "properties" in target_data:
813
- properties = target_data["properties"]
814
- if "name" in properties:
815
- target_name = properties["name"]
816
-
817
- examples.append({
818
- "source": source,
819
- "source_name": source_name,
820
- "target": target,
821
- "target_name": target_name,
822
- "relationship": rel_type
823
- })
824
-
825
- patterns.append({
826
- "type": "relationship_pattern",
827
- "description": f"{source_type} {rel_type} {target_type}",
828
- "source_type": source_type,
829
- "relationship": rel_type,
830
- "target_type": target_type,
831
- "count": count,
832
- "examples": examples
833
- })
834
-
835
- # Sort by frequency
836
- patterns.sort(key=lambda x: x["count"], reverse=True)
837
-
838
- return patterns
839
-
840
def _find_hub_entities(self) -> List[Dict[str, Any]]:
    """
    Find entities that serve as hubs (many connections).

    Returns:
        Up to ten instance nodes with the highest degree centrality, each
        described by id, name, class type, centrality and a per-type count
        of its non-structural relationships, sorted by total connections.
    """
    # Calculate degree centrality
    degree = nx.degree_centrality(self.graph)

    # Filter to instances BEFORE taking the top ten; otherwise highly
    # connected class nodes use up the slots and few or no hubs are left.
    instance_scores = [
        (node, centrality)
        for node, centrality in degree.items()
        if self.graph.nodes[node].get("type") == "instance"
    ]
    top_entities = sorted(instance_scores, key=lambda x: x[1], reverse=True)[:10]

    directed = self.graph.is_directed()

    hub_entities = []
    for node, centrality in top_entities:
        node_data = self.graph.nodes[node]

        # Get class type
        class_type = node_data.get("class_type", "unknown")

        # Get name if available
        name = node
        if "properties" in node_data and "name" in node_data["properties"]:
            name = node_data["properties"]["name"]

        # Edges touching the node. On a directed graph edges() reports only
        # outgoing edges, so incoming ones are added to stay consistent with
        # degree centrality; self-loops are already in the outgoing set.
        incident = list(self.graph.edges(data=True, nbunch=[node]))
        if directed:
            incident.extend(
                edge for edge in self.graph.in_edges(nbunch=[node], data=True)
                if edge[0] != node
            )

        # Count relationships by type, ignoring structural ones
        relationships = defaultdict(int)
        for _, _, data in incident:
            rel_type = data.get("type", "unknown")
            if rel_type not in ("subClassOf", "instanceOf"):
                relationships[rel_type] += 1

        hub_entities.append({
            "id": node,
            "name": name,
            "type": class_type,
            "centrality": centrality,
            "relationships": dict(relationships),
            "total_connections": sum(relationships.values())
        })

    # Sort by total connections
    hub_entities.sort(key=lambda x: x["total_connections"], reverse=True)

    return hub_entities
883
-
884
- def _find_property_patterns(self) -> List[Dict[str, Any]]:
885
- """Find common property patterns in instance data."""
886
- # Track properties by class type
887
- properties_by_class = defaultdict(lambda: defaultdict(int))
888
-
889
- for node, data in self.graph.nodes(data=True):
890
- if data.get("type") == "instance":
891
- class_type = data.get("class_type", "unknown")
892
-
893
- if "properties" in data:
894
- for prop in data["properties"].keys():
895
- properties_by_class[class_type][prop] += 1
896
-
897
- # Find common property combinations
898
- patterns = []
899
- for class_type, props in properties_by_class.items():
900
- # Sort properties by frequency
901
- sorted_props = sorted(props.items(), key=lambda x: x[1], reverse=True)
902
-
903
- # Only include classes with multiple instances
904
- class_instances = sum(1 for _, data in self.graph.nodes(data=True)
905
- if data.get("type") == "instance" and data.get("class_type") == class_type)
906
-
907
- if class_instances > 1:
908
- common_props = [prop for prop, count in sorted_props if count > 1]
909
-
910
- if common_props:
911
- patterns.append({
912
- "type": "property_pattern",
913
- "description": f"Common properties for {class_type} instances",
914
- "class_type": class_type,
915
- "instance_count": class_instances,
916
- "common_properties": common_props,
917
- "property_frequencies": {prop: count for prop, count in sorted_props}
918
- })
919
-
920
  return patterns
 
1
+ # src/knowledge_graph.py
2
+
3
+ import networkx as nx
4
+ from pyvis.network import Network
5
+ import json
6
+ from typing import Dict, List, Any, Optional, Set, Tuple
7
+ import matplotlib.pyplot as plt
8
+ import matplotlib.colors as mcolors
9
+ from collections import defaultdict
10
+
11
+ class KnowledgeGraph:
12
+ """
13
+ Handles the construction and visualization of knowledge graphs
14
+ based on the ontology data.
15
+ """
16
+
17
+ def __init__(self, ontology_manager=None):
18
+ """
19
+ Initialize the knowledge graph handler.
20
+
21
+ Args:
22
+ ontology_manager: Optional ontology manager instance
23
+ """
24
+ self.ontology_manager = ontology_manager
25
+ self.graph = None
26
+
27
+ if ontology_manager:
28
+ self.graph = ontology_manager.graph
29
+
30
def build_visualization_graph(
    self,
    include_classes: bool = True,
    include_instances: bool = True,
    central_entity: Optional[str] = None,
    max_distance: int = 2,
    include_properties: bool = False
) -> nx.Graph:
    """
    Derive a simplified, undirected graph intended for rendering.

    Args:
        include_classes: Keep nodes whose ``type`` is ``"class"``
        include_instances: Keep nodes whose ``type`` is ``"instance"``
        central_entity: Optional node to focus on; only used when it is
            present in the underlying graph
        max_distance: Number of hops around ``central_entity`` to keep
        include_properties: Add one extra node per instance property

    Returns:
        A NetworkX graph whose nodes and edges carry display attributes
        (label, title, group, shape, ...)
    """
    if not self.graph:
        return nx.Graph()

    source_graph = self.graph
    viz_graph = nx.Graph()

    # Restrict to the neighbourhood of the central entity when one is given.
    if central_entity and central_entity in source_graph:
        reachable = {central_entity}
        frontier = {central_entity}
        hops = 0
        while hops < max_distance:
            expanded = set()
            for member in frontier:
                # Edge direction is ignored: follow both out- and in-edges.
                expanded |= set(source_graph.successors(member))
                expanded |= set(source_graph.predecessors(member))
            reachable |= expanded
            frontier = expanded
            hops += 1
        scope = source_graph.subgraph(reachable)
    else:
        scope = source_graph

    # Nodes (plus optional property satellites).
    for node_id, attrs in scope.nodes(data=True):
        kind = attrs.get("type")

        hidden_class = kind == "class" and not include_classes
        hidden_instance = kind == "instance" and not include_instances
        if hidden_class or hidden_instance:
            continue

        has_instance_props = kind == "instance" and "properties" in attrs

        # Instances are labelled with their "name" property when available.
        label = attrs["properties"].get("name", node_id) if has_instance_props else node_id

        viz_attrs = {
            "id": node_id,
            "label": label,
            "title": self._get_node_tooltip(node_id, attrs),
            "group": attrs.get("class_type", kind),
            "shape": "dot" if kind == "instance" else "diamond"
        }

        if central_entity and node_id == central_entity:
            viz_attrs["color"] = "#ff7f0e"  # Orange for central entity
            viz_attrs["size"] = 25  # Larger size for central entity

        viz_graph.add_node(node_id, **viz_attrs)

        if not (include_properties and has_instance_props):
            continue

        for prop_name, prop_value in attrs["properties"].items():
            prop_node_id = f"{node_id}_{prop_name}"

            # Labels are truncated to 20 characters; the tooltip keeps the full value.
            shown_value = str(prop_value)
            if len(shown_value) > 20:
                shown_value = shown_value[:17] + "..."

            viz_graph.add_node(
                prop_node_id,
                id=prop_node_id,
                label=f"{prop_name}: {shown_value}",
                title=f"{prop_name}: {prop_value}",
                group="property",
                shape="ellipse",
                size=5
            )
            viz_graph.add_edge(node_id, prop_node_id, label="has_property", dashes=True)

    # Edges between nodes that made it into the visualization graph.
    for source, target, attrs in scope.edges(data=True):
        if source not in viz_graph or target not in viz_graph:
            continue

        # Skip edges that look like instance-to-property links, since those
        # are created manually above when include_properties is set.
        if include_properties and (
            source.startswith(target + "_") or target.startswith(source + "_")
        ):
            continue

        edge_type = attrs.get("type", "unknown")

        # Structural relationships are hidden together with class nodes.
        if not include_classes and edge_type in ["subClassOf", "instanceOf"]:
            continue

        viz_graph.add_edge(source, target, label=edge_type, title=edge_type)

    return viz_graph
155
+
156
def _get_node_tooltip(self, node_id: str, data: Dict) -> str:
    """
    Build the HTML tooltip shown for a node.

    Args:
        node_id: Identifier of the node
        data: Attribute dictionary of the node

    Returns:
        An HTML fragment with the ID, the type (when set) and, for
        instance and class nodes, their class/description and properties
    """
    parts = [f"<strong>ID:</strong> {node_id}<br>"]

    node_type = data.get("type")
    if not node_type:
        return "".join(parts)

    parts.append(f"<strong>Type:</strong> {node_type}<br>")

    if node_type == "instance":
        parts.append(f"<strong>Class:</strong> {data.get('class_type', 'unknown')}<br>")

        # Instance properties are listed one per line.
        if "properties" in data:
            parts.append("<strong>Properties:</strong><br>")
            parts.extend(
                f"- {key}: {value}<br>" for key, value in data["properties"].items()
            )

    elif node_type == "class":
        parts.append(f"<strong>Description:</strong> {data.get('description', '')}<br>")

        # Class properties are shown as a comma-separated list.
        if "properties" in data:
            parts.append("<strong>Properties:</strong> " + ", ".join(data["properties"]) + "<br>")

    return "".join(parts)
181
+
182
def generate_html_visualization(
    self,
    include_classes: bool = True,
    include_instances: bool = True,
    central_entity: Optional[str] = None,
    max_distance: int = 2,
    include_properties: bool = False,
    height: str = "600px",
    width: str = "100%",
    bgcolor: str = "#ffffff",
    font_color: str = "#000000",
    layout_algorithm: str = "force-directed"
) -> str:
    """
    Generate an HTML visualization of the knowledge graph.

    Args:
        include_classes: Whether to include class nodes
        include_instances: Whether to include instance nodes
        central_entity: Optional central entity to focus the graph on
        max_distance: Maximum distance from central entity to include
        include_properties: Whether to include property nodes
        height: Height of the visualization
        width: Width of the visualization
        bgcolor: Background color
        font_color: Font color
        layout_algorithm: Algorithm for layout ('force-directed',
            'hierarchical', 'radial', 'circular'). Any other value falls
            back to 'force-directed'.

    Returns:
        HTML string containing the visualization
    """
    viz_graph = self.build_visualization_graph(
        include_classes=include_classes,
        include_instances=include_instances,
        central_entity=central_entity,
        max_distance=max_distance,
        include_properties=include_properties
    )

    net = Network(height=height, width=width, bgcolor=bgcolor, font_color=font_color, directed=True)

    # Physics configuration per supported layout.
    physics_by_layout = {
        "force-directed": {
            "enabled": True,
            "solver": "forceAtlas2Based",
            "forceAtlas2Based": {
                "gravitationalConstant": -50,
                "centralGravity": 0.01,
                "springLength": 100,
                "springConstant": 0.08
            },
            "stabilization": {"enabled": True, "iterations": 100}
        },
        "hierarchical": {
            "enabled": True,
            "hierarchicalRepulsion": {
                "centralGravity": 0.0,
                "springLength": 100,
                "springConstant": 0.01,
                "nodeDistance": 120
            },
            "solver": "hierarchicalRepulsion",
            "stabilization": {"enabled": True, "iterations": 100}
        },
        "radial": {
            "enabled": True,
            "solver": "repulsion",
            "repulsion": {
                "nodeDistance": 120,
                "centralGravity": 0.2,
                "springLength": 200,
                "springConstant": 0.05
            },
            "stabilization": {"enabled": True, "iterations": 100}
        },
        "circular": {"enabled": False}
    }
    # An unrecognised layout name uses the default instead of failing later
    # on an unbound physics configuration.
    physics_options = physics_by_layout.get(
        layout_algorithm, physics_by_layout["force-directed"]
    )

    if layout_algorithm == "circular":
        # Physics is off: pin every node at a precomputed circular position.
        pos = nx.circular_layout(viz_graph)
        for node_id, coords in pos.items():
            if node_id in viz_graph.nodes:
                x, y = coords
                viz_graph.nodes[node_id]['x'] = float(x) * 500
                viz_graph.nodes[node_id]['y'] = float(y) * 500
                viz_graph.nodes[node_id]['physics'] = False

    options = {
        "nodes": {
            "font": {"size": 12},
            "scaling": {"min": 10, "max": 30}
        },
        "edges": {
            "color": {"inherit": True},
            "smooth": {"enabled": True, "type": "dynamic"},
            "arrows": {"to": {"enabled": True, "scaleFactor": 0.5}},
            "font": {"size": 10, "align": "middle"}
        },
        "physics": physics_options,
        "interaction": {
            "hover": True,
            "navigationButtons": True,
            "keyboard": True,
            "tooltipDelay": 100
        }
    }

    if layout_algorithm == "hierarchical":
        # The layout section lives in the same dict that is assigned to
        # net.options below, so it is not lost when the options are replaced.
        options["layout"] = {
            "hierarchical": {
                "enabled": True,
                "direction": "UD",
                "sortMethod": "directed",
                "nodeSpacing": 150,
                "treeSpacing": 200
            }
        }

    net.options = options
    net.from_nx(viz_graph)

    custom_css = """
    <style>
    .vis-network {
        border: 1px solid #ddd;
        border-radius: 5px;
    }
    .vis-tooltip {
        position: absolute;
        background-color: #f5f5f5;
        border: 1px solid #ccc;
        border-radius: 4px;
        padding: 10px;
        font-family: Arial, sans-serif;
        font-size: 12px;
        color: #333;
        max-width: 300px;
        z-index: 9999;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    </style>
    """

    page = net.generate_html()

    # Insert the custom CSS once, before the first plain <style> tag.
    # NOTE(review): the generated page may only contain <style ...> tags with
    # attributes; in that case the CSS goes at the end of <head> instead.
    if "<style>" in page:
        page = page.replace("<style>", custom_css + "<style>", 1)
    elif "</head>" in page:
        page = page.replace("</head>", custom_css + "</head>", 1)

    # Append the legend at the end of the document body.
    legend_html = self._generate_legend_html(viz_graph)
    page = page.replace("</body>", legend_html + "</body>")

    return page
357
+
358
def _generate_legend_html(self, graph: nx.Graph) -> str:
    """
    Generate a legend for the visualization.

    Args:
        graph: Visualization graph whose nodes may carry a ``group`` attribute

    Returns:
        An HTML fragment with one legend entry per distinct group
    """
    # Collect the distinct groups present on the nodes.
    groups = {
        attrs["group"]
        for _, attrs in graph.nodes(data=True)
        if "group" in attrs
    }

    # Groups with a dedicated colour; everything else uses the default.
    group_colors = {"property": "#ffcc99", "class": "#a1d3a2"}
    default_color = "#97c2fc"

    parts = ["""
    <div id="graph-legend" style="position: absolute; top: 10px; right: 10px; background-color: rgba(255,255,255,0.8);
    padding: 10px; border-radius: 5px; border: 1px solid #ddd; max-width: 200px;">
    <strong>Legend:</strong>
    <ul style="list-style-type: none; padding-left: 0; margin-top: 5px;">
    """]

    # Sort on the string form so a None group (a node without a type or
    # class_type) cannot raise a TypeError when mixed with string groups.
    for group in sorted(groups, key=str):
        color = group_colors.get(group, default_color)
        parts.append(f"""
        <li style="margin-bottom: 5px;">
        <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
        background-color: {color}; margin-right: 5px;"></span>
        {group}
        </li>
        """)

    # Close the list and add the usage hint.
    parts.append("""
    </ul>
    <div style="font-size: 10px; margin-top: 5px; color: #666;">
    Double-click to zoom, drag to pan, scroll to zoom in/out
    </div>
    </div>
    """)

    return "".join(parts)
400
+
401
def get_graph_statistics(self) -> Dict[str, Any]:
    """
    Calculate statistics about the knowledge graph.

    Structural metrics and centrality rankings are best effort: if their
    computation raises, the metrics are reported as 0 and the central
    nodes as an empty dict.

    Returns:
        A dictionary containing graph statistics, or an empty dict when
        there is no graph (or it is empty)
    """
    if not self.graph:
        return {}

    # Single pass over the nodes: counts by node type, property total and
    # number of instances per class.
    class_count = 0
    instance_count = 0
    property_count = 0
    class_counts = defaultdict(int)

    for _, data in self.graph.nodes(data=True):
        node_type = data.get("type")
        if node_type == "class":
            class_count += 1
        elif node_type == "instance":
            instance_count += 1
            class_counts[data.get("class_type", "unknown")] += 1
            if "properties" in data:
                property_count += len(data["properties"])

    # Count edges by relationship type.
    relationship_counts = {}
    for _, _, data in self.graph.edges(data=True):
        rel_type = data.get("type", "unknown")
        relationship_counts[rel_type] = relationship_counts.get(rel_type, 0) + 1

    # Structural metrics. `except Exception` keeps the best-effort fallback
    # without swallowing KeyboardInterrupt / SystemExit.
    try:
        # Some metrics only work on undirected graphs
        undirected = nx.Graph(self.graph)
        avg_degree = sum(dict(undirected.degree()).values()) / undirected.number_of_nodes()

        if nx.is_connected(undirected):
            component = undirected
        else:
            # Path metrics are taken from the largest connected component.
            largest_cc = max(nx.connected_components(undirected), key=len)
            component = undirected.subgraph(largest_cc)

        avg_path_length = nx.average_shortest_path_length(component)
        diameter = nx.diameter(component)
        density = nx.density(self.graph)
        clustering = nx.average_clustering(undirected)
    except Exception:
        avg_degree = 0
        avg_path_length = 0
        diameter = 0
        density = 0
        clustering = 0

    # Top 5 nodes by betweenness and by degree centrality.
    try:
        betweenness = nx.betweenness_centrality(self.graph)
        degree = nx.degree_centrality(self.graph)

        top_betweenness = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]
        top_degree = sorted(degree.items(), key=lambda x: x[1], reverse=True)[:5]

        central_nodes = {
            "betweenness": [{"node": node, "centrality": round(cent, 3)} for node, cent in top_betweenness],
            "degree": [{"node": node, "centrality": round(cent, 3)} for node, cent in top_degree]
        }
    except Exception:
        central_nodes = {}

    return {
        "node_count": self.graph.number_of_nodes(),
        "edge_count": self.graph.number_of_edges(),
        "class_count": class_count,
        "instance_count": instance_count,
        "property_count": property_count,
        "relationship_counts": relationship_counts,
        "class_instance_counts": dict(class_counts),
        "average_degree": avg_degree,
        "average_path_length": avg_path_length,
        "diameter": diameter,
        "density": density,
        "clustering_coefficient": clustering,
        "central_nodes": central_nodes
    }
499
+
500
def find_paths_between_entities(
    self,
    source_entity: str,
    target_entity: str,
    max_length: int = 3
) -> List[List[Dict]]:
    """
    Find all paths between two entities up to a maximum length.

    Paths follow edge direction from ``source_entity`` to ``target_entity``.
    Each distinct node sequence is reported once; all parallel edges
    between two consecutive nodes are listed in its edge sequence.

    Args:
        source_entity: Starting entity ID
        target_entity: Target entity ID
        max_length: Maximum path length

    Returns:
        A list of paths, where each path is a list of edge dictionaries
        (``source``, ``target``, ``type``), shortest edge sequences first.
        Paths made up only of ``subClassOf`` / ``instanceOf`` edges are
        omitted.
    """
    if not self.graph or source_entity not in self.graph or target_entity not in self.graph:
        return []

    try:
        simple_paths = list(nx.all_simple_paths(
            self.graph, source_entity, target_entity, cutoff=max_length
        ))
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return []

    structural = ("subClassOf", "instanceOf")
    seen_node_paths = set()
    paths = []

    for path in simple_paths:
        # With parallel edges the same node sequence can be produced more
        # than once; expanding it again would only duplicate the result.
        node_path = tuple(path)
        if node_path in seen_node_paths:
            continue
        seen_node_paths.add(node_path)

        edge_sequence = []
        for source, target in zip(path, path[1:]):
            # There may be multiple edges between nodes
            edges = self.graph.get_edge_data(source, target)
            if not edges:
                continue
            for data in edges.values():
                edge_sequence.append({
                    "source": source,
                    "target": target,
                    "type": data.get("type", "unknown")
                })

        # Keep the path only if it has at least one non-structural relationship.
        if any(edge["type"] not in structural for edge in edge_sequence):
            paths.append(edge_sequence)

    # Sort paths by length (shorter paths first)
    paths.sort(key=len)

    return paths
558
+
559
def get_entity_neighborhood(
    self,
    entity_id: str,
    max_distance: int = 1,
    include_classes: bool = True
) -> Dict[str, Any]:
    """
    Get the neighborhood of an entity.

    Neighbors are discovered breadth-first, ignoring edge direction. Each
    neighbor is described by its direct relations to the central entity;
    when it has none, the shortest directed path from the central entity
    (at most ``max_distance`` edges) is reported as a single
    ``indirect_connection``. Neighbors without any relation are omitted.

    Args:
        entity_id: The central entity ID
        max_distance: Maximum distance from the central entity
        include_classes: Whether to include class relationships

    Returns:
        A dictionary containing the neighborhood information, or an empty
        dict when there is no graph or the entity is unknown
    """
    if not self.graph or entity_id not in self.graph:
        return {}

    structural = ("subClassOf", "instanceOf")

    # Breadth-first search; the dict preserves discovery order and records
    # the distance at which each node was first reached.
    distance_of = {entity_id: 0}
    frontier = [entity_id]
    for distance in range(1, max_distance + 1):
        next_frontier = []
        for node in frontier:
            neighbors = list(self.graph.successors(node)) + list(self.graph.predecessors(node))
            for neighbor in neighbors:
                if neighbor in distance_of:
                    continue
                # Skip class nodes if not including classes
                neighbor_data = self.graph.nodes.get(neighbor, {})
                if not include_classes and neighbor_data.get("type") == "class":
                    continue
                distance_of[neighbor] = distance
                next_frontier.append(neighbor)
        frontier = next_frontier

    def direct_relations(source, target, direction):
        """List the relations carried by the edges from source to target."""
        found = []
        for data in (self.graph.get_edge_data(source, target) or {}).values():
            rel_type = data.get("type", "unknown")
            # Skip structural relationships if not including classes
            if not include_classes and rel_type in structural:
                continue
            found.append({"type": rel_type, "direction": direction})
        return found

    def shortest_indirect_relation(node):
        """Describe the shortest multi-edge directed path to node, if any."""
        for path_length in range(2, max_distance + 1):
            try:
                # cutoff bounds the length from above; the filter keeps
                # only paths with exactly path_length edges.
                path = next(
                    (
                        candidate
                        for candidate in nx.all_simple_paths(
                            self.graph, entity_id, node, cutoff=path_length
                        )
                        if len(candidate) - 1 == path_length
                    ),
                    None,
                )
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                path = None
            if path is not None:
                # Only need one example of an indirect path
                return {
                    "type": "indirect_connection",
                    "direction": "outgoing",
                    "path_length": path_length,
                    "intermediates": path[1:-1]
                }
        return None

    neighbors = []
    for node, distance in distance_of.items():
        if node == entity_id:
            continue

        node_data = self.graph.nodes[node]

        relations = direct_relations(entity_id, node, "outgoing")
        relations.extend(direct_relations(node, entity_id, "incoming"))

        # Only look for indirect connections if there are no direct ones.
        if not relations:
            indirect = shortest_indirect_relation(node)
            if indirect is not None:
                relations.append(indirect)

        # Only include neighbors with relations
        if relations:
            neighbors.append({
                "id": node,
                "type": node_data.get("type"),
                "class_type": node_data.get("class_type"),
                "properties": node_data.get("properties", {}),
                "relations": relations,
                "distance": distance
            })

    # Group neighbors by distance
    neighbors_by_distance = defaultdict(list)
    for neighbor in neighbors:
        neighbors_by_distance[neighbor["distance"]].append(neighbor)

    central_data = self.graph.nodes[entity_id]

    return {
        "central_entity": {
            "id": entity_id,
            "type": central_data.get("type"),
            "class_type": central_data.get("class_type", ""),
            "properties": central_data.get("properties", {})
        },
        "neighbors": neighbors,
        "neighbors_by_distance": dict(neighbors_by_distance),
        "total_neighbors": len(neighbors)
    }
706
+
707
def find_common_patterns(self) -> List[Dict[str, Any]]:
    """
    Collect the common patterns and structures found in the knowledge graph.

    Returns:
        A list of pattern dictionaries: relationship patterns first, then a
        single ``hub_entities`` entry (when hubs exist), then property
        patterns. Empty when there is no graph.
    """
    if not self.graph:
        return []

    found = []

    # Frequently repeated (source type, relationship, target type) patterns.
    found.extend(self._find_relationship_patterns() or [])

    # Highly connected entities, reported as one grouped pattern.
    hubs = self._find_hub_entities()
    if hubs:
        hub_pattern = {
            "type": "hub_entities",
            "description": "Entities with high connectivity serving as knowledge hubs",
            "entities": hubs
        }
        found.append(hub_pattern)

    # Properties shared by instances of the same class.
    found.extend(self._find_property_patterns() or [])

    return found
739
+
740
def _find_relationship_patterns(self) -> List[Dict[str, Any]]:
    """
    Find common relationship patterns in the graph.

    A pattern is a (source type, relationship, target type) triplet that
    occurs on more than one edge. Instances contribute their ``class_type``;
    other nodes contribute their ``type``. ``subClassOf`` and
    ``instanceOf`` edges are ignored.

    Returns:
        Pattern dictionaries sorted by descending count, each with up to
        three example edges (in edge iteration order)
    """
    structural = ("subClassOf", "instanceOf")
    max_examples = 3

    def semantic_type(node_data):
        """Return class_type for instances, the plain node type otherwise."""
        if node_data.get("type") == "instance":
            return node_data.get("class_type")
        return node_data.get("type")

    def display_name(node_id, node_data):
        """Return the instance's "name" property when it has one."""
        if node_data.get("type") == "instance" and "properties" in node_data:
            properties = node_data["properties"]
            if "name" in properties:
                return properties["name"]
        return node_id

    # One pass over the edges gathers both the counts and the first few
    # examples of every triplet, instead of rescanning all edges per pattern.
    triplet_counts = defaultdict(int)
    triplet_examples = defaultdict(list)

    for source, target, data in self.graph.edges(data=True):
        rel_type = data.get("type", "unknown")
        if rel_type in structural:
            continue

        source_data = self.graph.nodes[source]
        target_data = self.graph.nodes[target]
        source_type = semantic_type(source_data)
        target_type = semantic_type(target_data)

        # Edges whose endpoints have no usable type cannot form a pattern.
        if not (source_type and target_type):
            continue

        triplet = (source_type, rel_type, target_type)
        triplet_counts[triplet] += 1

        examples = triplet_examples[triplet]
        if len(examples) < max_examples:
            examples.append({
                "source": source,
                "source_name": display_name(source, source_data),
                "target": target,
                "target_name": display_name(target, target_data),
                "relationship": rel_type
            })

    # Keep only triplets seen more than once.
    patterns = [
        {
            "type": "relationship_pattern",
            "description": f"{source_type} {rel_type} {target_type}",
            "source_type": source_type,
            "relationship": rel_type,
            "target_type": target_type,
            "count": count,
            "examples": triplet_examples[(source_type, rel_type, target_type)]
        }
        for (source_type, rel_type, target_type), count in triplet_counts.items()
        if count > 1
    ]

    # Sort by frequency
    patterns.sort(key=lambda x: x["count"], reverse=True)

    return patterns
839
+
840
def _find_hub_entities(self) -> List[Dict[str, Any]]:
    """
    Identify instance nodes that act as hubs.

    The ten nodes with the highest degree centrality are examined; of
    those, only instance nodes are reported.

    Returns:
        Hub descriptions (id, name, class type, centrality and
        non-structural relationship counts) sorted by descending
        ``total_connections``
    """
    centrality_by_node = nx.degree_centrality(self.graph)
    ranked = sorted(centrality_by_node.items(), key=lambda item: item[1], reverse=True)

    hubs = []
    for node_id, score in ranked[:10]:
        attrs = self.graph.nodes[node_id]

        # Classes and untyped nodes are not reported as hubs.
        if attrs.get("type") != "instance":
            continue

        # Prefer the human-readable name when the instance has one.
        display_name = node_id
        if "properties" in attrs and "name" in attrs["properties"]:
            display_name = attrs["properties"]["name"]

        # Tally the node's edges per relationship type, leaving out the
        # structural ones.
        # NOTE(review): on a directed graph this edge view presumably yields
        # outgoing edges only - confirm whether incoming ones should count.
        rel_counts = defaultdict(int)
        for _, _, edge_attrs in self.graph.edges(data=True, nbunch=[node_id]):
            edge_type = edge_attrs.get("type", "unknown")
            if edge_type in ["subClassOf", "instanceOf"]:
                continue
            rel_counts[edge_type] += 1

        hubs.append({
            "id": node_id,
            "name": display_name,
            "type": attrs.get("class_type", "unknown"),
            "centrality": score,
            "relationships": dict(rel_counts),
            "total_connections": sum(rel_counts.values())
        })

    return sorted(hubs, key=lambda hub: hub["total_connections"], reverse=True)
883
+
884
def _find_property_patterns(self) -> List[Dict[str, Any]]:
    """
    Find common property patterns in instance data.

    Instances are grouped by ``class_type`` (``"unknown"`` when missing).
    A pattern is reported for every group with more than one instance and
    at least one property used by more than one of them.

    Returns:
        A list of ``property_pattern`` dictionaries holding the class type,
        its instance count, the shared properties and the usage count of
        every property (most frequent first)
    """
    # Per class: how often each property occurs, and how many instances
    # there are. Both use the same "unknown" default, so instances without
    # a class_type are counted in the group their properties are tallied under.
    properties_by_class = defaultdict(lambda: defaultdict(int))
    instances_by_class = defaultdict(int)

    for _, data in self.graph.nodes(data=True):
        if data.get("type") != "instance":
            continue

        class_type = data.get("class_type", "unknown")
        instances_by_class[class_type] += 1

        if "properties" in data:
            for prop in data["properties"]:
                properties_by_class[class_type][prop] += 1

    patterns = []
    for class_type, props in properties_by_class.items():
        # Only include classes with multiple instances
        class_instances = instances_by_class[class_type]
        if class_instances <= 1:
            continue

        # Sort properties by frequency
        sorted_props = sorted(props.items(), key=lambda x: x[1], reverse=True)
        common_props = [prop for prop, count in sorted_props if count > 1]
        if not common_props:
            continue

        patterns.append({
            "type": "property_pattern",
            "description": f"Common properties for {class_type} instances",
            "class_type": class_type,
            "instance_count": class_instances,
            "common_properties": common_props,
            "property_frequencies": dict(sorted_props)
        })

    return patterns