import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

from common import custom_feature_extraction_markuplm


def split_sliding_data(items, window_size, overlap):
    """Split each item's node/xpath/label lists into overlapping windows.

    Args:
        items: iterable of dicts, each with 'nodes' and 'xpaths' lists
            (and optionally 'Index', 'Url', 'Path', 'labels').
        window_size: maximum number of nodes per window.
        overlap: number of nodes shared between consecutive windows.

    Returns:
        list of dicts, one per window, carrying the sliced 'nodes',
        'xpaths' and 'labels' plus the passthrough metadata fields.

    Raises:
        ValueError: if window_size <= overlap (the original silently
            produced no windows for a negative step, which hid the
            configuration error).
    """
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap")
    new_data = []
    for obj in items:
        num_elements = len(obj['nodes'])
        # Fetch once; guards against a present-but-None 'labels' key,
        # which the original `'labels' in obj` check would try to slice.
        labels = obj.get('labels')
        for start in range(0, num_elements, window_size - overlap):
            end = min(start + window_size, num_elements)
            new_data.append({
                'Index': obj.get('Index', 0),
                'Url': obj.get('Url'),
                'Path': obj.get('Path'),
                'nodes': obj['nodes'][start:end],
                'xpaths': obj['xpaths'][start:end],
                'labels': labels[start:end] if labels is not None else None,
            })
    return new_data


def get_html_content(url, timeout=10):
    """Fetch the HTML body of *url*.

    Args:
        url: address to GET.
        timeout: seconds before the request is aborted (new parameter,
            defaulted for backward compatibility; the original request
            had no timeout and could hang indefinitely).

    Returns:
        The response text on HTTP 200, otherwise None (including on any
        network error, which is logged).
    """
    try:
        # Timeout added: requests.get blocks forever without one.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Narrowed from bare `Exception`: only network/HTTP failures are
        # expected here; programming errors should propagate.
        print("Error fetching HTML content:", e)
        return None
    return response.text if response.status_code == 200 else None


def clean_html(html):
    """Strip <style>/<script> tags and collapse all whitespace runs.

    Args:
        html: raw HTML string.

    Returns:
        The serialized HTML with style/script contents removed and every
        run of whitespace (newlines, tabs, spaces) squeezed to one space.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(['style', 'script']):
        # Remove the tag and its contents entirely.
        tag.decompose()
    # Local renamed: the original bound `clean_html`, shadowing this function.
    collapsed = re.sub(r'\s+', ' ', str(soup))
    return collapsed


def extract_nodes_and_feautures(html_content):
    """Run the MarkupLM feature extractor over one HTML document.

    Name kept as-is (including the 'feautures' misspelling) because
    external callers reference it.

    Args:
        html_content: raw HTML string, or a falsy value.

    Returns:
        dict with 'nodes', 'xpaths' and 'labels' (first batch element of
        the extractor's encoding) when content is provided; an empty
        pd.Series otherwise. NOTE(review): the two branches return
        different types — preserved for compatibility, but callers
        should be audited.
    """
    if not html_content:
        return pd.Series()
    soup = BeautifulSoup(html_content, 'html.parser')
    cleaned_html = clean_html(str(soup))
    # The extractor is constructed per call with no label map (None).
    feature_extractor = custom_feature_extraction_markuplm.CustomMarkupLMFeatureExtractor(None)
    encoding = feature_extractor(cleaned_html)
    return {
        'nodes': encoding['nodes'][0],
        'xpaths': encoding['xpaths'][0],
        'labels': encoding['labels'][0],
    }