Spaces:
Sleeping
Sleeping
File size: 6,908 Bytes
edc0787 b3803c7 6e8591f edc0787 a0df48e edc0787 5604c54 edc0787 b3803c7 6e8591f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import hashlib
import datetime
import os
import uuid
from app.utils import logger
# Module-wide console logger for this utils module. NOTE(review): this rebinds
# the name `logger` from the imported module to a logger instance — intentional
# per existing code, but the shadowing is easy to misread.
logger = logger.get_console_logger("utils")
def create_wikipedia_urls_from_text(text):
    """
    Extract page titles from a given text and construct Wikipedia URLs for each title.

    Args:
    - text (str): A string containing multiple sections, each starting with
      "Page: " followed by the article title on the same line.

    Returns:
    - list: A list of Wikipedia URLs constructed from the extracted titles.
    """
    # Each "Page: " marker starts a new section; the title runs to end of line.
    sections = text.split("Page: ")
    # Drop the leading empty chunk produced when the text starts with "Page: ".
    if sections and sections[0].strip() == "":
        sections = sections[1:]
    urls = []
    for section in sections:
        # The title is everything up to the first newline; strip stray
        # whitespace (e.g. a trailing "\r") so it doesn't leak into the URL.
        title = section.split("\n", 1)[0].strip()
        # Wikipedia article URLs use underscores in place of spaces.
        url_title = title.replace(" ", "_")
        urls.append(f"https://en.wikipedia.org/wiki/{url_title}")
    return urls
def extract_urls(data_list):
    """
    Extract URLs from a list of formatted result strings.

    Each string is expected to contain a case-insensitive 'link: ' marker
    followed by the URL, terminated by a ', summary:' marker.

    Parameters:
    - data_list (list): Strings of the form 'Title: ..., Link: ..., Summary: ...'.

    Returns:
    - list: At most the last three URLs found (the most recent sources).
    """
    urls = []
    for item in data_list:
        try:
            # Search case-insensitively, but slice the original string so the
            # URL keeps its original casing.
            lower_case = item.lower()
            link_prefix = 'link: '
            summary_prefix = ', summary:'
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
            urls.append(item[start_idx:end_idx])
        except ValueError:
            # 'link: ' or ', summary:' marker missing — skip this item.
            print("Could not find a URL in the item:", item)
    # Only the three most recent sources are returned to the caller.
    return urls[-3:]
def format_wiki_summaries(input_text):
    """
    Parse text containing Wikipedia page titles and summaries into formatted
    strings, each carrying the title, its Wikipedia URL, and the summary.

    Parameters:
    - input_text (str): Records separated by blank lines; each record holds a
      "Page:" line followed by a "Summary:" section.

    Returns:
    - list: Strings of the form "Title: ..., Link: ..., Summary: ...".
    """
    formatted = []
    # Records are separated by a blank line (double newline).
    for entry in input_text.split("\n\n"):
        if "Page:" not in entry or "Summary:" not in entry:
            print("Record format error, skipping record:", entry)
            continue
        # The first line carries the title; everything after is the summary.
        first_line, remainder = entry.split("\n", 1)
        page_title = first_line.replace("Page: ", "").strip()
        page_summary = remainder.replace("Summary: ", "").strip()
        # Wikipedia URLs substitute underscores for spaces in the title.
        link = "https://en.wikipedia.org/wiki/" + page_title.replace(" ", "_")
        formatted.append(
            f"Title: {page_title}, Link: {link}, Summary: {page_summary}"
        )
    return formatted
def format_arxiv_documents(documents):
    """
    Render arXiv document objects as human-readable strings.

    Each document object is assumed to have a 'metadata' mapping with 'Title'
    and 'Entry ID' keys, and a 'page_content' attribute for content.

    Parameters:
    - documents (list): Document objects to format.

    Returns:
    - list: One "Title: ..., Link: ..., Summary: ..." string per document.
    """
    results = []
    for document in documents:
        meta = document.metadata
        results.append(
            "Title: {title}, Link: {link}, Summary: {snippet}".format(
                title=meta['Title'],
                link=meta['Entry ID'],
                snippet=document.page_content,
            )
        )
    return results
def format_search_results(search_results):
    """
    Turn raw search-result dictionaries into display strings.

    Parameters:
    - search_results (list): Dictionaries each holding 'title', 'link', and
      'snippet' keys.

    Returns:
    - list: "Title: ..., Link: ..., Summary: ..." strings, one per result.
    """
    formatted = []
    for result in search_results:
        # Extra keys in the dict are ignored by str.format.
        formatted.append(
            "Title: {title}, Link: {link}, Summary: {snippet}".format(**result)
        )
    return formatted
def parse_list_to_dicts(items: list) -> list:
    """
    Convert "Title: ..., Link: ..., Summary: ..." strings into dictionaries.

    Parameters:
    - items (list): Formatted result strings.

    Returns:
    - list: Dicts with 'url', 'title', 'hash_id', and 'summary' keys, where
      'hash_id' is derived from the link via hash_text.
    """
    results = []
    for entry in items:
        # Locate each field by its marker; each slice starts just past the
        # marker text and ends at the next field's separator (or end of string).
        title_value = entry[entry.find('Title: ') + len('Title: '):entry.find(', Link: ')]
        link_value = entry[entry.find('Link: ') + len('Link: '):entry.find(', Summary: ')]
        summary_value = entry[entry.find('Summary: ') + len('Summary: '):]
        results.append({
            "url": link_value,
            "title": title_value,
            # Stable, content-derived id for deduplication/storage.
            "hash_id": hash_text(link_value),
            "summary": summary_value,
        })
    return results
def hash_text(text: str) -> str:
    """Return the MD5 hex digest of *text* (UTF-8 encoded)."""
    digest = hashlib.md5(text.encode("utf-8"))
    return digest.hexdigest()
def convert_timestamp_to_datetime(timestamp: str) -> str:
    """
    Convert a Unix-epoch timestamp string to a "YYYY-MM-DD HH:MM:SS" string.

    NOTE(review): fromtimestamp with no tz uses the machine's local timezone
    (naive datetime) — confirm callers expect local time rather than UTC.
    """
    moment = datetime.datetime.fromtimestamp(int(timestamp))
    return moment.strftime("%Y-%m-%d %H:%M:%S")
def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder (including missing parents) if it doesn't already exist.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    if os.path.exists(folder_path):
        print(f"Folder '{folder_path}' already exists.")
        return
    # exist_ok=True avoids a crash if another process creates the folder
    # between the exists() check above and this call (TOCTOU race).
    os.makedirs(folder_path, exist_ok=True)
    print(f"Folder '{folder_path}' created.")
def generate_uuid() -> str:
    """
    Generate a random version-4 UUID and return it as a string.

    Returns:
    str: A UUID string such as '123e4567-e89b-42d3-a456-426614174000'.
    """
    return str(uuid.uuid4())