import datetime
import hashlib
import os
import uuid

from app.utils import logger as logger_utils

logger = logger_utils.get_console_logger("utils")

def create_wikipedia_urls_from_text(text):
    """
    Extracts page titles from a given text and constructs Wikipedia URLs for each title.
    
    Args:
    - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.
    
    Returns:
    - list: A list of Wikipedia URLs constructed from the extracted titles.
    """
    # Split the text into sections based on "Page:" prefix
    sections = text.split("Page: ")
    # Remove the first item if it's empty (in case the text starts with "Page:")
    if sections[0].strip() == "":
        sections = sections[1:]
    
    urls = []  # Initialize an empty list to store the URLs
    for section in sections:
        # Extract the title, which is the string up to the first newline
        title = section.split("\n", 1)[0].strip()
        # Replace spaces with underscores for the URL
        url_title = title.replace(" ", "_")
        # Construct the URL and add it to the list
        url = f"https://en.wikipedia.org/wiki/{url_title}"
        urls.append(url)
    
    return urls
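
# Illustrative usage (input follows the "Page: " convention assumed above):
#   create_wikipedia_urls_from_text("Page: Alan Turing\nSummary: ...")
#   -> ["https://en.wikipedia.org/wiki/Alan_Turing"]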

def extract_urls(data_list):
    """
    Extracts URLs from a list of formatted result strings.

    Parameters:
    - data_list (list): A list of strings, each containing 'Title:', 'Link:', and 'Summary:' fields.

    Returns:
    - list: The URLs extracted from the last three items in the list.
    """
    urls = []
    for item in data_list:
        try:
            # Find the start and end indices of the URL (matched case-insensitively)
            lower_case = item.lower()
            link_prefix = 'link: '
            summary_prefix = ', summary:'
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
            # Extract the URL using the indices found
            url = item[start_idx:end_idx]
            urls.append(url)
        except ValueError:
            # 'link: ' or ', summary:' was not found in the string
            logger.warning("Could not find a URL in item: %s", item)
    # Keep only the three most recent sources
    last_sources = urls[-3:]
    return last_sources
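
# Illustrative usage (items follow the "Title: ..., Link: ..., Summary: ..." layout
# produced by the format_* helpers below; only the last three URLs are kept):
#   extract_urls(["Title: A, Link: https://a.example, Summary: ..."])
#   -> ["https://a.example"]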

def format_wiki_summaries(input_text):
    """
    Parses a given text containing page titles and summaries, formats them into a list of strings,
    and appends Wikipedia URLs based on titles.
    
    Parameters:
    - input_text (str): A string containing titles and summaries separated by specific markers.
    
    Returns:
    - list: A list of formatted strings with titles, summaries, and Wikipedia URLs.
    """
    # Splitting the input text into individual records based on double newlines
    records = input_text.split("\n\n")
    
    formatted_records_with_urls = []
    for record in records:
        if "Page:" in record and "Summary:" in record:
            title_line, summary_line = record.split("\n", 1)  # Splitting only on the first newline
            title = title_line.replace("Page: ", "").strip()
            summary = summary_line.replace("Summary: ", "").strip()
            # Replace spaces with underscores for the URL and construct the Wikipedia URL
            url_title = title.replace(" ", "_")
            wikipedia_url = f"https://en.wikipedia.org/wiki/{url_title}"
            # Append formatted string with title, summary, and URL
            formatted_record = "Title: {title}, Link: {wikipedia_url}, Summary: {summary}".format(
                title=title, summary=summary, wikipedia_url=wikipedia_url)
            formatted_records_with_urls.append(formatted_record)
        else:
            logger.warning("Record format error, skipping record: %s", record)
    
    return formatted_records_with_urls
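
# Illustrative usage (input shape matches a Wikipedia tool's "Page:/Summary:" output):
#   format_wiki_summaries("Page: Alan Turing\nSummary: British mathematician.")
#   -> ["Title: Alan Turing, Link: https://en.wikipedia.org/wiki/Alan_Turing, Summary: British mathematician."]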

def format_arxiv_documents(documents):
    """
    Formats a list of document objects into a list of strings.
    Each document object is assumed to have a 'metadata' dictionary with 'Title' and 'Entry ID',
    and a 'page_content' attribute for content.

    Parameters:
    - documents (list): A list of document objects.

    Returns:
    - list: A list of formatted strings with titles, links, and content snippets.
    """
    formatted_documents = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(
            title=doc.metadata['Title'],
            link=doc.metadata['Entry ID'],
            snippet=doc.page_content  # The full page content is used as the summary snippet
        )
        for doc in documents
    ]
    return formatted_documents
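
# Illustrative usage (assumes LangChain-style Document objects whose metadata
# carries 'Title' and 'Entry ID', as returned by an arXiv retriever):
#   format_arxiv_documents(docs)
#   -> ["Title: Attention Is All You Need, Link: http://arxiv.org/abs/1706.03762, Summary: ..."]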

def format_search_results(search_results):
    """
    Formats a list of dictionaries containing search results into a list of strings.
    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.

    Parameters:
    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.

    Returns:
    - list: A list of formatted strings based on the search results.
    """
    formatted_results = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(**result)
        for result in search_results
    ]
    return formatted_results
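
# Illustrative usage (assumed search-API result shape):
#   format_search_results([{"title": "A", "link": "https://a.example", "snippet": "..."}])
#   -> ["Title: A, Link: https://a.example, Summary: ..."]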

def parse_list_to_dicts(items: list) -> list:
    """
    Parses formatted result strings back into dictionaries.

    Each string is expected to follow the "Title: ..., Link: ..., Summary: ..."
    layout produced by the format_* helpers above.

    Parameters:
    - items (list): A list of formatted result strings.

    Returns:
    - list: A list of dicts with 'url', 'title', 'hash_id', and 'summary' keys.
    """
    parsed_items = []
    for item in items:
        # Extract title, link, and summary from each string
        title_start = item.find('Title: ') + len('Title: ')
        link_start = item.find('Link: ') + len('Link: ')
        summary_start = item.find('Summary: ') + len('Summary: ')

        title_end = item.find(', Link: ')
        link_end = item.find(', Summary: ')
        summary_end = len(item)

        title = item[title_start:title_end]
        link = item[link_start:link_end]
        summary = item[summary_start:summary_end]

        # Use the hash_text function for the hash_id
        hash_id = hash_text(link)

        # Construct the dictionary for each item
        parsed_item = {
            "url": link,
            "title": title,
            "hash_id": hash_id,
            "summary": summary
        }
        parsed_items.append(parsed_item)
    return parsed_items
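
# Illustrative round trip with format_search_results above:
#   parse_list_to_dicts(["Title: A, Link: https://a.example, Summary: text"])
#   -> [{"url": "https://a.example", "title": "A",
#        "hash_id": hash_text("https://a.example"), "summary": "text"}]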

def hash_text(text: str) -> str:
    """Return the hex MD5 digest of the text, used as a stable content identifier."""
    return hashlib.md5(text.encode()).hexdigest()


def convert_timestamp_to_datetime(timestamp: str) -> str:
    """Convert a Unix timestamp string to a 'YYYY-MM-DD HH:MM:SS' local-time string."""
    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
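
# Illustrative usage:
#   convert_timestamp_to_datetime("0")  # -> "1970-01-01 00:00:00" (UTC; local time may differ)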

def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder if it doesn't already exist.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        logger.info(f"Folder '{folder_path}' created.")
    else:
        logger.info(f"Folder '{folder_path}' already exists.")
        
def generate_uuid() -> str:
    """
    Generate a UUID (Universally Unique Identifier) and return it as a string.

    Returns:
        str: A UUID string.
    """
    return str(uuid.uuid4())