Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Commit
•
66111ac
1
Parent(s):
3a8e960
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
# Standard library imports
|
2 |
import datetime
|
3 |
import base64
|
@@ -16,12 +18,16 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
16 |
import requests
|
17 |
from bs4 import BeautifulSoup
|
18 |
|
|
|
|
|
|
|
19 |
load_dotenv()
|
20 |
-
|
21 |
|
22 |
# Initialize Cohere client
|
23 |
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
|
24 |
co = cohere.Client(COHERE_API_KEY)
|
|
|
25 |
|
26 |
# Configuration: Set to True if running locally, False if running on Streamlit Cloud
|
27 |
IS_LOCAL = False
|
@@ -50,6 +56,7 @@ def setup_streamlit():
|
|
50 |
st.set_page_config(page_title="Keyword Relevance Test", layout="wide")
|
51 |
st.title("Keyword Relevance Test Using Vector Embedding")
|
52 |
st.divider()
|
|
|
53 |
|
54 |
def init_session_state():
|
55 |
if 'selected_property' not in st.session_state:
|
@@ -70,47 +77,55 @@ def init_session_state():
|
|
70 |
st.session_state.custom_start_date = datetime.date.today() - datetime.timedelta(days=7)
|
71 |
if 'custom_end_date' not in st.session_state:
|
72 |
st.session_state.custom_end_date = datetime.date.today()
|
|
|
73 |
|
74 |
# -------------
|
75 |
# Data Processing Functions
|
76 |
# -------------
|
77 |
|
78 |
def fetch_content(url):
|
|
|
79 |
try:
|
80 |
response = requests.get(url)
|
81 |
response.raise_for_status()
|
82 |
soup = BeautifulSoup(response.text, 'html.parser')
|
83 |
content = soup.get_text(separator=' ', strip=True)
|
|
|
84 |
return content
|
85 |
except requests.RequestException as e:
|
|
|
86 |
return str(e)
|
87 |
|
88 |
def generate_embeddings(text_list, model_type):
|
|
|
89 |
if not text_list:
|
|
|
90 |
return []
|
91 |
model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'
|
92 |
input_type = 'search_document'
|
93 |
response = co.embed(model=model, texts=text_list, input_type=input_type)
|
94 |
embeddings = response.embeddings
|
|
|
95 |
return embeddings
|
96 |
|
97 |
def calculate_relevancy_scores(df, model_type):
|
|
|
98 |
try:
|
99 |
page_contents = [fetch_content(url) for url in df['page']]
|
100 |
page_embeddings = generate_embeddings(page_contents, model_type)
|
101 |
query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
|
102 |
relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
|
103 |
df = df.assign(relevancy_score=relevancy_scores)
|
|
|
104 |
except Exception as e:
|
|
|
105 |
st.warning(f"Error calculating relevancy scores: {e}")
|
106 |
df = df.assign(relevancy_score=0)
|
107 |
return df
|
108 |
|
109 |
def process_gsc_data(df):
|
110 |
-
|
111 |
df_sorted = df.sort_values(['impressions'], ascending=[False])
|
112 |
-
|
113 |
-
# Keep only the highest impression query for each page
|
114 |
df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
|
115 |
|
116 |
if 'relevancy_score' not in df_unique.columns:
|
@@ -119,6 +134,7 @@ def process_gsc_data(df):
|
|
119 |
df_unique['relevancy_score'] = df_sorted.groupby('page')['relevancy_score'].first().values
|
120 |
|
121 |
result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
|
|
|
122 |
return result
|
123 |
|
124 |
# -------------
|
@@ -126,6 +142,7 @@ def process_gsc_data(df):
|
|
126 |
# -------------
|
127 |
|
128 |
def load_config():
|
|
|
129 |
client_config = {
|
130 |
"web": {
|
131 |
"client_id": os.environ["CLIENT_ID"],
|
@@ -135,23 +152,29 @@ def load_config():
|
|
135 |
"redirect_uris": ["https://poemsforaphrodite-gscpro.hf.space/"],
|
136 |
}
|
137 |
}
|
|
|
138 |
return client_config
|
139 |
|
140 |
def init_oauth_flow(client_config):
|
|
|
141 |
scopes = ["https://www.googleapis.com/auth/webmasters.readonly"]
|
142 |
flow = Flow.from_client_config(
|
143 |
client_config,
|
144 |
scopes=scopes,
|
145 |
redirect_uri=client_config["web"]["redirect_uris"][0]
|
146 |
)
|
|
|
147 |
return flow
|
148 |
|
149 |
def google_auth(client_config):
|
|
|
150 |
flow = init_oauth_flow(client_config)
|
151 |
auth_url, _ = flow.authorization_url(prompt="consent")
|
|
|
152 |
return flow, auth_url
|
153 |
|
154 |
def auth_search_console(client_config, credentials):
|
|
|
155 |
token = {
|
156 |
"token": credentials.token,
|
157 |
"refresh_token": credentials.refresh_token,
|
@@ -161,6 +184,7 @@ def auth_search_console(client_config, credentials):
|
|
161 |
"scopes": credentials.scopes,
|
162 |
"id_token": getattr(credentials, "id_token", None),
|
163 |
}
|
|
|
164 |
return searchconsole.authenticate(client_config=client_config, credentials=token)
|
165 |
|
166 |
# -------------
|
@@ -168,22 +192,29 @@ def auth_search_console(client_config, credentials):
|
|
168 |
# -------------
|
169 |
|
170 |
def list_gsc_properties(credentials):
|
|
|
171 |
service = build('webmasters', 'v3', credentials=credentials)
|
172 |
site_list = service.sites().list().execute()
|
173 |
-
|
|
|
|
|
174 |
|
175 |
def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
|
|
|
176 |
query = webproperty.query.range(start_date, end_date).search_type(search_type).dimension(*dimensions)
|
177 |
if 'device' in dimensions and device_type and device_type != 'All Devices':
|
178 |
query = query.filter('device', 'equals', device_type.lower())
|
179 |
try:
|
180 |
df = query.limit(MAX_ROWS).get().to_dataframe()
|
|
|
181 |
return process_gsc_data(df)
|
182 |
except Exception as e:
|
|
|
183 |
show_error(e)
|
184 |
return pd.DataFrame()
|
185 |
|
186 |
def calculate_relevancy_scores(df, model_type):
|
|
|
187 |
with st.spinner('Calculating relevancy scores...'):
|
188 |
try:
|
189 |
page_contents = [fetch_content(url) for url in df['page']]
|
@@ -191,20 +222,23 @@ def calculate_relevancy_scores(df, model_type):
|
|
191 |
query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
|
192 |
relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
|
193 |
df = df.assign(relevancy_score=relevancy_scores)
|
|
|
194 |
except Exception as e:
|
|
|
195 |
st.warning(f"Error calculating relevancy scores: {e}")
|
196 |
df = df.assign(relevancy_score=0)
|
197 |
return df
|
198 |
|
199 |
-
|
200 |
# -------------
|
201 |
# Utility Functions
|
202 |
# -------------
|
203 |
|
204 |
def update_dimensions(selected_search_type):
|
|
|
205 |
return BASE_DIMENSIONS + ['device'] if selected_search_type in SEARCH_TYPES else BASE_DIMENSIONS
|
206 |
|
207 |
def calc_date_range(selection, custom_start=None, custom_end=None):
|
|
|
208 |
range_map = {
|
209 |
'Last 7 Days': 7,
|
210 |
'Last 30 Days': 30,
|
@@ -216,15 +250,21 @@ def calc_date_range(selection, custom_start=None, custom_end=None):
|
|
216 |
today = datetime.date.today()
|
217 |
if selection == 'Custom Range':
|
218 |
if custom_start and custom_end:
|
|
|
219 |
return custom_start, custom_end
|
220 |
else:
|
|
|
221 |
return today - datetime.timedelta(days=7), today
|
222 |
-
|
|
|
|
|
223 |
|
224 |
def show_error(e):
|
|
|
225 |
st.error(f"An error occurred: {e}")
|
226 |
|
227 |
def property_change():
|
|
|
228 |
st.session_state.selected_property = st.session_state['selected_property_selector']
|
229 |
|
230 |
# -------------
|
@@ -232,28 +272,33 @@ def property_change():
|
|
232 |
# -------------
|
233 |
|
234 |
def show_dataframe(report):
|
|
|
235 |
with st.expander("Preview the First 100 Rows (Unique Pages with Top Query)"):
|
236 |
st.dataframe(report.head(DF_PREVIEW_ROWS))
|
237 |
|
238 |
def download_csv_link(report):
|
|
|
239 |
def to_csv(df):
|
240 |
return df.to_csv(index=False, encoding='utf-8-sig')
|
241 |
csv = to_csv(report)
|
242 |
b64_csv = base64.b64encode(csv.encode()).decode()
|
243 |
href = f'<a href="data:file/csv;base64,{b64_csv}" download="search_console_data.csv">Download CSV File</a>'
|
244 |
st.markdown(href, unsafe_allow_html=True)
|
|
|
245 |
|
246 |
# -------------
|
247 |
# Streamlit UI Components
|
248 |
# -------------
|
249 |
|
250 |
def show_google_sign_in(auth_url):
|
|
|
251 |
with st.sidebar:
|
252 |
if st.button("Sign in with Google"):
|
253 |
st.write('Please click the link below to sign in:')
|
254 |
st.markdown(f'[Google Sign-In]({auth_url})', unsafe_allow_html=True)
|
255 |
|
256 |
def show_property_selector(properties, account):
|
|
|
257 |
selected_property = st.selectbox(
|
258 |
"Select a Search Console Property:",
|
259 |
properties,
|
@@ -265,6 +310,7 @@ def show_property_selector(properties, account):
|
|
265 |
return account[selected_property]
|
266 |
|
267 |
def show_search_type_selector():
|
|
|
268 |
return st.selectbox(
|
269 |
"Select Search Type:",
|
270 |
SEARCH_TYPES,
|
@@ -273,6 +319,7 @@ def show_search_type_selector():
|
|
273 |
)
|
274 |
|
275 |
def show_model_type_selector():
|
|
|
276 |
return st.selectbox(
|
277 |
"Select the embedding model:",
|
278 |
["english", "multilingual"],
|
@@ -280,6 +327,7 @@ def show_model_type_selector():
|
|
280 |
)
|
281 |
|
282 |
def show_date_range_selector():
|
|
|
283 |
return st.selectbox(
|
284 |
"Select Date Range:",
|
285 |
DATE_RANGE_OPTIONS,
|
@@ -288,10 +336,12 @@ def show_date_range_selector():
|
|
288 |
)
|
289 |
|
290 |
def show_custom_date_inputs():
|
|
|
291 |
st.session_state.custom_start_date = st.date_input("Start Date", st.session_state.custom_start_date)
|
292 |
st.session_state.custom_end_date = st.date_input("End Date", st.session_state.custom_end_date)
|
293 |
|
294 |
def show_dimensions_selector(search_type):
|
|
|
295 |
available_dimensions = update_dimensions(search_type)
|
296 |
return st.multiselect(
|
297 |
"Select Dimensions:",
|
@@ -301,57 +351,50 @@ def show_dimensions_selector(search_type):
|
|
301 |
)
|
302 |
|
303 |
def show_paginated_dataframe(report, rows_per_page=20):
|
304 |
-
|
305 |
report['position'] = report['position'].astype(int)
|
306 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
307 |
|
308 |
-
# Format CTR as percentage and relevancy_score with two decimal places
|
309 |
def format_ctr(x):
|
310 |
try:
|
311 |
return f"{float(x):.2%}"
|
312 |
except ValueError:
|
313 |
-
return x
|
314 |
|
315 |
def format_relevancy_score(x):
|
316 |
try:
|
317 |
return f"{float(x):.2f}"
|
318 |
except ValueError:
|
319 |
-
return x
|
320 |
|
321 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
322 |
report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
|
323 |
|
324 |
-
# Create a clickable URL column
|
325 |
def make_clickable(url):
|
326 |
return f'<a href="{url}" target="_blank">{url}</a>'
|
327 |
|
328 |
report['clickable_url'] = report['page'].apply(make_clickable)
|
329 |
|
330 |
-
# Reorder columns to put clickable_url first
|
331 |
columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
|
332 |
report = report[columns]
|
333 |
|
334 |
-
|
335 |
-
sort_column = st.selectbox("Sort by:", columns[1:], index=columns[1:].index('impressions')) # Set 'impressions' as default
|
336 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
337 |
|
338 |
ascending = sort_order == "Ascending"
|
339 |
|
340 |
-
# Convert back to numeric for sorting
|
341 |
def safe_float_convert(x):
|
342 |
try:
|
343 |
return float(x.rstrip('%')) / 100 if isinstance(x, str) and x.endswith('%') else float(x)
|
344 |
except ValueError:
|
345 |
-
return 0
|
346 |
|
347 |
report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
|
348 |
report['relevancy_score_numeric'] = report['relevancy_score'].apply(safe_float_convert)
|
349 |
|
350 |
-
# Sort using the numeric columns
|
351 |
sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
|
352 |
report = report.sort_values(by=sort_column_numeric, ascending=ascending)
|
353 |
|
354 |
-
# Remove the temporary numeric columns
|
355 |
report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
|
356 |
|
357 |
total_rows = len(report)
|
@@ -373,26 +416,23 @@ def show_paginated_dataframe(report, rows_per_page=20):
|
|
373 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
374 |
end_idx = start_idx + rows_per_page
|
375 |
|
376 |
-
# Use st.markdown to display the dataframe with clickable links
|
377 |
st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
|
|
|
378 |
# -------------
|
379 |
# Main Streamlit App Function
|
380 |
# -------------
|
381 |
|
382 |
def main():
|
|
|
383 |
setup_streamlit()
|
384 |
client_config = load_config()
|
385 |
|
386 |
if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
|
387 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
388 |
|
389 |
-
# Directly access query parameters using st.query_params
|
390 |
query_params = st.query_params
|
391 |
-
|
392 |
-
# Retrieve the 'code' parameter
|
393 |
auth_code = query_params.get("code", None)
|
394 |
|
395 |
-
|
396 |
if auth_code and 'credentials' not in st.session_state:
|
397 |
st.session_state.auth_flow.fetch_token(code=auth_code)
|
398 |
st.session_state.credentials = st.session_state.auth_flow.credentials
|
@@ -408,7 +448,7 @@ def main():
|
|
408 |
webproperty = show_property_selector(properties, account)
|
409 |
search_type = show_search_type_selector()
|
410 |
date_range_selection = show_date_range_selector()
|
411 |
-
model_type = show_model_type_selector()
|
412 |
if date_range_selection == 'Custom Range':
|
413 |
show_custom_date_inputs()
|
414 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
@@ -433,7 +473,8 @@ def main():
|
|
433 |
download_csv_link(st.session_state.report_data)
|
434 |
elif st.session_state.report_data is not None:
|
435 |
st.warning("No data found for the selected criteria.")
|
|
|
436 |
|
437 |
-
|
438 |
if __name__ == "__main__":
|
|
|
439 |
main()
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
# Standard library imports
|
4 |
import datetime
|
5 |
import base64
|
|
|
18 |
import requests
|
19 |
from bs4 import BeautifulSoup
|
20 |
|
21 |
+
# Root logger: DEBUG level, timestamped "time - LEVEL - message" records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# load_dotenv() runs before the key lookup below; presumably it populates
# os.environ from a .env file -- confirm against the dotenv import.
load_dotenv()
logging.info("Environment variables loaded")

# Cohere client shared by the embedding helpers; a missing COHERE_API_KEY
# raises KeyError at import time.
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(COHERE_API_KEY)
logging.info("Cohere client initialized")

# Configuration: Set to True if running locally, False if running on Streamlit Cloud
IS_LOCAL = False
|
|
|
56 |
st.set_page_config(page_title="Keyword Relevance Test", layout="wide")
|
57 |
st.title("Keyword Relevance Test Using Vector Embedding")
|
58 |
st.divider()
|
59 |
+
logging.info("Streamlit app configured")
|
60 |
|
61 |
def init_session_state():
|
62 |
if 'selected_property' not in st.session_state:
|
|
|
77 |
st.session_state.custom_start_date = datetime.date.today() - datetime.timedelta(days=7)
|
78 |
if 'custom_end_date' not in st.session_state:
|
79 |
st.session_state.custom_end_date = datetime.date.today()
|
80 |
+
logging.info("Session state initialized")
|
81 |
|
82 |
# -------------
|
83 |
# Data Processing Functions
|
84 |
# -------------
|
85 |
|
86 |
def fetch_content(url, timeout=10):
    """Download a page and return its text content with HTML tags removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host can block the run forever.

    Returns:
        The page text (fragments joined by single spaces), or, when the
        request fails, the error message as a string.
    """
    logging.debug(f"Fetching content from URL: {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Error fetching content from URL: {url} - {e}")
        # NOTE(review): the error text is returned in place of page content,
        # so callers cannot distinguish failure from success; kept as-is for
        # backward compatibility.
        return str(e)

    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.get_text(separator=' ', strip=True)
    logging.debug(f"Content fetched successfully from URL: {url}")
    return content
|
98 |
|
99 |
def generate_embeddings(text_list, model_type, input_type='search_document'):
    """Embed a list of texts with the module-level Cohere client.

    Args:
        text_list: Texts to embed. An empty list short-circuits to ``[]``
            without calling the API.
        model_type: ``'english'`` selects ``embed-english-v3.0``; any other
            value selects ``embed-multilingual-v3.0``.
        input_type: Value forwarded to ``co.embed`` as ``input_type``.
            Defaults to ``'search_document'`` (the previously hard-coded
            value) so existing callers are unaffected; callers embedding
            search queries can pass ``'search_query'`` instead.

    Returns:
        ``response.embeddings`` from the Cohere call, or ``[]`` for empty input.
    """
    logging.debug(f"Generating embeddings for model type: {model_type}")
    if not text_list:
        logging.warning("Text list is empty, returning empty embeddings")
        return []
    model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'
    response = co.embed(model=model, texts=text_list, input_type=input_type)
    embeddings = response.embeddings
    logging.debug(f"Embeddings generated successfully for model type: {model_type}")
    return embeddings
|
110 |
|
111 |
def calculate_relevancy_scores(df, model_type):
    """Return ``df`` with a ``relevancy_score`` column added.

    For every row, the text fetched from ``df['page']`` and the string in
    ``df['query']`` are embedded, and the score is the cosine similarity of
    that row's query/page pair.

    NOTE(review): a function with this same name is defined again further
    down in this file; in Python the later definition replaces this one.

    Args:
        df: DataFrame with ``page`` and ``query`` columns.
        model_type: Passed through to ``generate_embeddings``.

    Returns:
        A new DataFrame with ``relevancy_score``. If anything fails, a
        warning is shown and the column is filled with 0.
    """
    # Local import: the file's import block is not fully visible from here.
    import numpy as np

    logging.info("Calculating relevancy scores")
    try:
        page_contents = [fetch_content(url) for url in df['page']]
        page_embeddings = np.asarray(
            generate_embeddings(page_contents, model_type), dtype=float
        )
        query_embeddings = np.asarray(
            generate_embeddings(df['query'].tolist(), model_type), dtype=float
        )
        # Only the matching (query_i, page_i) pairs are needed, so compute
        # them row by row instead of building the full N x N similarity
        # matrix and reading its diagonal.
        dots = np.einsum('ij,ij->i', query_embeddings, page_embeddings)
        norms = (
            np.linalg.norm(query_embeddings, axis=1)
            * np.linalg.norm(page_embeddings, axis=1)
        )
        # A zero-length vector yields similarity 0 rather than a division error.
        relevancy_scores = np.divide(
            dots, norms, out=np.zeros_like(dots), where=norms != 0
        )
        df = df.assign(relevancy_score=relevancy_scores)
        logging.info("Relevancy scores calculated successfully")
    except Exception as e:
        # logging.exception keeps the traceback alongside the message.
        logging.exception(f"Error calculating relevancy scores: {e}")
        st.warning(f"Error calculating relevancy scores: {e}")
        df = df.assign(relevancy_score=0)
    return df
|
125 |
|
126 |
def process_gsc_data(df):
|
127 |
+
logging.info("Processing GSC data")
|
128 |
df_sorted = df.sort_values(['impressions'], ascending=[False])
|
|
|
|
|
129 |
df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
|
130 |
|
131 |
if 'relevancy_score' not in df_unique.columns:
|
|
|
134 |
df_unique['relevancy_score'] = df_sorted.groupby('page')['relevancy_score'].first().values
|
135 |
|
136 |
result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
|
137 |
+
logging.info("GSC data processed successfully")
|
138 |
return result
|
139 |
|
140 |
# -------------
|
|
|
142 |
# -------------
|
143 |
|
144 |
def load_config():
|
145 |
+
logging.info("Loading Google client configuration")
|
146 |
client_config = {
|
147 |
"web": {
|
148 |
"client_id": os.environ["CLIENT_ID"],
|
|
|
152 |
"redirect_uris": ["https://poemsforaphrodite-gscpro.hf.space/"],
|
153 |
}
|
154 |
}
|
155 |
+
logging.info("Google client configuration loaded")
|
156 |
return client_config
|
157 |
|
158 |
def init_oauth_flow(client_config):
    """Build the OAuth ``Flow`` for read-only Search Console access.

    The redirect URI is the first entry of ``client_config["web"]["redirect_uris"]``.
    """
    logging.info("Initializing OAuth flow")
    redirect_target = client_config["web"]["redirect_uris"][0]
    oauth_flow = Flow.from_client_config(
        client_config,
        scopes=["https://www.googleapis.com/auth/webmasters.readonly"],
        redirect_uri=redirect_target,
    )
    logging.info("OAuth flow initialized")
    return oauth_flow
|
168 |
|
169 |
def google_auth(client_config):
    """Create an OAuth flow and return it with its authorization URL.

    Returns:
        ``(flow, auth_url)``; the second element returned by
        ``authorization_url`` is discarded.
    """
    logging.info("Starting Google authentication")
    oauth_flow = init_oauth_flow(client_config)
    url, _unused = oauth_flow.authorization_url(prompt="consent")
    logging.info("Google authentication URL generated")
    return oauth_flow, url
|
175 |
|
176 |
def auth_search_console(client_config, credentials):
|
177 |
+
logging.info("Authenticating with Google Search Console")
|
178 |
token = {
|
179 |
"token": credentials.token,
|
180 |
"refresh_token": credentials.refresh_token,
|
|
|
184 |
"scopes": credentials.scopes,
|
185 |
"id_token": getattr(credentials, "id_token", None),
|
186 |
}
|
187 |
+
logging.info("Google Search Console authenticated")
|
188 |
return searchconsole.authenticate(client_config=client_config, credentials=token)
|
189 |
|
190 |
# -------------
|
|
|
192 |
# -------------
|
193 |
|
194 |
def list_gsc_properties(credentials):
    """List the site URLs returned by the webmasters v3 ``sites().list()`` call.

    Returns:
        The ``siteUrl`` of every ``siteEntry`` in the response, or the
        single-item list ``["No properties found"]`` when there are none.
    """
    logging.info("Listing GSC properties")
    webmasters = build('webmasters', 'v3', credentials=credentials)
    response = webmasters.sites().list().execute()
    site_urls = [entry['siteUrl'] for entry in response.get('siteEntry', [])]
    if not site_urls:
        site_urls = ["No properties found"]
    logging.info(f"GSC properties listed: {site_urls}")
    return site_urls
|
201 |
|
202 |
def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
    """Query Search Console and return the result run through ``process_gsc_data``.

    A device filter is applied only when ``'device'`` is among ``dimensions``
    and ``device_type`` is set to something other than ``'All Devices'``.
    Any exception is reported via ``show_error`` and an empty DataFrame is
    returned instead.
    """
    logging.info(f"Fetching GSC data for property: {webproperty}, search_type: {search_type}, date_range: {start_date} to {end_date}, dimensions: {dimensions}, device_type: {device_type}")

    # Build the query step by step rather than as one chained expression.
    query = webproperty.query.range(start_date, end_date)
    query = query.search_type(search_type)
    query = query.dimension(*dimensions)

    if 'device' in dimensions and device_type:
        if device_type != 'All Devices':
            query = query.filter('device', 'equals', device_type.lower())

    try:
        raw = query.limit(MAX_ROWS).get().to_dataframe()
        logging.info("GSC data fetched successfully")
        processed = process_gsc_data(raw)
    except Exception as e:
        logging.error(f"Error fetching GSC data: {e}")
        show_error(e)
        return pd.DataFrame()
    return processed
|
215 |
|
216 |
def calculate_relevancy_scores(df, model_type):
|
217 |
+
logging.info("Calculating relevancy scores")
|
218 |
with st.spinner('Calculating relevancy scores...'):
|
219 |
try:
|
220 |
page_contents = [fetch_content(url) for url in df['page']]
|
|
|
222 |
query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
|
223 |
relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
|
224 |
df = df.assign(relevancy_score=relevancy_scores)
|
225 |
+
logging.info("Relevancy scores calculated successfully")
|
226 |
except Exception as e:
|
227 |
+
logging.error(f"Error calculating relevancy scores: {e}")
|
228 |
st.warning(f"Error calculating relevancy scores: {e}")
|
229 |
df = df.assign(relevancy_score=0)
|
230 |
return df
|
231 |
|
|
|
232 |
# -------------
|
233 |
# Utility Functions
|
234 |
# -------------
|
235 |
|
236 |
def update_dimensions(selected_search_type):
    """Return the dimensions available for a search type.

    Search types listed in ``SEARCH_TYPES`` get ``BASE_DIMENSIONS`` plus
    ``'device'`` (a new list); anything else gets ``BASE_DIMENSIONS`` itself.
    """
    logging.debug(f"Updating dimensions for search type: {selected_search_type}")
    if selected_search_type in SEARCH_TYPES:
        return BASE_DIMENSIONS + ['device']
    return BASE_DIMENSIONS
|
239 |
|
240 |
def calc_date_range(selection, custom_start=None, custom_end=None):
|
241 |
+
logging.debug(f"Calculating date range for selection: {selection}")
|
242 |
range_map = {
|
243 |
'Last 7 Days': 7,
|
244 |
'Last 30 Days': 30,
|
|
|
250 |
today = datetime.date.today()
|
251 |
if selection == 'Custom Range':
|
252 |
if custom_start and custom_end:
|
253 |
+
logging.debug(f"Custom date range: {custom_start} to {custom_end}")
|
254 |
return custom_start, custom_end
|
255 |
else:
|
256 |
+
logging.debug("Defaulting custom date range to last 7 days")
|
257 |
return today - datetime.timedelta(days=7), today
|
258 |
+
date_range = today - datetime.timedelta(days=range_map.get(selection, 0)), today
|
259 |
+
logging.debug(f"Date range calculated: {date_range}")
|
260 |
+
return date_range
|
261 |
|
262 |
def show_error(e):
    """Log an error and display the same message in the Streamlit UI."""
    message = f"An error occurred: {e}"
    logging.error(message)
    st.error(message)
|
265 |
|
266 |
def property_change():
    """Copy the ``selected_property_selector`` session value into ``selected_property``."""
    chosen = st.session_state['selected_property_selector']
    logging.info(f"Property changed to: {chosen}")
    st.session_state.selected_property = chosen
|
269 |
|
270 |
# -------------
|
|
|
272 |
# -------------
|
273 |
|
274 |
def show_dataframe(report):
    """Render the first ``DF_PREVIEW_ROWS`` rows of ``report`` inside an expander."""
    logging.info("Showing dataframe preview")
    preview_rows = report.head(DF_PREVIEW_ROWS)
    expander = st.expander("Preview the First 100 Rows (Unique Pages with Top Query)")
    with expander:
        st.dataframe(preview_rows)
|
278 |
|
279 |
def download_csv_link(report):
    """Render an HTML link that downloads ``report`` as ``search_console_data.csv``.

    The CSV is embedded in the link as a base64 ``data:`` URI.

    Args:
        report: DataFrame to export (written without the index).
    """
    logging.info("Generating CSV download link")
    # to_csv() with no path returns a str, so an ``encoding`` argument there
    # has no effect on the result. The UTF-8 BOM is therefore added when the
    # text is turned into bytes.
    csv_text = report.to_csv(index=False)
    b64_csv = base64.b64encode(csv_text.encode('utf-8-sig')).decode('ascii')
    # text/csv is the registered media type for CSV (file/csv is not).
    href = f'<a href="data:text/csv;charset=utf-8;base64,{b64_csv}" download="search_console_data.csv">Download CSV File</a>'
    st.markdown(href, unsafe_allow_html=True)
    logging.info("CSV download link generated")
|
288 |
|
289 |
# -------------
|
290 |
# Streamlit UI Components
|
291 |
# -------------
|
292 |
|
293 |
def show_google_sign_in(auth_url):
    """Show a sidebar sign-in button; once pressed, display the auth link."""
    logging.info("Showing Google sign-in button")
    with st.sidebar:
        pressed = st.button("Sign in with Google")
        if not pressed:
            return
        st.write('Please click the link below to sign in:')
        st.markdown(f'[Google Sign-In]({auth_url})', unsafe_allow_html=True)
|
299 |
|
300 |
def show_property_selector(properties, account):
|
301 |
+
logging.info("Showing property selector")
|
302 |
selected_property = st.selectbox(
|
303 |
"Select a Search Console Property:",
|
304 |
properties,
|
|
|
310 |
return account[selected_property]
|
311 |
|
312 |
def show_search_type_selector():
|
313 |
+
logging.info("Showing search type selector")
|
314 |
return st.selectbox(
|
315 |
"Select Search Type:",
|
316 |
SEARCH_TYPES,
|
|
|
319 |
)
|
320 |
|
321 |
def show_model_type_selector():
|
322 |
+
logging.info("Showing model type selector")
|
323 |
return st.selectbox(
|
324 |
"Select the embedding model:",
|
325 |
["english", "multilingual"],
|
|
|
327 |
)
|
328 |
|
329 |
def show_date_range_selector():
|
330 |
+
logging.info("Showing date range selector")
|
331 |
return st.selectbox(
|
332 |
"Select Date Range:",
|
333 |
DATE_RANGE_OPTIONS,
|
|
|
336 |
)
|
337 |
|
338 |
def show_custom_date_inputs():
    """Show start/end date pickers and store their values in session state."""
    logging.info("Showing custom date inputs")
    state = st.session_state
    # Each picker is pre-filled with the currently stored date.
    state.custom_start_date = st.date_input("Start Date", state.custom_start_date)
    state.custom_end_date = st.date_input("End Date", state.custom_end_date)
|
342 |
|
343 |
def show_dimensions_selector(search_type):
|
344 |
+
logging.info("Showing dimensions selector")
|
345 |
available_dimensions = update_dimensions(search_type)
|
346 |
return st.multiselect(
|
347 |
"Select Dimensions:",
|
|
|
351 |
)
|
352 |
|
353 |
def show_paginated_dataframe(report, rows_per_page=20):
|
354 |
+
logging.info("Showing paginated dataframe")
|
355 |
report['position'] = report['position'].astype(int)
|
356 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
357 |
|
|
|
358 |
def format_ctr(x):
|
359 |
try:
|
360 |
return f"{float(x):.2%}"
|
361 |
except ValueError:
|
362 |
+
return x
|
363 |
|
364 |
def format_relevancy_score(x):
|
365 |
try:
|
366 |
return f"{float(x):.2f}"
|
367 |
except ValueError:
|
368 |
+
return x
|
369 |
|
370 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
371 |
report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
|
372 |
|
|
|
373 |
def make_clickable(url):
|
374 |
return f'<a href="{url}" target="_blank">{url}</a>'
|
375 |
|
376 |
report['clickable_url'] = report['page'].apply(make_clickable)
|
377 |
|
|
|
378 |
columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
|
379 |
report = report[columns]
|
380 |
|
381 |
+
sort_column = st.selectbox("Sort by:", columns[1:], index=columns[1:].index('impressions'))
|
|
|
382 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
383 |
|
384 |
ascending = sort_order == "Ascending"
|
385 |
|
|
|
386 |
def safe_float_convert(x):
|
387 |
try:
|
388 |
return float(x.rstrip('%')) / 100 if isinstance(x, str) and x.endswith('%') else float(x)
|
389 |
except ValueError:
|
390 |
+
return 0
|
391 |
|
392 |
report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
|
393 |
report['relevancy_score_numeric'] = report['relevancy_score'].apply(safe_float_convert)
|
394 |
|
|
|
395 |
sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
|
396 |
report = report.sort_values(by=sort_column_numeric, ascending=ascending)
|
397 |
|
|
|
398 |
report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
|
399 |
|
400 |
total_rows = len(report)
|
|
|
416 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
417 |
end_idx = start_idx + rows_per_page
|
418 |
|
|
|
419 |
st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
|
420 |
+
|
421 |
# -------------
|
422 |
# Main Streamlit App Function
|
423 |
# -------------
|
424 |
|
425 |
def main():
|
426 |
+
logging.info("Starting main function")
|
427 |
setup_streamlit()
|
428 |
client_config = load_config()
|
429 |
|
430 |
if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
|
431 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
432 |
|
|
|
433 |
query_params = st.query_params
|
|
|
|
|
434 |
auth_code = query_params.get("code", None)
|
435 |
|
|
|
436 |
if auth_code and 'credentials' not in st.session_state:
|
437 |
st.session_state.auth_flow.fetch_token(code=auth_code)
|
438 |
st.session_state.credentials = st.session_state.auth_flow.credentials
|
|
|
448 |
webproperty = show_property_selector(properties, account)
|
449 |
search_type = show_search_type_selector()
|
450 |
date_range_selection = show_date_range_selector()
|
451 |
+
model_type = show_model_type_selector()
|
452 |
if date_range_selection == 'Custom Range':
|
453 |
show_custom_date_inputs()
|
454 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
|
|
473 |
download_csv_link(st.session_state.report_data)
|
474 |
elif st.session_state.report_data is not None:
|
475 |
st.warning("No data found for the selected criteria.")
|
476 |
+
logging.warning("No data found for the selected criteria")
|
477 |
|
|
|
478 |
if __name__ == "__main__":
|
479 |
+
logging.info("Running main function")
|
480 |
main()
|