Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Commit
•
67a6275
1
Parent(s):
9f4df6e
Update app.py
Browse files
app.py
CHANGED
@@ -120,8 +120,6 @@ def init_session_state():
|
|
120 |
st.session_state.custom_start_date = datetime.date.today() - datetime.timedelta(days=7)
|
121 |
if 'custom_end_date' not in st.session_state:
|
122 |
st.session_state.custom_end_date = datetime.date.today()
|
123 |
-
if 'relevancy_scores' not in st.session_state:
|
124 |
-
st.session_state.relevancy_scores = {}
|
125 |
#logging.info("Session state initialized")
|
126 |
|
127 |
# -------------
|
@@ -223,21 +221,20 @@ def fetch_content(url, query):
|
|
223 |
except requests.RequestException:
|
224 |
return ""
|
225 |
|
226 |
-
def calculate_relevance_score(page_content, query, co
|
|
|
227 |
try:
|
228 |
-
if not page_content
|
229 |
-
|
230 |
-
return 0
|
231 |
-
model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'
|
232 |
-
embeddings = co.embed(texts=[page_content, query], model=model, input_type=['search_document', 'search_query'])
|
233 |
-
page_embedding = embeddings.embeddings[0]
|
234 |
-
query_embedding = embeddings.embeddings[1]
|
235 |
-
if not any(page_embedding) or not any(query_embedding):
|
236 |
-
st.warning("One of the embeddings is empty. Returning a score of 0.")
|
237 |
return 0
|
|
|
|
|
|
|
238 |
score = cosine_similarity([query_embedding], [page_embedding])[0][0]
|
|
|
239 |
return score
|
240 |
except Exception as e:
|
|
|
241 |
st.error(f"Error calculating relevance score: {str(e)}")
|
242 |
return 0
|
243 |
|
@@ -247,47 +244,46 @@ def normalize_url(url):
|
|
247 |
def analyze_competitors(row, co, custom_url=None, country_code=None):
|
248 |
query = row['query']
|
249 |
our_url = normalize_url(row['page'])
|
250 |
-
|
251 |
-
|
252 |
competitor_data = get_serp_results(query, country_code)
|
253 |
-
|
254 |
results = []
|
255 |
-
our_url_found = False # Flag to check if our URL is in the results
|
256 |
-
|
257 |
for data in competitor_data:
|
258 |
competitor_url = normalize_url(data['url'])
|
259 |
-
score = calculate_relevance_score(data['content'], query, co
|
260 |
-
is_our = competitor_url == our_url
|
261 |
-
if is_our:
|
262 |
-
our_url_found = True
|
263 |
results.append({
|
264 |
'Position': data['position'],
|
265 |
'URL': competitor_url,
|
266 |
'Score': score,
|
267 |
-
'is_our_url':
|
268 |
})
|
269 |
-
|
270 |
-
|
271 |
-
our_score =
|
272 |
-
|
273 |
-
if not
|
274 |
results.append({
|
275 |
'Position': len(results) + 1,
|
276 |
-
'URL':
|
277 |
'Score': our_score,
|
278 |
'is_our_url': True
|
279 |
})
|
280 |
-
|
281 |
-
# Sort results by position
|
282 |
results = sorted(results, key=lambda x: x['Position'])
|
283 |
-
|
284 |
# Create DataFrame
|
285 |
results_df = pd.DataFrame(results)
|
286 |
results_df['Position'] = results_df['Position'].astype(int)
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
288 |
# Keep only the columns we want to display
|
289 |
results_df = results_df[['Position', 'URL', 'Score']]
|
290 |
-
|
291 |
return results_df
|
292 |
|
293 |
def show_competitor_analysis(row, co, country_code):
|
@@ -411,10 +407,19 @@ def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, d
|
|
411 |
|
412 |
|
413 |
def calculate_relevancy_scores(df, model_type):
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
418 |
return df
|
419 |
|
420 |
# -------------
|
@@ -517,9 +522,7 @@ def show_model_type_selector():
|
|
517 |
def calculate_single_relevancy(row):
|
518 |
page_content = fetch_content(row['page'], row['query'])
|
519 |
query = row['query']
|
520 |
-
|
521 |
-
score = calculate_relevance_score(page_content, query, co, model_type=model_type)
|
522 |
-
st.session_state.relevancy_scores[normalize_url(row['page'])] = score # Ensure score is stored
|
523 |
return score
|
524 |
|
525 |
def compare_with_top_result(row, co, country_code):
|
@@ -540,11 +543,9 @@ def compare_with_top_result(row, co, country_code):
|
|
540 |
our_content = fetch_content(our_url, query)
|
541 |
top_content = top_result['content']
|
542 |
|
543 |
-
# Retrieve "Our Score" from the main data table
|
544 |
-
our_score = st.session_state['relevancy_scores'].get(normalize_url(our_url), 0)
|
545 |
-
|
546 |
# Calculate relevancy scores
|
547 |
-
|
|
|
548 |
|
549 |
# Prepare prompt for GPT-4
|
550 |
prompt = f"""
|
@@ -575,12 +576,12 @@ def compare_with_top_result(row, co, country_code):
|
|
575 |
|
576 |
# Display results
|
577 |
st.subheader("Content Comparison Analysis")
|
578 |
-
st.write(f"
|
579 |
-
st.write(f"
|
580 |
-
st.write(f"
|
581 |
-
st.write(f"
|
582 |
-
st.write(f"
|
583 |
-
st.write("
|
584 |
st.write(analysis)
|
585 |
except Exception as e:
|
586 |
st.error(f"Error in GPT-4 analysis: {str(e)}")
|
@@ -628,16 +629,12 @@ def show_tabular_data(df, co, country_code):
|
|
628 |
)
|
629 |
if st.button("Click here to calculate relevancy for selected pages"):
|
630 |
selected_indices = [i for i, selected in enumerate(st.session_state.selected_rows) if selected]
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
st.success(f"Calculated relevancy scores for {len(selected_indices)} selected rows.")
|
638 |
-
st.session_state.report_data = df # Update the report_data in session state
|
639 |
-
else:
|
640 |
-
st.warning("No rows selected. Please select at least one row to calculate relevancy.")
|
641 |
|
642 |
# Display column headers
|
643 |
cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
|
@@ -650,8 +647,9 @@ def show_tabular_data(df, co, country_code):
|
|
650 |
cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
|
651 |
|
652 |
# Checkbox for row selection
|
653 |
-
cols[0].checkbox("
|
654 |
-
on_change=lambda idx=i:
|
|
|
655 |
|
656 |
# Truncate and make the URL clickable
|
657 |
truncated_url = row.page[:30] + '...' if len(row.page) > 30 else row.page
|
@@ -680,7 +678,7 @@ def show_tabular_data(df, co, country_code):
|
|
680 |
if st.session_state[competitor_state_key]:
|
681 |
st.write(f"Competitor Analysis for: {row.query}")
|
682 |
with st.spinner('Analyzing competitors...'):
|
683 |
-
results_df = analyze_competitors(row, co, country_code=country_code)
|
684 |
|
685 |
# Sort the results by Position in ascending order
|
686 |
results_df = results_df.sort_values('Position', ascending=True).reset_index(drop=True)
|
@@ -720,11 +718,11 @@ def show_tabular_data(df, co, country_code):
|
|
720 |
st.warning("Your page's relevancy score is in the lower half of the results. Consider optimizing your content.")
|
721 |
else:
|
722 |
st.error(f"Our page '{row.page}' is not in the results. This indicates an error in fetching or processing the page.")
|
723 |
-
|
724 |
if compare_state_key not in st.session_state:
|
725 |
st.session_state[compare_state_key] = False
|
726 |
|
727 |
-
if
|
728 |
st.session_state[compare_state_key] = True
|
729 |
|
730 |
if st.session_state[compare_state_key]:
|
@@ -734,9 +732,6 @@ def show_tabular_data(df, co, country_code):
|
|
734 |
|
735 |
return df # Return the updated dataframe
|
736 |
|
737 |
-
def update_selected_rows(idx):
|
738 |
-
st.session_state.selected_rows[idx] = not st.session_state.selected_rows[idx]
|
739 |
-
|
740 |
def show_date_range_selector():
|
741 |
# logging.info("Showing date range selector")
|
742 |
return st.selectbox(
|
@@ -889,8 +884,7 @@ def main():
|
|
889 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
890 |
st.write("Data fetched successfully.")
|
891 |
|
892 |
-
|
893 |
-
st.session_state.report_data = updated_df # Update the report_data with the potentially modified dataframe
|
894 |
|
895 |
download_csv_link(st.session_state.report_data)
|
896 |
elif st.session_state.report_data is not None:
|
|
|
120 |
st.session_state.custom_start_date = datetime.date.today() - datetime.timedelta(days=7)
|
121 |
if 'custom_end_date' not in st.session_state:
|
122 |
st.session_state.custom_end_date = datetime.date.today()
|
|
|
|
|
123 |
#logging.info("Session state initialized")
|
124 |
|
125 |
# -------------
|
|
|
221 |
except requests.RequestException:
|
222 |
return ""
|
223 |
|
224 |
+
def calculate_relevance_score(page_content, query, co, model_type='english'):
    """Score how relevant a page is to a search query using Cohere embeddings.

    Args:
        page_content: Text of the page to score.
        query: Search query the page is compared against.
        co: Client object exposing ``embed(texts=..., model=..., input_type=...)``
            whose result has an ``embeddings`` list.
        model_type: ``'english'`` (the default, matching the previous
            hard-coded behavior) selects ``embed-english-v3.0``; any other
            value selects ``embed-multilingual-v3.0``.

    Returns:
        The cosine similarity between the query embedding and the page
        embedding, or ``0`` when the page content is missing/blank or when
        any error occurs (the error is reported through ``st.error``).
    """
    try:
        # Blank content (None, "", or whitespace only) cannot be embedded
        # meaningfully; score it 0 instead of spending an API call on it.
        if not page_content or not str(page_content).strip():
            return 0

        model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'

        # Documents and queries are embedded in separate calls because each
        # call takes a single input_type.
        page_embedding = co.embed(
            texts=[page_content], model=model, input_type='search_document'
        ).embeddings[0]
        query_embedding = co.embed(
            texts=[query], model=model, input_type='search_query'
        ).embeddings[0]

        score = cosine_similarity([query_embedding], [page_embedding])[0][0]
        return score
    except Exception as e:
        # Best-effort by design: a failed score must not abort the UI flow.
        st.error(f"Error calculating relevance score: {str(e)}")
        return 0
|
240 |
|
|
|
244 |
def analyze_competitors(row, co, custom_url=None, country_code=None):
    """Build a table of SERP results and their relevance scores for one query.

    Args:
        row: Mapping providing the ``'query'`` and ``'page'`` keys.
        co: Embedding client forwarded to ``calculate_relevance_score``.
        custom_url: Accepted for interface compatibility; not used here.
        country_code: Forwarded to ``get_serp_results``.

    Returns:
        A DataFrame with the columns ``Position``, ``URL`` and ``Score``,
        ordered by position. The row for our own page has `` (Our URL)``
        appended to its URL. If our page is not among the SERP results it is
        appended after them with a freshly computed score.
    """
    query = row['query']
    our_url = normalize_url(row['page'])

    results = []
    our_url_found = False
    for data in get_serp_results(query, country_code):
        competitor_url = normalize_url(data['url'])
        is_our_url = competitor_url == our_url
        our_url_found = our_url_found or is_our_url
        results.append({
            'Position': data['position'],
            'URL': competitor_url,
            'Score': calculate_relevance_score(data['content'], query, co),
            'is_our_url': is_our_url,
        })

    # Only fetch and score our own page when it is actually missing from the
    # SERP results; otherwise the download and the embedding call would be
    # paid for and then thrown away.
    if not our_url_found:
        our_content = fetch_content(our_url, query)
        results.append({
            'Position': len(results) + 1,
            'URL': our_url,
            'Score': calculate_relevance_score(our_content, query, co),
            'is_our_url': True,
        })

    # Sort results by position
    results.sort(key=lambda entry: entry['Position'])

    # Create DataFrame
    results_df = pd.DataFrame(results)
    results_df['Position'] = results_df['Position'].astype(int)

    # Mark our URL
    results_df['URL'] = results_df.apply(
        lambda x: f"{x['URL']} (Our URL)" if x['is_our_url'] else x['URL'], axis=1
    )

    # Keep only the columns we want to display
    return results_df[['Position', 'URL', 'Score']]
|
288 |
|
289 |
def show_competitor_analysis(row, co, country_code):
|
|
|
407 |
|
408 |
|
409 |
def calculate_relevancy_scores(df, model_type):
    """Return ``df`` with a ``relevancy_score`` column for every row.

    Each row's page (``df['page']``) is fetched, pages and queries
    (``df['query']``) are embedded with ``generate_embeddings``, and the score
    is the cosine similarity between a row's query and its own page.

    Args:
        df: DataFrame with ``page`` and ``query`` columns.
        model_type: Forwarded to ``generate_embeddings``.

    Returns:
        A new DataFrame (via ``DataFrame.assign``) with ``relevancy_score``
        added. On any failure a warning is shown and every score is set to 0.
    """
    with st.spinner('Calculating relevancy scores...'):
        try:
            # fetch_content takes (url, query). Calling it with the URL alone
            # raised TypeError, which the handler below swallowed, silently
            # zeroing every score.
            page_contents = [
                fetch_content(url, query)
                for url, query in zip(df['page'], df['query'])
            ]
            page_embeddings = generate_embeddings(page_contents, model_type)
            query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
            # The diagonal pairs row i's query with row i's page.
            relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
            df = df.assign(relevancy_score=relevancy_scores)
        except Exception as e:
            # Best-effort: keep the table usable even when scoring fails.
            st.warning(f"Error calculating relevancy scores: {e}")
            df = df.assign(relevancy_score=0)
    return df
|
424 |
|
425 |
# -------------
|
|
|
522 |
def calculate_single_relevancy(row):
    """Fetch one row's page and return its relevance score for the row's query.

    Args:
        row: Mapping providing the ``'page'`` and ``'query'`` keys.

    Returns:
        Whatever ``calculate_relevance_score`` returns for the fetched content.
    """
    # NOTE(review): `co` is not a parameter; it is resolved from an enclosing
    # or module scope that is not visible here — confirm it is defined.
    fetched_text = fetch_content(row['page'], row['query'])
    return calculate_relevance_score(fetched_text, row['query'], co)
|
527 |
|
528 |
def compare_with_top_result(row, co, country_code):
|
|
|
543 |
our_content = fetch_content(our_url, query)
|
544 |
top_content = top_result['content']
|
545 |
|
|
|
|
|
|
|
546 |
# Calculate relevancy scores
|
547 |
+
our_score = calculate_relevance_score(our_content, query, co)
|
548 |
+
top_score = calculate_relevance_score(top_content, query, co)
|
549 |
|
550 |
# Prepare prompt for GPT-4
|
551 |
prompt = f"""
|
|
|
576 |
|
577 |
# Display results
|
578 |
st.subheader("Content Comparison Analysis")
|
579 |
+
st.write(f"Query: {query}")
|
580 |
+
st.write(f"Top-ranking URL: {top_url}")
|
581 |
+
st.write(f"Our URL: {our_url}")
|
582 |
+
st.write(f"Top-ranking score: {top_score:.4f}")
|
583 |
+
st.write(f"Our score: {our_score:.4f}")
|
584 |
+
st.write("Analysis:")
|
585 |
st.write(analysis)
|
586 |
except Exception as e:
|
587 |
st.error(f"Error in GPT-4 analysis: {str(e)}")
|
|
|
629 |
)
|
630 |
if st.button("Click here to calculate relevancy for selected pages"):
|
631 |
selected_indices = [i for i, selected in enumerate(st.session_state.selected_rows) if selected]
|
632 |
+
with st.spinner('Calculating relevancy scores...'):
|
633 |
+
for index in selected_indices:
|
634 |
+
if pd.isna(df.iloc[index]['relevancy_score']) or df.iloc[index]['relevancy_score'] == 0:
|
635 |
+
df.iloc[index, df.columns.get_loc('relevancy_score')] = calculate_single_relevancy(df.iloc[index])
|
636 |
+
st.success(f"Calculated relevancy scores for {len(selected_indices)} selected rows.")
|
637 |
+
st.experimental_rerun()
|
|
|
|
|
|
|
|
|
638 |
|
639 |
# Display column headers
|
640 |
cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
|
|
|
647 |
cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
|
648 |
|
649 |
# Checkbox for row selection
|
650 |
+
cols[0].checkbox("", key=f"select_{i}", value=st.session_state.selected_rows[i],
|
651 |
+
on_change=lambda idx=i: setattr(st.session_state, 'selected_rows',
|
652 |
+
[True if j == idx else x for j, x in enumerate(st.session_state.selected_rows)]))
|
653 |
|
654 |
# Truncate and make the URL clickable
|
655 |
truncated_url = row.page[:30] + '...' if len(row.page) > 30 else row.page
|
|
|
678 |
if st.session_state[competitor_state_key]:
|
679 |
st.write(f"Competitor Analysis for: {row.query}")
|
680 |
with st.spinner('Analyzing competitors...'):
|
681 |
+
results_df = analyze_competitors(row._asdict(), co, country_code=country_code)
|
682 |
|
683 |
# Sort the results by Position in ascending order
|
684 |
results_df = results_df.sort_values('Position', ascending=True).reset_index(drop=True)
|
|
|
718 |
st.warning("Your page's relevancy score is in the lower half of the results. Consider optimizing your content.")
|
719 |
else:
|
720 |
st.error(f"Our page '{row.page}' is not in the results. This indicates an error in fetching or processing the page.")
|
721 |
+
|
722 |
if compare_state_key not in st.session_state:
|
723 |
st.session_state[compare_state_key] = False
|
724 |
|
725 |
+
if st.button("Compare Your Relevancy Score to the Page In First Place", key=compare_button_key):
|
726 |
st.session_state[compare_state_key] = True
|
727 |
|
728 |
if st.session_state[compare_state_key]:
|
|
|
732 |
|
733 |
return df # Return the updated dataframe
|
734 |
|
|
|
|
|
|
|
735 |
def show_date_range_selector():
|
736 |
# logging.info("Showing date range selector")
|
737 |
return st.selectbox(
|
|
|
884 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
885 |
st.write("Data fetched successfully.")
|
886 |
|
887 |
+
st.session_state.report_data = show_tabular_data(st.session_state.report_data, co, country_code)
|
|
|
888 |
|
889 |
download_csv_link(st.session_state.report_data)
|
890 |
elif st.session_state.report_data is not None:
|