Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Commit
•
f1d6ab9
1
Parent(s):
302324f
Update app.py
Browse files
app.py
CHANGED
@@ -143,7 +143,9 @@ def get_serp_results(query):
|
|
143 |
def fetch_content(url):
|
144 |
logger.info(f"Fetching content from URL: {url}")
|
145 |
try:
|
146 |
-
|
|
|
|
|
147 |
response.raise_for_status()
|
148 |
soup = BeautifulSoup(response.text, 'html.parser')
|
149 |
content = soup.get_text(separator=' ', strip=True)
|
@@ -157,6 +159,10 @@ def fetch_content(url):
|
|
157 |
def calculate_relevance_score(page_content, query, co):
|
158 |
logger.info(f"Calculating relevance score for query: {query}")
|
159 |
try:
|
|
|
|
|
|
|
|
|
160 |
page_embedding = co.embed(texts=[page_content], model='embed-english-v3.0', input_type='search_document').embeddings[0]
|
161 |
query_embedding = co.embed(texts=[query], model='embed-english-v3.0', input_type='search_query').embeddings[0]
|
162 |
score = cosine_similarity([query_embedding], [page_embedding])[0][0]
|
@@ -177,13 +183,12 @@ def analyze_competitors(row, co):
|
|
177 |
results = []
|
178 |
for url in [our_url] + competitor_urls:
|
179 |
try:
|
180 |
-
logger.debug(f"
|
181 |
content = fetch_content(url)
|
182 |
if not content:
|
183 |
logger.warning(f"No content fetched for URL: {url}")
|
184 |
continue
|
185 |
|
186 |
-
logger.debug(f"Calculating relevance score for URL: {url}")
|
187 |
score = calculate_relevance_score(content, query, co)
|
188 |
|
189 |
logger.info(f"URL: {url}, Score: {score}")
|
@@ -209,6 +214,15 @@ def show_competitor_analysis(row, co):
|
|
209 |
if our_data.empty:
|
210 |
st.error(f"Our page '{row['page']}' is not in the results. This indicates an error in fetching or processing the page.")
|
211 |
logger.error(f"Our page '{row['page']}' is missing from the results.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
else:
|
213 |
our_rank = our_data.index[0] + 1
|
214 |
total_results = len(results_df)
|
@@ -220,6 +234,14 @@ def show_competitor_analysis(row, co):
|
|
220 |
|
221 |
if our_score == 0:
|
222 |
st.warning("Our page's relevancy score is 0. This might indicate an issue with content fetching or score calculation.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
elif our_rank == 1:
|
224 |
st.success("Your page has the highest relevancy score!")
|
225 |
elif our_rank <= 3:
|
|
|
143 |
def fetch_content(url):
|
144 |
logger.info(f"Fetching content from URL: {url}")
|
145 |
try:
|
146 |
+
# Decode URL-encoded characters
|
147 |
+
decoded_url = urllib.parse.unquote(url)
|
148 |
+
response = requests.get(decoded_url, timeout=10)
|
149 |
response.raise_for_status()
|
150 |
soup = BeautifulSoup(response.text, 'html.parser')
|
151 |
content = soup.get_text(separator=' ', strip=True)
|
|
|
159 |
def calculate_relevance_score(page_content, query, co):
|
160 |
logger.info(f"Calculating relevance score for query: {query}")
|
161 |
try:
|
162 |
+
if not page_content:
|
163 |
+
logger.warning("Empty page content. Returning score 0.")
|
164 |
+
return 0
|
165 |
+
|
166 |
page_embedding = co.embed(texts=[page_content], model='embed-english-v3.0', input_type='search_document').embeddings[0]
|
167 |
query_embedding = co.embed(texts=[query], model='embed-english-v3.0', input_type='search_query').embeddings[0]
|
168 |
score = cosine_similarity([query_embedding], [page_embedding])[0][0]
|
|
|
183 |
results = []
|
184 |
for url in [our_url] + competitor_urls:
|
185 |
try:
|
186 |
+
logger.debug(f"Processing URL: {url}")
|
187 |
content = fetch_content(url)
|
188 |
if not content:
|
189 |
logger.warning(f"No content fetched for URL: {url}")
|
190 |
continue
|
191 |
|
|
|
192 |
score = calculate_relevance_score(content, query, co)
|
193 |
|
194 |
logger.info(f"URL: {url}, Score: {score}")
|
|
|
214 |
if our_data.empty:
|
215 |
st.error(f"Our page '{row['page']}' is not in the results. This indicates an error in fetching or processing the page.")
|
216 |
logger.error(f"Our page '{row['page']}' is missing from the results.")
|
217 |
+
|
218 |
+
# Additional debugging information
|
219 |
+
st.write("Debugging Information:")
|
220 |
+
st.json({
|
221 |
+
"our_url": row['page'],
|
222 |
+
"query": row['query'],
|
223 |
+
"content_fetched": fetch_content(row['page']),
|
224 |
+
"urls_processed": results_df['url'].tolist()
|
225 |
+
})
|
226 |
else:
|
227 |
our_rank = our_data.index[0] + 1
|
228 |
total_results = len(results_df)
|
|
|
234 |
|
235 |
if our_score == 0:
|
236 |
st.warning("Our page's relevancy score is 0. This might indicate an issue with content fetching or score calculation.")
|
237 |
+
# Additional debugging information
|
238 |
+
st.write("Debugging Information:")
|
239 |
+
content = fetch_content(row['page'])
|
240 |
+
st.json({
|
241 |
+
"content_length": len(content),
|
242 |
+
"content_preview": content[:500] if content else "No content fetched",
|
243 |
+
"query": row['query']
|
244 |
+
})
|
245 |
elif our_rank == 1:
|
246 |
st.success("Your page has the highest relevancy score!")
|
247 |
elif our_rank <= 3:
|