Chris4K committed on
Commit 766e109 · verified · 1 Parent(s): d3d2c50

Update services/faq_service.py

Files changed (1)
  1. services/faq_service.py +102 -28
services/faq_service.py CHANGED
@@ -1,10 +1,11 @@
- # services/faq_service.py
  from typing import List, Dict, Any, Optional
  import aiohttp
  from bs4 import BeautifulSoup
  import faiss
  import logging
  from config.config import settings
+ import asyncio
+ from urllib.parse import urljoin

  logger = logging.getLogger(__name__)

@@ -13,54 +14,118 @@ class FAQService:
          self.embedder = model_service.embedder
          self.faiss_index = None
          self.faq_data = []
+         self.visited_urls = set()
+         self.base_url = "https://www.bofrost.de/faq/"

      async def fetch_faq_pages(self) -> List[Dict[str, Any]]:
          async with aiohttp.ClientSession() as session:
              try:
-                 async with session.get(f"{settings.FAQ_ROOT_URL}sitemap.xml", timeout=settings.TIMEOUT) as response:
-                     if response.status == 200:
-                         sitemap = await response.text()
-                         soup = BeautifulSoup(sitemap, 'xml')
-                         faq_urls = [loc.text for loc in soup.find_all('loc') if "/faq/" in loc.text]
-
-                         tasks = [self.fetch_faq_content(url, session) for url in faq_urls]
-                         return await asyncio.gather(*tasks)
+                 # Start with the main FAQ page
+                 pages = await self.crawl_faq_pages(self.base_url, session)
+                 return [page for page in pages if page]
              except Exception as e:
-                 logger.error(f"Error fetching FAQ sitemap: {e}")
+                 logger.error(f"Error fetching FAQ pages: {e}")
                  return []

-     async def fetch_faq_content(self, url: str, session: aiohttp.ClientSession) -> Optional[Dict[str, Any]]:
+     async def crawl_faq_pages(self, url: str, session: aiohttp.ClientSession) -> List[Dict[str, Any]]:
+         if url in self.visited_urls or not url.startswith(self.base_url):
+             return []
+
+         self.visited_urls.add(url)
+         pages = []
+
          try:
              async with session.get(url, timeout=settings.TIMEOUT) as response:
                  if response.status == 200:
                      content = await response.text()
                      soup = BeautifulSoup(content, 'html.parser')
-
-                     faq_title = soup.find('h1').text.strip() if soup.find('h1') else "Unknown Title"
-                     faqs = []
-                     sections = soup.find_all(['div', 'section'], class_=['faq-item', 'faq-section'])
-
-                     for section in sections:
-                         question = section.find(['h2', 'h3']).text.strip() if section.find(['h2', 'h3']) else None
-                         answer = section.find(['p']).text.strip() if section.find(['p']) else None
-
-                         if question and answer:
-                             faqs.append({"question": question, "answer": answer})
-
-                     return {"url": url, "title": faq_title, "faqs": faqs}
-         except Exception as e:
-             logger.error(f"Error fetching FAQ content from {url}: {e}")
-             return None
+
+                     # Add current page content
+                     page_content = await self.parse_faq_content(soup, url)
+                     if page_content:
+                         pages.append(page_content)
+
+                     # Find and follow FAQ links
+                     tasks = []
+                     for link in soup.find_all('a', href=True):
+                         href = link['href']
+                         full_url = urljoin(url, href)
+
+                         if (full_url.startswith(self.base_url) and
+                                 full_url not in self.visited_urls):
+                             tasks.append(self.crawl_faq_pages(full_url, session))
+
+                     if tasks:
+                         results = await asyncio.gather(*tasks)
+                         for result in results:
+                             pages.extend(result)
+
+         except Exception as e:
+             logger.error(f"Error crawling FAQ page {url}: {e}")
+
+         return pages
+
+     async def parse_faq_content(self, soup: BeautifulSoup, url: str) -> Optional[Dict[str, Any]]:
+         try:
+             faqs = []
+             faq_items = soup.find_all('div', class_='faq-item')
+
+             for item in faq_items:
+                 # Extract question
+                 question_elem = item.find('a', class_='headline-collapse')
+                 if not question_elem:
+                     continue
+
+                 question = question_elem.find('span')
+                 if not question:
+                     continue
+
+                 question_text = question.text.strip()
+
+                 # Extract answer
+                 content_elem = item.find('div', class_='content-collapse')
+                 if not content_elem:
+                     continue
+
+                 wysiwyg = content_elem.find('div', class_='wysiwyg-content')
+                 if not wysiwyg:
+                     continue
+
+                 # Extract all text while preserving structure
+                 answer_parts = []
+                 for elem in wysiwyg.find_all(['p', 'li']):
+                     text = elem.get_text(strip=True)
+                     if text:
+                         answer_parts.append(text)
+
+                 answer_text = ' '.join(answer_parts)
+
+                 if question_text and answer_text:
+                     faqs.append({
+                         "question": question_text,
+                         "answer": answer_text
+                     })
+
+             if faqs:
+                 return {
+                     "url": url,
+                     "faqs": faqs
+                 }
+
+         except Exception as e:
+             logger.error(f"Error parsing FAQ content from {url}: {e}")
+
+         return None

      async def index_faqs(self):
          faq_pages = await self.fetch_faq_pages()
-         faq_pages = [page for page in faq_pages if page]

          self.faq_data = []
          all_texts = []

          for faq_page in faq_pages:
              for item in faq_page['faqs']:
+                 # Combine question and answer for better semantic search
                  combined_text = f"{item['question']} {item['answer']}"
                  all_texts.append(combined_text)
                  self.faq_data.append({
@@ -68,7 +133,12 @@ class FAQService:
                      "answer": item['answer'],
                      "source": faq_page['url']
                  })

+         if not all_texts:
+             logger.warning("No FAQ content found to index")
+             return
+
+         # Create embeddings and index them
          embeddings = self.embedder.encode(all_texts, convert_to_tensor=True).cpu().detach().numpy()
          dimension = embeddings.shape[1]
          self.faiss_index = faiss.IndexFlatL2(dimension)
@@ -77,15 +147,19 @@ class FAQService:
      async def search_faqs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
          if not self.faiss_index:
              await self.index_faqs()

+         if not self.faq_data:
+             logger.warning("No FAQ data available for search")
+             return []
+
          query_embedding = self.embedder.encode([query], convert_to_tensor=True).cpu().detach().numpy()
          distances, indices = self.faiss_index.search(query_embedding, top_k)

          results = []
          for i, idx in enumerate(indices[0]):
              if idx < len(self.faq_data):
                  result = self.faq_data[idx].copy()
                  result["score"] = float(distances[0][i])
                  results.append(result)

          return results
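
For readers who want to see what markup the new parse_faq_content expects, here is a minimal, self-contained sketch against a made-up HTML snippet shaped like the selectors used in the diff (div.faq-item, a.headline-collapse with a span, div.content-collapse, div.wysiwyg-content). The snippet and its text are illustrative only, not actual bofrost.de content.

# Illustrative only: a toy HTML fragment shaped like the markup parse_faq_content() expects.
from bs4 import BeautifulSoup

html = """
<div class="faq-item">
  <a class="headline-collapse"><span>How does delivery work?</span></a>
  <div class="content-collapse">
    <div class="wysiwyg-content">
      <p>A driver brings the order to your door.</p>
      <ul><li>No shipping fee</li><li>Flexible dates</li></ul>
    </div>
  </div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
item = soup.find("div", class_="faq-item")
# Question: the span inside the collapsible headline link
question = item.find("a", class_="headline-collapse").find("span").text.strip()
# Answer: all paragraph and list-item text joined into one string, as in the diff
answer = " ".join(
    elem.get_text(strip=True)
    for elem in item.find("div", class_="wysiwyg-content").find_all(["p", "li"])
)
print(question)
print(answer)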
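The indexing and search path in index_faqs/search_faqs boils down to encode, then IndexFlatL2, then search. Below is a minimal standalone sketch of that flow, assuming sentence-transformers as the embedder; the model name and sample texts are placeholders, not part of this commit.

# Minimal sketch (not part of the commit) of the encode -> IndexFlatL2 -> search flow.
import faiss
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model choice

faq_texts = [
    "How do I change my delivery date? You can change it in your customer account.",
    "Which payment methods are accepted? Invoice and direct debit are supported.",
]

# Build the index the way index_faqs() does: one vector per combined question+answer.
embeddings = embedder.encode(faq_texts, convert_to_numpy=True).astype("float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Search the way search_faqs() does; IndexFlatL2 returns squared L2 distances,
# so a smaller "score" means a closer match.
query = embedder.encode(["How can I pay?"], convert_to_numpy=True).astype("float32")
distances, indices = index.search(query, 2)
for dist, idx in zip(distances[0], indices[0]):
    print(round(float(dist), 3), faq_texts[idx])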
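Finally, a hypothetical driver showing how the updated service might be exercised end to end. It assumes the repo layout from this commit (services.faq_service, config.config.settings providing TIMEOUT) and that FAQService is constructed with an object exposing an .embedder attribute, as the diff implies; the stub model service and model name below are stand-ins, not confirmed by the commit itself.

# Hypothetical driver, assuming the repo layout shown in this commit.
import asyncio
from sentence_transformers import SentenceTransformer

from services.faq_service import FAQService  # path from this commit


class StubModelService:
    """Stand-in for the project's model service; FAQService only reads .embedder from it."""
    def __init__(self):
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model


async def main():
    service = FAQService(StubModelService())
    await service.index_faqs()  # crawls the FAQ pages and builds the FAISS index
    hits = await service.search_faqs("Wie kann ich bezahlen?", top_k=3)
    for hit in hits:
        # "score" is the squared L2 distance added in search_faqs(); lower is better
        print(f"{hit['score']:.3f}  {hit['answer'][:80]}  ({hit['source']})")


if __name__ == "__main__":
    asyncio.run(main())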