Chrunos committed
Commit 792fd2e · verified · 1 Parent(s): 21b9e88

Update app.py

Files changed (1)
  1. app.py +144 -126

app.py CHANGED
@@ -2,19 +2,25 @@ import os
import re
import time
import asyncio
+from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Dict, Any
from urllib.parse import urlparse
-from fastapi import FastAPI, HTTPException, Query, Request
+from fastapi import FastAPI, HTTPException, Query, Request, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel
-from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import uvicorn

app = FastAPI(
-    title="Threads Media Extractor API - Playwright",
-    description="Fast extraction of media URLs from Threads posts using Playwright",
-    version="3.0.0"
+    title="Threads Media Extractor API",
+    description="Extract media URLs from Threads posts - Optimized version",
+    version="2.1.0"
)

# Add CORS middleware
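A note on the imports: uvicorn is imported, but no server entry point appears in any hunk of this diff, so the runner presumably sits unchanged further down app.py. A typical one, sketched here with assumed host and port values, would be:

    # Hypothetical __main__ runner; host "0.0.0.0" and port 8000 are assumptions.
    if __name__ == "__main__":
        uvicorn.run(app, host="0.0.0.0", port=8000)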
@@ -26,8 +32,9 @@ app.add_middleware(
    allow_headers=["*"],
)

-# Global browser instance for reuse
-browser: Optional[Browser] = None
+# Global driver pool for reuse
+driver_pool = []
+executor = ThreadPoolExecutor(max_workers=2)

class ThreadsResponse(BaseModel):
    url: str
@@ -42,32 +49,68 @@ class ErrorResponse(BaseModel):
    error: str
    success: bool = False

-async def get_browser() -> Browser:
-    """Get or create browser instance"""
-    global browser
-    if browser is None:
-        playwright = await async_playwright().start()
-        browser = await playwright.chromium.launch(
-            headless=True,
-            args=[
-                '--no-sandbox',
-                '--disable-dev-shm-usage',
-                '--disable-gpu',
-                '--disable-extensions',
-                '--disable-default-apps',
-                '--disable-background-timer-throttling',
-                '--disable-backgrounding-occluded-windows',
-                '--disable-renderer-backgrounding',
-                '--disable-features=TranslateUI',
-                '--memory-pressure-off',
-                '--window-size=1280,720',
-                '--disable-background-networking',
-                '--disable-sync',
-                '--disable-plugins',
-                '--disable-images',  # Speed up by not loading images initially
-            ]
-        )
-    return browser
+def create_optimized_driver():
+    """Create and configure optimized Chrome WebDriver"""
+    options = Options()
+    options.add_argument('--headless=new')  # Use new headless mode
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+    options.add_argument('--disable-gpu')
+    options.add_argument('--disable-extensions')
+    options.add_argument('--disable-plugins')
+    options.add_argument('--disable-default-apps')
+    options.add_argument('--disable-background-timer-throttling')
+    options.add_argument('--disable-backgrounding-occluded-windows')
+    options.add_argument('--disable-renderer-backgrounding')
+    options.add_argument('--disable-features=TranslateUI')
+    options.add_argument('--disable-ipc-flooding-protection')
+
+    # Performance optimizations
+    options.add_argument('--memory-pressure-off')
+    options.add_argument('--max_old_space_size=4096')
+    options.add_argument('--window-size=1280,720')  # Smaller window
+
+    # Network optimizations
+    options.add_argument('--aggressive-cache-discard')
+    options.add_argument('--disable-background-networking')
+
+    # Disable unnecessary features
+    options.add_experimental_option('useAutomationExtension', False)
+    options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    options.add_argument('--disable-blink-features=AutomationControlled')
+
+    # User agent
+    options.add_argument('--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
+
+    try:
+        driver = webdriver.Chrome(options=options)
+        driver.implicitly_wait(5)  # Reduced wait time
+        driver.set_page_load_timeout(15)  # Reduced timeout
+
+        # Optimize browser settings
+        driver.execute_cdp_cmd('Network.setUserAgentOverride', {
+            "userAgent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        })
+
+        return driver
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to create browser driver: {str(e)}")
+
+def get_driver():
+    """Get driver from pool or create new one"""
+    if driver_pool:
+        return driver_pool.pop()
+    return create_optimized_driver()
+
+def return_driver(driver):
+    """Return driver to pool for reuse"""
+    if len(driver_pool) < 2:  # Keep max 2 drivers in pool
+        driver_pool.append(driver)
+    else:
+        try:
+            driver.quit()
+        except:
+            pass

def extract_post_id_from_url(url: str) -> Optional[str]:
    """Extract post ID from Threads URL"""
@@ -96,8 +139,8 @@ def is_valid_threads_url(url: str) -> bool:
    except:
        return False

-async def extract_media_playwright(page: Page, url: str) -> Dict[str, Any]:
-    """Extract media URLs using Playwright"""
+def fast_extract_media(driver: webdriver.Chrome, url: str) -> Dict[str, Any]:
+    """Optimized media extraction with faster loading"""
    media_urls = []
    post_text = None
    author = None
@@ -105,74 +148,54 @@ async def extract_media_playwright(page: Page, url: str) -> Dict[str, Any]:
    try:
        start_time = time.time()

-        # Navigate to the URL with optimized loading
-        await page.goto(url, wait_until='domcontentloaded', timeout=15000)
+        # Navigate to the URL
+        driver.get(url)

-        # Wait a bit for dynamic content but not too long
-        await asyncio.sleep(2)
+        # Wait for essential elements only
+        try:
+            WebDriverWait(driver, 8).until(
+                lambda d: d.execute_script("return document.readyState") == "complete"
+            )
+        except TimeoutException:
+            pass  # Continue even if timeout
+
+        # Quick wait for dynamic content
+        time.sleep(1.5)  # Reduced from 3 seconds

-        # Extract videos first (most important for Threads)
-        video_elements = await page.query_selector_all('video')
+        # Extract videos first (most important)
+        video_elements = driver.find_elements(By.TAG_NAME, 'video')
        for video in video_elements:
-            src = await video.get_attribute('src')
+            src = video.get_attribute('src')
            if src and src.startswith('http'):
                media_urls.append(src)

-            # Check source elements within video
-            sources = await video.query_selector_all('source')
+            # Check source elements
+            sources = video.find_elements(By.TAG_NAME, 'source')
            for source in sources:
-                src = await source.get_attribute('src')
+                src = source.get_attribute('src')
                if src and src.startswith('http'):
                    media_urls.append(src)

-        # If no videos, look for images
+        # If no videos found, look for images quickly
        if not media_urls:
-            img_elements = await page.query_selector_all('img')
-            for img in img_elements[:10]:  # Limit to first 10
-                src = await img.get_attribute('src')
+            img_elements = driver.find_elements(By.TAG_NAME, 'img')[:10]  # Limit to first 10 images
+            for img in img_elements:
+                src = img.get_attribute('src')
                if src and src.startswith('http') and any(ext in src.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp']):
-                    # Filter out small images, profiles, etc.
                    if not any(exclude in src.lower() for exclude in ['profile', 'avatar', 'icon', 'logo']):
                        media_urls.append(src)

-        # Try to extract post text (quick attempt)
+        # Quick text extraction (optional, skip if taking too long)
        try:
-            text_selectors = [
-                'div[role="article"] span',
-                'article span',
-                '[data-testid="post-text"]'
-            ]
-
-            for selector in text_selectors:
-                elements = await page.query_selector_all(selector)
-                for element in elements[:3]:  # Check only first 3
-                    text = await element.inner_text()
-                    if text and len(text.strip()) > 10:
-                        post_text = text.strip()
-                        break
-                if post_text:
+            text_elements = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"] span, article span')[:5]
+            for element in text_elements:
+                text = element.text.strip()
+                if text and len(text) > 10 and not post_text:
+                    post_text = text
                    break
        except:
            pass

-        # Try to extract author (quick attempt)
-        try:
-            author_selectors = [
-                'a[role="link"] span',
-                'header a span',
-                '[data-testid="user-name"]'
-            ]
-
-            for selector in author_selectors:
-                element = await page.query_selector(selector)
-                if element:
-                    author_text = await element.inner_text()
-                    if author_text and not author_text.startswith('@'):
-                        author = author_text.strip()
-                        break
-        except:
-            pass
-
        # Remove duplicates
        seen = set()
        unique_media_urls = []
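The hunk above ends just as the dedup begins, so the loop body is not visible in this diff. An order-preserving dedup over media_urls conventionally reads like the following sketch; the exact lines in app.py may differ:

    # Keep the first occurrence of each URL, preserving discovery order.
    for u in media_urls:
        if u not in seen:
            seen.add(u)
            unique_media_urls.append(u)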
@@ -193,26 +216,25 @@ async def extract_media_playwright(page: Page, url: str) -> Dict[str, Any]:
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error extracting media: {str(e)}")

-@app.on_event("startup")
-async def startup_event():
-    """Initialize browser on startup"""
-    await get_browser()
-
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Clean up browser on shutdown"""
-    global browser
-    if browser:
-        await browser.close()
+def extract_media_sync(url: str) -> Dict[str, Any]:
+    """Synchronous wrapper for thread execution"""
+    driver = None
+    try:
+        driver = get_driver()
+        result = fast_extract_media(driver, url)
+        return result
+    finally:
+        if driver:
+            return_driver(driver)

@app.get("/", response_model=Dict[str, str])
async def root():
    """Root endpoint with API information"""
    return {
-        "message": "Threads Media Extractor API v3.0 - Playwright",
-        "description": "Fast extraction of media URLs from Threads posts using Playwright",
-        "version": "3.0.0",
-        "engine": "Playwright (faster than Selenium)",
+        "message": "Threads Media Extractor API v2.1 - Optimized",
+        "description": "Fast extraction of media URLs from Threads posts",
+        "version": "2.1.0",
+        "optimization": "Driver pooling, reduced timeouts, focused extraction",
        "endpoints": {
            "extract": "/extract?url=<threads_url>",
            "health": "/health"
@@ -222,19 +244,17 @@ async def root():
@app.get("/health")
async def health_check():
    """Health check endpoint"""
-    global browser
    return {
        "status": "healthy",
        "service": "threads-media-extractor",
-        "version": "3.0.0",
-        "engine": "playwright",
-        "browser_ready": browser is not None
+        "version": "2.1.0",
+        "driver_pool_size": len(driver_pool)
    }

@app.get("/extract", response_model=ThreadsResponse)
async def extract_media(url: str = Query(..., description="Threads post URL")):
    """
-    Extract media URLs from a Threads post using Playwright
+    Extract media URLs from a Threads post - Optimized version

    Args:
        url: The Threads post URL to extract media from
@@ -256,37 +276,35 @@ async def extract_media(url: str = Query(..., description="Threads post URL")):
        raise HTTPException(status_code=400, detail="Could not extract post ID from URL")

    try:
-        browser = await get_browser()
+        # Run extraction in thread pool for better async handling
+        loop = asyncio.get_event_loop()
+        extracted_data = await loop.run_in_executor(executor, extract_media_sync, url)

-        # Create a new context for each request (isolation)
-        context = await browser.new_context(
-            user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-            viewport={'width': 1280, 'height': 720}
+        return ThreadsResponse(
+            url=url,
+            media_urls=extracted_data["media_urls"],
+            media_count=len(extracted_data["media_urls"]),
+            post_text=extracted_data["post_text"],
+            author=extracted_data["author"],
+            success=True,
+            processing_time=extracted_data.get("processing_time")
        )

-        page = await context.new_page()
-
-        try:
-            # Extract media URLs and metadata
-            extracted_data = await extract_media_playwright(page, url)
-
-            return ThreadsResponse(
-                url=url,
-                media_urls=extracted_data["media_urls"],
-                media_count=len(extracted_data["media_urls"]),
-                post_text=extracted_data["post_text"],
-                author=extracted_data["author"],
-                success=True,
-                processing_time=extracted_data.get("processing_time")
-            )
-        finally:
-            await context.close()
-
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

+@app.on_event("shutdown")
+async def shutdown_event():
+    """Clean up resources on shutdown"""
+    executor.shutdown(wait=False)
+    for driver in driver_pool:
+        try:
+            driver.quit()
+        except:
+            pass
+
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Custom HTTP exception handler"""
 