Update webscout.py
Browse files- webscout.py +189 -139
webscout.py
CHANGED
@@ -1176,145 +1176,229 @@ class WEBS:
|
|
1176 |
return results
|
1177 |
import requests
|
1178 |
import http.cookiejar as cookiejar
|
1179 |
-
import sys
|
1180 |
import json
|
1181 |
from xml.etree import ElementTree
|
1182 |
import re
|
1183 |
-
from requests import HTTPError
|
1184 |
import html.parser
|
|
|
1185 |
|
1186 |
html_parser = html.parser.HTMLParser()
|
1187 |
-
|
1188 |
|
1189 |
def unescape(string):
|
1190 |
return html.unescape(string)
|
1191 |
-
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
1192 |
|
1193 |
-
class TranscriptRetrievalError(Exception):
|
1194 |
-
"""
|
1195 |
-
Base class for exceptions raised when a transcript cannot be retrieved.
|
1196 |
-
"""
|
1197 |
-
ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
|
1198 |
-
CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
|
1199 |
-
CAUSE_MESSAGE = ''
|
1200 |
-
GITHUB_REFERRAL = (
|
1201 |
-
'\n\nIf you are sure that the described cause is not responsible for this error '
|
1202 |
-
'and that a transcript should be retrievable, please create an issue at '
|
1203 |
-
'https://github.com/OE-LUCIFER/Webscout/issues. '
|
1204 |
-
'Please add which version of webscout you are using '
|
1205 |
-
'and provide the information needed to replicate the error. '
|
1206 |
-
)
|
1207 |
|
1208 |
-
|
1209 |
-
self.video_id = video_id
|
1210 |
-
super(TranscriptRetrievalError, self).__init__(self._build_error_message())
|
1211 |
|
1212 |
-
def _build_error_message(self):
|
1213 |
-
cause = self.cause
|
1214 |
-
error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
|
1215 |
|
1216 |
-
|
1217 |
-
|
1218 |
|
1219 |
-
|
|
|
|
|
1220 |
|
1221 |
-
@property
|
1222 |
-
def cause(self):
|
1223 |
-
return self.CAUSE_MESSAGE
|
1224 |
|
1225 |
class YouTubeRequestFailedError(TranscriptRetrievalError):
|
1226 |
-
|
1227 |
|
1228 |
def __init__(self, video_id, http_error):
|
1229 |
-
|
1230 |
-
super(
|
1231 |
|
1232 |
-
@property
|
1233 |
-
def cause(self):
|
1234 |
-
return self.CAUSE_MESSAGE.format(reason=self.reason)
|
1235 |
|
1236 |
class VideoUnavailableError(TranscriptRetrievalError):
|
1237 |
-
|
|
|
|
|
|
|
|
|
|
|
1238 |
|
1239 |
class InvalidVideoIdError(TranscriptRetrievalError):
|
1240 |
-
|
1241 |
-
|
1242 |
-
|
1243 |
-
|
1244 |
-
|
|
|
|
|
|
|
|
|
|
|
1245 |
|
1246 |
class TooManyRequestsError(TranscriptRetrievalError):
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
-
|
1252 |
-
|
1253 |
-
|
1254 |
-
|
1255 |
-
|
|
|
|
|
|
|
1256 |
|
1257 |
class TranscriptsDisabledError(TranscriptRetrievalError):
|
1258 |
-
|
|
|
|
|
|
|
|
|
|
|
1259 |
|
1260 |
class NoTranscriptAvailableError(TranscriptRetrievalError):
|
1261 |
-
|
|
|
|
|
|
|
|
|
|
|
1262 |
|
1263 |
class NotTranslatableError(TranscriptRetrievalError):
|
1264 |
-
|
|
|
|
|
|
|
|
|
|
|
1265 |
|
1266 |
class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
|
1267 |
-
|
|
|
|
|
|
|
|
|
|
|
1268 |
|
1269 |
class CookiePathInvalidError(TranscriptRetrievalError):
|
1270 |
-
|
|
|
|
|
|
|
|
|
|
|
1271 |
|
1272 |
class CookiesInvalidError(TranscriptRetrievalError):
|
1273 |
-
|
|
|
|
|
|
|
|
|
|
|
1274 |
|
1275 |
class FailedToCreateConsentCookieError(TranscriptRetrievalError):
|
1276 |
-
|
|
|
|
|
|
|
|
|
|
|
1277 |
|
1278 |
class NoTranscriptFoundError(TranscriptRetrievalError):
|
1279 |
-
|
1280 |
-
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
|
1281 |
-
'{transcript_data}'
|
1282 |
-
)
|
1283 |
|
1284 |
def __init__(self, video_id, requested_language_codes, transcript_data):
|
1285 |
-
|
1286 |
-
|
1287 |
-
|
1288 |
-
|
1289 |
-
@property
|
1290 |
-
def cause(self):
|
1291 |
-
return self.CAUSE_MESSAGE.format(
|
1292 |
-
requested_language_codes=self._requested_language_codes,
|
1293 |
-
transcript_data=str(self._transcript_data),
|
1294 |
)
|
|
|
|
|
|
|
|
|
1295 |
|
1296 |
|
|
|
|
|
|
|
|
|
1297 |
|
1298 |
-
|
1299 |
-
|
1300 |
-
|
1301 |
-
|
1302 |
-
|
1303 |
-
|
|
|
1304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1305 |
|
1306 |
-
|
1307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1308 |
self._http_client = http_client
|
1309 |
|
1310 |
-
def fetch(self, video_id):
|
|
|
1311 |
return TranscriptList.build(
|
1312 |
self._http_client,
|
1313 |
video_id,
|
1314 |
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
|
1315 |
)
|
1316 |
|
1317 |
-
def _extract_captions_json(self, html, video_id):
|
|
|
1318 |
splitted_html = html.split('"captions":')
|
1319 |
|
1320 |
if len(splitted_html) <= 1:
|
@@ -1358,11 +1442,8 @@ class TranscriptListFetcher(object):
|
|
1358 |
return unescape(_raise_http_errors(response, video_id).text)
|
1359 |
|
1360 |
|
1361 |
-
class TranscriptList
|
1362 |
-
"""
|
1363 |
-
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
1364 |
-
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
1365 |
-
"""
|
1366 |
|
1367 |
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
1368 |
"""
|
@@ -1434,18 +1515,18 @@ class TranscriptList(object):
|
|
1434 |
|
1435 |
def find_transcript(self, language_codes):
|
1436 |
"""
|
1437 |
-
Finds a transcript for a given language code.
|
1438 |
-
|
1439 |
-
`find_manually_created_transcript` instead.
|
1440 |
|
1441 |
-
:param language_codes: A list of language codes in a descending priority.
|
1442 |
-
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
1443 |
-
it fails to do so.
|
1444 |
:type languages: list[str]
|
1445 |
:return: the found Transcript
|
1446 |
:rtype Transcript:
|
1447 |
:raises: NoTranscriptFound
|
1448 |
"""
|
|
|
|
|
|
|
1449 |
return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
|
1450 |
|
1451 |
def find_generated_transcript(self, language_codes):
|
@@ -1460,6 +1541,10 @@ class TranscriptList(object):
|
|
1460 |
:rtype Transcript:
|
1461 |
:raises: NoTranscriptFound
|
1462 |
"""
|
|
|
|
|
|
|
|
|
1463 |
return self._find_transcript(language_codes, [self._generated_transcripts])
|
1464 |
|
1465 |
def find_manually_created_transcript(self, language_codes):
|
@@ -1518,7 +1603,9 @@ class TranscriptList(object):
|
|
1518 |
return description if description else 'None'
|
1519 |
|
1520 |
|
1521 |
-
class Transcript
|
|
|
|
|
1522 |
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
1523 |
"""
|
1524 |
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
@@ -1555,7 +1642,7 @@ class Transcript(object):
|
|
1555 |
:rtype [{'text': str, 'start': float, 'end': float}]:
|
1556 |
"""
|
1557 |
response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
|
1558 |
-
return
|
1559 |
_raise_http_errors(response, self.video_id).text,
|
1560 |
)
|
1561 |
|
@@ -1588,7 +1675,8 @@ class Transcript(object):
|
|
1588 |
)
|
1589 |
|
1590 |
|
1591 |
-
class
|
|
|
1592 |
_FORMATTING_TAGS = [
|
1593 |
'strong', # important
|
1594 |
'em', # emphasized
|
@@ -1625,52 +1713,14 @@ class _TranscriptParser(object):
|
|
1625 |
if xml_element.text is not None
|
1626 |
]
|
1627 |
|
1628 |
-
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
1629 |
-
|
1630 |
-
class transcriber(object):
|
1631 |
-
@classmethod
|
1632 |
-
def list_transcripts(cls, video_id, proxies=None, cookies=None):
|
1633 |
-
with requests.Session() as http_client:
|
1634 |
-
if cookies:
|
1635 |
-
http_client.cookies = cls._load_cookies(cookies, video_id)
|
1636 |
-
http_client.proxies = proxies if proxies else {}
|
1637 |
-
return TranscriptListFetcher(http_client).fetch(video_id)
|
1638 |
-
|
1639 |
-
@classmethod
|
1640 |
-
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
|
1641 |
-
cookies=None, preserve_formatting=False):
|
1642 |
-
|
1643 |
-
assert isinstance(video_ids, list), "`video_ids` must be a list of strings"
|
1644 |
-
|
1645 |
-
data = {}
|
1646 |
-
unretrievable_videos = []
|
1647 |
-
|
1648 |
-
for video_id in video_ids:
|
1649 |
-
try:
|
1650 |
-
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
|
1651 |
-
except Exception as exception:
|
1652 |
-
if not continue_after_error:
|
1653 |
-
raise exception
|
1654 |
-
|
1655 |
-
unretrievable_videos.append(video_id)
|
1656 |
|
1657 |
-
|
1658 |
-
|
1659 |
-
|
1660 |
-
|
1661 |
-
|
1662 |
-
|
1663 |
|
1664 |
-
@classmethod
|
1665 |
-
def _load_cookies(cls, cookies, video_id):
|
1666 |
-
try:
|
1667 |
-
cookie_jar = cookiejar.MozillaCookieJar()
|
1668 |
-
cookie_jar.load(cookies)
|
1669 |
-
if not cookie_jar:
|
1670 |
-
raise CookiesInvalidError(video_id)
|
1671 |
-
return cookie_jar
|
1672 |
-
except:
|
1673 |
-
raise CookiePathInvalidError(video_id)
|
1674 |
|
1675 |
class LLM:
|
1676 |
def __init__(self, model: str, system_message: str = "You are a Helpful AI."):
|
|
|
1176 |
return results
|
1177 |
import requests
|
1178 |
import http.cookiejar as cookiejar
|
|
|
1179 |
import json
|
1180 |
from xml.etree import ElementTree
|
1181 |
import re
|
|
|
1182 |
import html.parser
|
1183 |
+
from typing import List, Dict, Union, Optional
|
1184 |
|
1185 |
html_parser = html.parser.HTMLParser()
|
1186 |
+
|
1187 |
|
1188 |
def unescape(string):
    """Return ``string`` with HTML character references decoded.

    Thin wrapper around :func:`html.unescape`.
    """
    decoded = html.unescape(string)
    return decoded
|
|
|
1190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1191 |
|
1192 |
+
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'


class TranscriptRetrievalError(Exception):
    """Base class for transcript retrieval errors.

    Args:
        video_id: Identifier of the video the failure relates to. It is
            stored on the instance as ``video_id``.
        message: Human-readable description of the failure. Every literal
            occurrence of ``{video_url}`` is replaced with the watch URL
            built from ``video_id``; all other text is kept verbatim.
    """

    def __init__(self, video_id, message):
        video_url = WATCH_URL.format(video_id=video_id)
        # Subclasses pass text that already has runtime data interpolated
        # (HTTP error strings, transcript listings). Running str.format() on
        # it again would raise on any stray '{' or '}', so only the one
        # supported placeholder is substituted.
        super().__init__(message.replace('{video_url}', video_url))
        self.video_id = video_id
|
1201 |
|
|
|
|
|
|
|
1202 |
|
1203 |
class YouTubeRequestFailedError(TranscriptRetrievalError):
    """Error for a request to YouTube that failed.

    The message embeds ``str(http_error)`` as the failure reason.
    """

    def __init__(self, video_id, http_error):
        reason = str(http_error)
        super().__init__(
            video_id,
            'Request to YouTube failed: {reason}'.format(reason=reason),
        )
|
1209 |
|
|
|
|
|
|
|
1210 |
|
1211 |
class VideoUnavailableError(TranscriptRetrievalError):
    """Error for a video that is unavailable."""

    def __init__(self, video_id):
        super().__init__(video_id, 'The video is no longer available')
|
1217 |
+
|
1218 |
|
1219 |
class InvalidVideoIdError(TranscriptRetrievalError):
    """Error for an invalid video ID."""

    def __init__(self, video_id):
        # Adjacent literals are concatenated into one multi-line hint.
        super().__init__(
            video_id,
            'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n'
            'Do NOT run: `YTTranscriber.get_transcript("https://www.youtube.com/watch?v=1234")`\n'
            'Instead run: `YTTranscriber.get_transcript("1234")`',
        )
|
1229 |
+
|
1230 |
|
1231 |
class TooManyRequestsError(TranscriptRetrievalError):
    """Raised when YouTube rate limits the requests.

    The message lists the possible workarounds, one bullet per line.
    """

    def __init__(self, video_id):
        # Each bullet is its own literal ending in an explicit '\n'. The
        # previous backslash-newline continuations inside the literals ran
        # the first two bullets together and would pull source indentation
        # into the message.
        message = (
            'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
            'One of the following things can be done to work around this:\n'
            '- Manually solve the captcha in a browser and export the cookie.\n'
            '- Use a different IP address\n'
            '- Wait until the ban on your IP has been lifted'
        )
        super().__init__(video_id, message)
|
1243 |
+
|
1244 |
|
1245 |
class TranscriptsDisabledError(TranscriptRetrievalError):
    """Error for a video whose transcripts are disabled."""

    def __init__(self, video_id):
        super().__init__(video_id, 'Subtitles are disabled for this video')
|
1251 |
+
|
1252 |
|
1253 |
class NoTranscriptAvailableError(TranscriptRetrievalError):
    """Error for a video that has no transcripts available."""

    def __init__(self, video_id):
        super().__init__(video_id, 'No transcripts are available for this video')
|
1259 |
+
|
1260 |
|
1261 |
class NotTranslatableError(TranscriptRetrievalError):
    """Error for a transcript that is not translatable."""

    def __init__(self, video_id):
        super().__init__(video_id, 'The requested language is not translatable')
|
1267 |
+
|
1268 |
|
1269 |
class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
    """Error for a requested translation language that is not available."""

    def __init__(self, video_id):
        super().__init__(
            video_id,
            'The requested translation language is not available',
        )
|
1275 |
+
|
1276 |
|
1277 |
class CookiePathInvalidError(TranscriptRetrievalError):
    """Error for a cookie path that is invalid."""

    def __init__(self, video_id):
        super().__init__(
            video_id,
            'The provided cookie file was unable to be loaded',
        )
|
1283 |
+
|
1284 |
|
1285 |
class CookiesInvalidError(TranscriptRetrievalError):
    """Error for provided cookies that are invalid."""

    def __init__(self, video_id):
        super().__init__(
            video_id,
            'The cookies provided are not valid (may have expired)',
        )
|
1291 |
+
|
1292 |
|
1293 |
class FailedToCreateConsentCookieError(TranscriptRetrievalError):
    """Error for a failed consent cookie creation."""

    def __init__(self, video_id):
        super().__init__(
            video_id,
            'Failed to automatically give consent to saving cookies',
        )
|
1299 |
+
|
1300 |
|
1301 |
class NoTranscriptFoundError(TranscriptRetrievalError):
    """Error for language codes that matched no transcript.

    The message names the requested language codes, followed by
    ``str(transcript_data)``.
    """

    def __init__(self, video_id, requested_language_codes, transcript_data):
        template = (
            'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
            '{transcript_data}'
        )
        rendered = template.format(
            requested_language_codes=requested_language_codes,
            transcript_data=str(transcript_data),
        )
        super().__init__(video_id, rendered)
|
1313 |
|
1314 |
|
1315 |
+
class YTTranscriber:
    """Main class for retrieving YouTube transcripts."""

    # A bare video id: only the URL-safe base64 alphabet, no scheme, host or path.
    _BARE_VIDEO_ID = re.compile(r'[A-Za-z0-9_-]+')
    # URL shapes a video id is pulled out of: watch?v= (with v= anywhere in
    # the query string), youtu.be short links, and /embed/, /shorts/, /live/ paths.
    _URL_VIDEO_ID = re.compile(
        r'(?:youtube\.com/watch\?(?:[^#]*?&)?v=|youtu\.be/|youtube\.com/(?:embed|shorts|live)/)'
        r'([A-Za-z0-9_-]+)'
    )

    @staticmethod
    def get_transcript(video_url: str, languages: Union[str, List[str], None] = 'en',
                       proxies: Optional[Dict[str, str]] = None,
                       cookies: Optional[str] = None,
                       preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
        """
        Retrieves the transcript for a given YouTube video.

        Args:
            video_url (str): YouTube video URL (watch, youtu.be, embed, shorts
                or live form) or a bare video id.
            languages (str | list[str], optional): A single language code, or
                several codes in descending priority. If None, fetches the
                auto-generated transcript. Defaults to 'en'.
            proxies (Dict[str, str], optional): Proxies to use for the request. Defaults to None.
            cookies (str, optional): Path to the cookie file. Defaults to None.
            preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.

        Returns:
            List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
                - 'text': The transcribed text.
                - 'start': The start time of the text segment (in seconds).
                - 'duration': The duration of the text segment (in seconds).

        Raises:
            TranscriptRetrievalError: If there's an error retrieving the transcript.
        """
        video_id = YTTranscriber._extract_video_id(video_url)

        # The transcript is fetched inside the ``with`` block because the
        # Transcript object keeps using this session for its own request.
        with requests.Session() as http_client:
            if cookies:
                http_client.cookies = YTTranscriber._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            transcript_list = TranscriptListFetcher(http_client).fetch(video_id)

            if languages is None:  # Get auto-generated transcript
                transcript = transcript_list.find_generated_transcript(['any'])
            else:
                # Accept a single code as well as a list/tuple of codes;
                # wrapping a sequence in another list would never match.
                if isinstance(languages, str):
                    language_codes = [languages]
                else:
                    language_codes = list(languages)
                transcript = transcript_list.find_transcript(language_codes)
            return transcript.fetch(preserve_formatting=preserve_formatting)

    @staticmethod
    def _extract_video_id(video_url: str) -> str:
        """Extracts the video ID from a YouTube URL or returns a bare video ID.

        Raises:
            InvalidVideoIdError: If the input is neither a recognised URL
                shape nor a bare video id.
        """
        candidate = video_url.strip()
        url_match = YTTranscriber._URL_VIDEO_ID.search(candidate)
        if url_match:
            return url_match.group(1)
        # InvalidVideoIdError's own message tells users to pass the bare id,
        # so a bare id has to be accepted rather than rejected.
        if YTTranscriber._BARE_VIDEO_ID.fullmatch(candidate):
            return candidate
        raise InvalidVideoIdError(video_url)

    @staticmethod
    def _load_cookies(cookies: str, video_id: str) -> cookiejar.MozillaCookieJar:
        """Loads cookies from a Mozilla/Netscape-format cookie file.

        Raises:
            CookiePathInvalidError: If the file cannot be loaded.
            CookiesInvalidError: If the file loads but yields no cookies.
        """
        cookie_jar = cookiejar.MozillaCookieJar()
        try:
            cookie_jar.load(cookies)
        except Exception as error:
            raise CookiePathInvalidError(video_id) from error
        # Checked outside the ``try`` so this error is not swallowed and
        # re-labelled as a path problem.
        if not cookie_jar:
            raise CookiesInvalidError(video_id)
        return cookie_jar
|
1383 |
+
|
1384 |
+
|
1385 |
+
class TranscriptListFetcher:
|
1386 |
+
"""Fetches the list of transcripts for a YouTube video."""
|
1387 |
+
|
1388 |
+
def __init__(self, http_client: requests.Session):
|
1389 |
+
"""Initializes TranscriptListFetcher."""
|
1390 |
self._http_client = http_client
|
1391 |
|
1392 |
+
def fetch(self, video_id: str):
    """Build the TranscriptList for ``video_id``.

    Fetches the video's HTML, extracts the captions JSON from it and passes
    that to ``TranscriptList.build`` together with the HTTP client.
    """
    video_html = self._fetch_video_html(video_id)
    captions_json = self._extract_captions_json(video_html, video_id)
    return TranscriptList.build(self._http_client, video_id, captions_json)
|
1399 |
|
1400 |
+
def _extract_captions_json(self, html: str, video_id: str) -> dict:
|
1401 |
+
"""Extracts the captions JSON data from the video's HTML."""
|
1402 |
splitted_html = html.split('"captions":')
|
1403 |
|
1404 |
if len(splitted_html) <= 1:
|
|
|
1442 |
return unescape(_raise_http_errors(response, video_id).text)
|
1443 |
|
1444 |
|
1445 |
+
class TranscriptList:
|
1446 |
+
"""Represents a list of available transcripts."""
|
|
|
|
|
|
|
1447 |
|
1448 |
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
1449 |
"""
|
|
|
1515 |
|
1516 |
def find_transcript(self, language_codes):
    """
    Finds a transcript for the given language codes.

    If ``'any'`` is among ``language_codes``, the first transcript yielded by
    iterating this list is returned, whatever its language or origin. In all
    other cases (including when the list yields nothing) the lookup is
    delegated to ``_find_transcript``, which is given the manually created
    transcripts ahead of the generated ones.

    :param language_codes: A list of language codes in a descending priority.
    :type language_codes: list[str]
    :return: the found Transcript
    :rtype Transcript:
    :raises: NoTranscriptFound (presumably from ``_find_transcript`` -- confirm)
    """
    if 'any' in language_codes:
        nothing_found = object()
        first_transcript = next(iter(self), nothing_found)
        if first_transcript is not nothing_found:
            return first_transcript
    search_order = [self._manually_created_transcripts, self._generated_transcripts]
    return self._find_transcript(language_codes, search_order)
|
1531 |
|
1532 |
def find_generated_transcript(self, language_codes):
|
|
|
1541 |
:rtype Transcript:
|
1542 |
:raises: NoTranscriptFound
|
1543 |
"""
|
1544 |
+
if 'any' in language_codes:
|
1545 |
+
for transcript in self:
|
1546 |
+
if transcript.is_generated:
|
1547 |
+
return transcript
|
1548 |
return self._find_transcript(language_codes, [self._generated_transcripts])
|
1549 |
|
1550 |
def find_manually_created_transcript(self, language_codes):
|
|
|
1603 |
return description if description else 'None'
|
1604 |
|
1605 |
|
1606 |
+
class Transcript:
|
1607 |
+
"""Represents a single transcript."""
|
1608 |
+
|
1609 |
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
1610 |
"""
|
1611 |
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
|
|
1642 |
:rtype [{'text': str, 'start': float, 'end': float}]:
|
1643 |
"""
|
1644 |
response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
|
1645 |
+
return TranscriptParser(preserve_formatting=preserve_formatting).parse(
|
1646 |
_raise_http_errors(response, self.video_id).text,
|
1647 |
)
|
1648 |
|
|
|
1675 |
)
|
1676 |
|
1677 |
|
1678 |
+
class TranscriptParser:
|
1679 |
+
"""Parses the transcript data from XML."""
|
1680 |
_FORMATTING_TAGS = [
|
1681 |
'strong', # important
|
1682 |
'em', # emphasized
|
|
|
1713 |
if xml_element.text is not None
|
1714 |
]
|
1715 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1716 |
|
1717 |
+
def _raise_http_errors(response, video_id):
|
1718 |
+
try:
|
1719 |
+
response.raise_for_status()
|
1720 |
+
return response
|
1721 |
+
except requests.exceptions.HTTPError as error:
|
1722 |
+
raise YouTubeRequestFailedError(video_id, error)
|
1723 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1724 |
|
1725 |
class LLM:
|
1726 |
def __init__(self, model: str, system_message: str = "You are a Helpful AI."):
|