Abhaykoul commited on
Commit
95fe448
1 Parent(s): 4b0e179

Update webscout.py

Browse files
Files changed (1) hide show
  1. webscout.py +572 -9
webscout.py CHANGED
@@ -20,14 +20,80 @@ try:
20
  except ImportError:
21
  LXML_AVAILABLE = False
22
 
23
- from .utils import (
24
- _calculate_distance,
25
- _extract_vqd,
26
- _normalize,
27
- _normalize_url,
28
- _text_extract_json,
29
- json_loads,
30
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  logger = logging.getLogger("webscout.WEBS")
33
 
@@ -1078,4 +1144,501 @@ class WEBS:
1078
  except Exception as e:
1079
  raise e
1080
 
1081
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  except ImportError:
21
  LXML_AVAILABLE = False
22
 
23
+ import re
24
+ from decimal import Decimal
25
+ from html import unescape
26
+ from math import atan2, cos, radians, sin, sqrt
27
+ from typing import Any, Dict, List, Union
28
+ from urllib.parse import unquote
29
+ import orjson
30
+
31
+ from .exceptions import WebscoutE
32
+
33
+ REGEX_STRIP_TAGS = re.compile("<.*?>")
34
+
35
+
36
def json_dumps(obj: Any) -> str:
    """Serialize *obj* to a JSON string via orjson.

    :raises WebscoutE: wrapping any exception raised during serialization.
    """
    try:
        serialized = orjson.dumps(obj)
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
    # orjson.dumps always produces valid UTF-8 bytes.
    return serialized.decode("utf-8")
41
+
42
+
43
def json_loads(obj: Union[str, bytes]) -> Any:
    """Parse a JSON payload (str or bytes) via orjson.

    :raises WebscoutE: wrapping any exception raised during parsing.
    """
    try:
        parsed = orjson.loads(obj)
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
    return parsed
48
+
49
+
50
+ def _extract_vqd(html_bytes: bytes, keywords: str) -> str:
51
+ """Extract vqd from html bytes."""
52
+ for c1, c1_len, c2 in (
53
+ (b'vqd="', 5, b'"'),
54
+ (b"vqd=", 4, b"&"),
55
+ (b"vqd='", 5, b"'"),
56
+ ):
57
+ try:
58
+ start = html_bytes.index(c1) + c1_len
59
+ end = html_bytes.index(c2, start)
60
+ return html_bytes[start:end].decode()
61
+ except ValueError:
62
+ pass
63
+ raise WebscoutE(f"_extract_vqd() {keywords=} Could not extract vqd.")
64
+
65
+
66
def _text_extract_json(html_bytes: bytes, keywords: str) -> List[Dict[str, str]]:
    """text(backend="api") -> extract the results JSON embedded in DDG HTML.

    The payload sits between ``DDG.pageLayout.load('d',`` and
    ``);DDG.duckbar.load(`` in the page source.

    :raises WebscoutE: wrapping any slicing/parsing failure.
    """
    try:
        start = html_bytes.index(b"DDG.pageLayout.load('d',") + 24
        end = html_bytes.index(b");DDG.duckbar.load(", start)
        data = html_bytes[start:end]
        result: List[Dict[str, str]] = json_loads(data)
        return result
    except Exception as ex:
        raise WebscoutE(f"_text_extract_json() {keywords=} {type(ex).__name__}: {ex}") from ex
    # NOTE: the original had an unreachable `raise` after this try/except
    # (every path already returns or raises); removed as dead code.
77
+
78
+
79
def _normalize(raw_html: str) -> str:
    """Strip HTML tags from *raw_html* and decode HTML entities.

    Falsy input (empty string / None) yields an empty string.
    """
    if not raw_html:
        return ""
    return unescape(REGEX_STRIP_TAGS.sub("", raw_html))
82
+
83
+
84
+ def _normalize_url(url: str) -> str:
85
+ """Unquote URL and replace spaces with '+'."""
86
+ return unquote(url.replace(" ", "+")) if url else ""
87
+
88
+
89
+ def _calculate_distance(lat1: Decimal, lon1: Decimal, lat2: Decimal, lon2: Decimal) -> float:
90
+ """Calculate distance between two points in km. Haversine formula."""
91
+ R = 6371.0087714 # Earth's radius in km
92
+ rlat1, rlon1, rlat2, rlon2 = map(radians, [float(lat1), float(lon1), float(lat2), float(lon2)])
93
+ dlon, dlat = rlon2 - rlon1, rlat2 - rlat1
94
+ a = sin(dlat / 2) ** 2 + cos(rlat1) * cos(rlat2) * sin(dlon / 2) ** 2
95
+ c = 2 * atan2(sqrt(a), sqrt(1 - a))
96
+ return R * c
97
 
98
  logger = logging.getLogger("webscout.WEBS")
99
 
 
1144
  except Exception as e:
1145
  raise e
1146
 
1147
+ return results
1148
+ import requests
1149
+ import http.cookiejar as cookiejar
1150
+ import sys
1151
+ import json
1152
+ from xml.etree import ElementTree
1153
+ import re
1154
+ from requests import HTTPError
1155
+ import html.parser
1156
+
1157
+ html_parser = html.parser.HTMLParser()
1158
+ import html
1159
+
1160
def unescape(string):
    """Decode HTML entities in *string* (thin wrapper over ``html.unescape``)."""
    return html.unescape(string)
1162
+ WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
1163
+
1164
class TranscriptRetrievalError(Exception):
    """
    Base class for exceptions raised when a transcript cannot be retrieved.

    Subclasses customize the message by overriding CAUSE_MESSAGE (and,
    where extra state is needed, the ``cause`` property).
    """
    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
    CAUSE_MESSAGE = ''
    GITHUB_REFERRAL = (
        '\n\nIf you are sure that the described cause is not responsible for this error '
        'and that a transcript should be retrievable, please create an issue at '
        'https://github.com/OE-LUCIFER/Webscout/issues. '
        'Please add which version of webscout you are using '
        'and provide the information needed to replicate the error. '
    )

    def __init__(self, video_id):
        self.video_id = video_id
        super().__init__(self._build_error_message())

    def _build_error_message(self):
        # Start with the generic message, then append the specific cause
        # (plus the issue-tracker referral) when one is defined.
        message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
        reason = self.cause
        if reason:
            message += self.CAUSE_MESSAGE_INTRO.format(cause=reason) + self.GITHUB_REFERRAL
        return message

    @property
    def cause(self):
        return self.CAUSE_MESSAGE
1195
+
1196
class YouTubeRequestFailedError(TranscriptRetrievalError):
    """Raised when the HTTP request to YouTube fails."""

    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'

    def __init__(self, video_id, http_error):
        self.reason = str(http_error)
        super().__init__(video_id)

    @property
    def cause(self):
        return self.CAUSE_MESSAGE.format(reason=self.reason)
1206
+
1207
class VideoUnavailableError(TranscriptRetrievalError):
    """Raised when the requested video no longer exists on YouTube."""

    CAUSE_MESSAGE = 'The video is no longer available'
1209
+
1210
class InvalidVideoIdError(TranscriptRetrievalError):
    """Raised when a full URL was passed where a bare video id is expected."""

    CAUSE_MESSAGE = (
        'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n'
        'Do NOT run: `YouTubeTranscriptApi.get_transcript("https://www.youtube.com/watch?v=1234")`\n'
        'Instead run: `YouTubeTranscriptApi.get_transcript("1234")`'
    )
1216
+
1217
class TooManyRequestsError(TranscriptRetrievalError):
    """Raised when YouTube captcha-gates this IP for sending too many requests."""

    CAUSE_MESSAGE = (
        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
        'One of the following things can be done to work around this:\n\
        - Manually solve the captcha in a browser and export the cookie. '
        'Read here how to use that cookie with '
        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
        - Use a different IP address\n\
        - Wait until the ban on your IP has been lifted'
    )
1227
+
1228
class TranscriptsDisabledError(TranscriptRetrievalError):
    """Raised when subtitles have been disabled for the video."""

    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
1230
+
1231
class NoTranscriptAvailableError(TranscriptRetrievalError):
    """Raised when the video has no transcripts at all."""

    CAUSE_MESSAGE = 'No transcripts are available for this video'
1233
+
1234
class NotTranslatableError(TranscriptRetrievalError):
    """Raised when translation is not offered for the requested transcript."""

    CAUSE_MESSAGE = 'The requested language is not translatable'
1236
+
1237
class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
    """Raised when the requested translation target language is not offered."""

    CAUSE_MESSAGE = 'The requested translation language is not available'
1239
+
1240
class CookiePathInvalidError(TranscriptRetrievalError):
    """Raised when the given cookie file cannot be loaded."""

    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
1242
+
1243
class CookiesInvalidError(TranscriptRetrievalError):
    """Raised when the loaded cookies are empty or expired."""

    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
1245
+
1246
class FailedToCreateConsentCookieError(TranscriptRetrievalError):
    """Raised when the EU consent cookie could not be set automatically."""

    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
1248
+
1249
class NoTranscriptFoundError(TranscriptRetrievalError):
    """Raised when none of the requested language codes have a transcript."""

    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
        '{transcript_data}'
    )

    def __init__(self, video_id, requested_language_codes, transcript_data):
        self._requested_language_codes = requested_language_codes
        self._transcript_data = transcript_data
        super().__init__(video_id)

    @property
    def cause(self):
        return self.CAUSE_MESSAGE.format(
            requested_language_codes=self._requested_language_codes,
            transcript_data=str(self._transcript_data),
        )
1266
+
1267
+
1268
+
1269
+ def _raise_http_errors(response, video_id):
1270
+ try:
1271
+ response.raise_for_status()
1272
+ return response
1273
+ except HTTPError as error:
1274
+ raise YouTubeRequestFailedError(error, video_id)
1275
+
1276
+
1277
class TranscriptListFetcher(object):
    """Fetches the list of available transcripts for a YouTube video.

    Downloads the watch page via the injected HTTP client, works around the
    EU cookie-consent interstitial, and extracts the captions JSON embedded
    in the page's static HTML.
    """

    def __init__(self, http_client):
        # http_client is expected to behave like a requests.Session
        self._http_client = http_client

    def fetch(self, video_id):
        """Return a TranscriptList built from the video's watch page."""
        return TranscriptList.build(
            self._http_client,
            video_id,
            self._extract_captions_json(self._fetch_video_html(video_id), video_id),
        )

    def _extract_captions_json(self, html, video_id):
        """Extract the captions JSON blob from the watch-page HTML.

        Raises a specific TranscriptRetrievalError subclass describing the
        most likely reason when no captions data is present.
        """
        splitted_html = html.split('"captions":')

        if len(splitted_html) <= 1:
            # No "captions" key at all -- diagnose why.
            if video_id.startswith('http://') or video_id.startswith('https://'):
                # Caller passed a full URL instead of a bare video id.
                raise InvalidVideoIdError(video_id)
            if 'class="g-recaptcha"' in html:
                # YouTube is captcha-gating this IP.
                raise TooManyRequestsError(video_id)
            if '"playabilityStatus":' not in html:
                raise VideoUnavailableError(video_id)

            raise TranscriptsDisabledError(video_id)

        captions_json = json.loads(
            # The JSON object ends right before the ,"videoDetails key.
            splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
        ).get('playerCaptionsTracklistRenderer')
        if captions_json is None:
            raise TranscriptsDisabledError(video_id)

        if 'captionTracks' not in captions_json:
            raise TranscriptsDisabledError(video_id)

        return captions_json

    def _create_consent_cookie(self, html, video_id):
        """Set the CONSENT cookie required to bypass the EU consent page."""
        match = re.search('name="v" value="(.*?)"', html)
        if match is None:
            raise FailedToCreateConsentCookieError(video_id)
        self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')

    def _fetch_video_html(self, video_id):
        """Fetch the watch-page HTML, retrying once after granting cookie consent."""
        html = self._fetch_html(video_id)
        if 'action="https://consent.youtube.com/s"' in html:
            self._create_consent_cookie(html, video_id)
            html = self._fetch_html(video_id)
            if 'action="https://consent.youtube.com/s"' in html:
                # Consent cookie did not take effect -- give up.
                raise FailedToCreateConsentCookieError(video_id)
        return html

    def _fetch_html(self, video_id):
        """GET the watch page and return its HTML-unescaped body text."""
        response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'})
        return unescape(_raise_http_errors(response, video_id).text)
1330
+
1331
+
1332
class TranscriptList(object):
    """
    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        The constructor is only for internal use. Use the static build method instead.

        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
        :type manually_created_transcripts: dict[str, Transcript]
        :param generated_transcripts: dict mapping language codes to the generated transcripts
        :type generated_transcripts: dict[str, Transcript]
        :param translation_languages: list of languages which can be used for translatable languages
        :type translation_languages: list[dict[str, str]]
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype TranscriptList:
        """
        translation_languages = [
            {
                'language': translation_language['languageName']['simpleText'],
                'language_code': translation_language['languageCode'],
            } for translation_language in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            # 'asr' (automatic speech recognition) marks auto-generated tracks.
            if caption.get('kind', '') == 'asr':
                transcript_dict = generated_transcripts
            else:
                transcript_dict = manually_created_transcripts

            transcript_dict[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                caption.get('kind', '') == 'asr',
                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        """Iterate over all transcripts, manually created ones first."""
        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))

    def find_transcript(self, language_codes):
        """
        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
        are found, generated transcripts are used. If you only want automatically generated transcripts use
        `find_generated_transcript` instead.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        """Return the first transcript matching *language_codes*, searching
        *transcript_dicts* in order; raise NoTranscriptFoundError otherwise."""
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFoundError(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        """Human-readable summary of all available transcript languages."""
        return (
            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            '{available_manually_created_transcript_languages}\n\n'
            '(GENERATED)\n'
            '{available_generated_transcripts}\n\n'
            '(TRANSLATION LANGUAGES)\n'
            '{available_translation_languages}'
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
                str(transcript) for transcript in self._manually_created_transcripts.values()
            ),
            available_generated_transcripts=self._get_language_description(
                str(transcript) for transcript in self._generated_transcripts.values()
            ),
            available_translation_languages=self._get_language_description(
                '{language_code} ("{language}")'.format(
                    language=translation_language['language'],
                    language_code=translation_language['language_code'],
                ) for translation_language in self._translation_languages
            )
        )

    def _get_language_description(self, transcript_strings):
        """Render one bullet line per transcript, or 'None' when empty."""
        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
        return description if description else 'None'
1490
+
1491
+
1492
class Transcript(object):
    """A single transcript track (one language) of a YouTube video."""

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """
        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
        TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this transcript belongs to
        :type video_id: str
        :param url: the url which needs to be called to fetch the transcript
        :param language: the name of the language this transcript uses
        :param language_code: the language code of this transcript (e.g. 'en')
        :param is_generated: whether this track was auto-generated (ASR)
        :param translation_languages: languages this transcript can be translated to
        """
        self._http_client = http_client
        self.video_id = video_id
        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages
        # language_code -> language name, for O(1) validation in translate()
        self._translation_languages_dict = {
            translation_language['language_code']: translation_language['language']
            for translation_language in translation_languages
        }

    def fetch(self, preserve_formatting=False):
        """
        Loads the actual transcript data.
        :param preserve_formatting: whether to keep select HTML text formatting
        :type preserve_formatting: bool
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
        return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
            _raise_http_errors(response, self.video_id).text,
        )

    def __str__(self):
        """e.g. 'en ("English")[TRANSLATABLE]'."""
        return '{language_code} ("{language}"){translation_description}'.format(
            language=self.language,
            language_code=self.language_code,
            translation_description='[TRANSLATABLE]' if self.is_translatable else ''
        )

    @property
    def is_translatable(self):
        # True when YouTube offers machine translation for this track.
        return len(self.translation_languages) > 0

    def translate(self, language_code):
        """Return a new Transcript pointing at the machine-translated track.

        :raises NotTranslatableError: if this track cannot be translated
        :raises TranslationLanguageNotAvailableError: if *language_code* is not offered
        """
        if not self.is_translatable:
            raise NotTranslatableError(self.video_id)

        if language_code not in self._translation_languages_dict:
            raise TranslationLanguageNotAvailableError(self.video_id)

        return Transcript(
            self._http_client,
            self.video_id,
            '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
            self._translation_languages_dict[language_code],
            language_code,
            True,
            [],
        )
1560
+
1561
+
1562
+ class _TranscriptParser(object):
1563
+ _FORMATTING_TAGS = [
1564
+ 'strong', # important
1565
+ 'em', # emphasized
1566
+ 'b', # bold
1567
+ 'i', # italic
1568
+ 'mark', # marked
1569
+ 'small', # smaller
1570
+ 'del', # deleted
1571
+ 'ins', # inserted
1572
+ 'sub', # subscript
1573
+ 'sup', # superscript
1574
+ ]
1575
+
1576
+ def __init__(self, preserve_formatting=False):
1577
+ self._html_regex = self._get_html_regex(preserve_formatting)
1578
+
1579
+ def _get_html_regex(self, preserve_formatting):
1580
+ if preserve_formatting:
1581
+ formats_regex = '|'.join(self._FORMATTING_TAGS)
1582
+ formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
1583
+ html_regex = re.compile(formats_regex, re.IGNORECASE)
1584
+ else:
1585
+ html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
1586
+ return html_regex
1587
+
1588
+ def parse(self, plain_data):
1589
+ return [
1590
+ {
1591
+ 'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
1592
+ 'start': float(xml_element.attrib['start']),
1593
+ 'duration': float(xml_element.attrib.get('dur', '0.0')),
1594
+ }
1595
+ for xml_element in ElementTree.fromstring(plain_data)
1596
+ if xml_element.text is not None
1597
+ ]
1598
+
1599
+ WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
1600
+
1601
class transcriber(object):
    """Classmethod-only facade for fetching YouTube transcripts."""

    @classmethod
    def list_transcripts(cls, video_id, proxies=None, cookies=None):
        """Return the TranscriptList for *video_id*.

        :param video_id: bare YouTube video id (NOT a full URL)
        :param proxies: optional requests-style proxies dict
        :param cookies: optional path to a Mozilla-format cookie file
        """
        with requests.Session() as http_client:
            if cookies:
                http_client.cookies = cls._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            return TranscriptListFetcher(http_client).fetch(video_id)

    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
                        cookies=None, preserve_formatting=False):
        """Fetch transcripts for several videos.

        :return: (dict mapping video_id -> transcript data, list of video ids
            that could not be retrieved)
        """
        assert isinstance(video_ids, list), "`video_ids` must be a list of strings"

        data = {}
        unretrievable_videos = []

        for video_id in video_ids:
            try:
                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
            except Exception as exception:
                if not continue_after_error:
                    raise exception

                unretrievable_videos.append(video_id)

        return data, unretrievable_videos

    @classmethod
    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
        """Fetch the transcript for a single video in the first matching language."""
        assert isinstance(video_id, str), "`video_id` must be a string"
        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)

    @classmethod
    def _load_cookies(cls, cookies, video_id):
        """Load a Mozilla-format cookie file, validating it is non-empty.

        Bug fix: the original wrapped everything, including the
        `raise CookiesInvalidError`, in a bare `except:` clause, so an empty
        cookie jar was mis-reported as CookiePathInvalidError (and real
        errors were swallowed indiscriminately). The except is now scoped to
        the file load only.

        :raises CookiePathInvalidError: if the file cannot be loaded
        :raises CookiesInvalidError: if the file loads but contains no cookies
        """
        try:
            cookie_jar = cookiejar.MozillaCookieJar()
            cookie_jar.load(cookies)
        except Exception:
            raise CookiePathInvalidError(video_id)
        if not cookie_jar:
            raise CookiesInvalidError(video_id)
        return cookie_jar