reach-vb HF staff committed on
Commit
2d15f4c
1 Parent(s): 908850a

6ffebc3d3f8eb79ecf86f0d3426ce9146bd32b3e8e6f4553883490b0176b1deb

Browse files
Files changed (50) hide show
  1. lib/python3.11/site-packages/certifi-2023.11.17.dist-info/INSTALLER +1 -0
  2. lib/python3.11/site-packages/certifi-2023.11.17.dist-info/LICENSE +20 -0
  3. lib/python3.11/site-packages/certifi-2023.11.17.dist-info/METADATA +66 -0
  4. lib/python3.11/site-packages/certifi-2023.11.17.dist-info/RECORD +14 -0
  5. lib/python3.11/site-packages/certifi-2023.11.17.dist-info/WHEEL +5 -0
  6. lib/python3.11/site-packages/certifi-2023.11.17.dist-info/top_level.txt +1 -0
  7. lib/python3.11/site-packages/certifi/__pycache__/core.cpython-311.pyc +0 -0
  8. lib/python3.11/site-packages/certifi/cacert.pem +0 -0
  9. lib/python3.11/site-packages/certifi/core.py +108 -0
  10. lib/python3.11/site-packages/certifi/py.typed +0 -0
  11. lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/INSTALLER +1 -0
  12. lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/LICENSE +21 -0
  13. lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/METADATA +683 -0
  14. lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/RECORD +35 -0
  15. lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/WHEEL +5 -0
  16. lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/entry_points.txt +2 -0
  17. lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/top_level.txt +1 -0
  18. lib/python3.11/site-packages/charset_normalizer/__init__.py +46 -0
  19. lib/python3.11/site-packages/charset_normalizer/__main__.py +4 -0
  20. lib/python3.11/site-packages/charset_normalizer/__pycache__/__init__.cpython-311.pyc +0 -0
  21. lib/python3.11/site-packages/charset_normalizer/__pycache__/__main__.cpython-311.pyc +0 -0
  22. lib/python3.11/site-packages/charset_normalizer/__pycache__/api.cpython-311.pyc +0 -0
  23. lib/python3.11/site-packages/charset_normalizer/__pycache__/cd.cpython-311.pyc +0 -0
  24. lib/python3.11/site-packages/charset_normalizer/__pycache__/constant.cpython-311.pyc +0 -0
  25. lib/python3.11/site-packages/charset_normalizer/__pycache__/legacy.cpython-311.pyc +0 -0
  26. lib/python3.11/site-packages/charset_normalizer/__pycache__/md.cpython-311.pyc +0 -0
  27. lib/python3.11/site-packages/charset_normalizer/__pycache__/models.cpython-311.pyc +0 -0
  28. lib/python3.11/site-packages/charset_normalizer/__pycache__/utils.cpython-311.pyc +0 -0
  29. lib/python3.11/site-packages/charset_normalizer/__pycache__/version.cpython-311.pyc +0 -0
  30. lib/python3.11/site-packages/charset_normalizer/api.py +626 -0
  31. lib/python3.11/site-packages/charset_normalizer/cd.py +395 -0
  32. lib/python3.11/site-packages/charset_normalizer/cli/__init__.py +6 -0
  33. lib/python3.11/site-packages/charset_normalizer/cli/__main__.py +296 -0
  34. lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc +0 -0
  35. lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc +0 -0
  36. lib/python3.11/site-packages/charset_normalizer/constant.py +1995 -0
  37. lib/python3.11/site-packages/charset_normalizer/legacy.py +54 -0
  38. lib/python3.11/site-packages/charset_normalizer/md.cpython-311-darwin.so +0 -0
  39. lib/python3.11/site-packages/charset_normalizer/md.py +615 -0
  40. lib/python3.11/site-packages/charset_normalizer/md__mypyc.cpython-311-darwin.so +0 -0
  41. lib/python3.11/site-packages/charset_normalizer/models.py +340 -0
  42. lib/python3.11/site-packages/charset_normalizer/py.typed +0 -0
  43. lib/python3.11/site-packages/charset_normalizer/utils.py +421 -0
  44. lib/python3.11/site-packages/charset_normalizer/version.py +6 -0
  45. lib/python3.11/site-packages/distutils-precedence.pth +3 -0
  46. lib/python3.11/site-packages/filelock/__init__.py +51 -0
  47. lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc +0 -0
  48. lib/python3.11/site-packages/filelock/__pycache__/_api.cpython-311.pyc +0 -0
  49. lib/python3.11/site-packages/filelock/__pycache__/_error.cpython-311.pyc +0 -0
  50. lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc +0 -0
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This package contains a modified version of ca-bundle.crt:
2
+
3
+ ca-bundle.crt -- Bundle of CA Root Certificates
4
+
5
+ This is a bundle of X.509 certificates of public Certificate Authorities
6
+ (CA). These were automatically extracted from Mozilla's root certificates
7
+ file (certdata.txt). This file can be found in the mozilla source tree:
8
+ https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
9
+ It contains the certificates in PEM format and therefore
10
+ can be directly used with curl / libcurl / php_curl, or with
11
+ an Apache+mod_ssl webserver for SSL client authentication.
12
+ Just configure this file as the SSLCACertificateFile.#
13
+
14
+ ***** BEGIN LICENSE BLOCK *****
15
+ This Source Code Form is subject to the terms of the Mozilla Public License,
16
+ v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
17
+ one at http://mozilla.org/MPL/2.0/.
18
+
19
+ ***** END LICENSE BLOCK *****
20
+ @(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/METADATA ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: certifi
3
+ Version: 2023.11.17
4
+ Summary: Python package for providing Mozilla's CA Bundle.
5
+ Home-page: https://github.com/certifi/python-certifi
6
+ Author: Kenneth Reitz
7
+ Author-email: [email protected]
8
+ License: MPL-2.0
9
+ Project-URL: Source, https://github.com/certifi/python-certifi
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
13
+ Classifier: Natural Language :: English
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.6
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Requires-Python: >=3.6
24
+ License-File: LICENSE
25
+
26
+ Certifi: Python SSL Certificates
27
+ ================================
28
+
29
+ Certifi provides Mozilla's carefully curated collection of Root Certificates for
30
+ validating the trustworthiness of SSL certificates while verifying the identity
31
+ of TLS hosts. It has been extracted from the `Requests`_ project.
32
+
33
+ Installation
34
+ ------------
35
+
36
+ ``certifi`` is available on PyPI. Simply install it with ``pip``::
37
+
38
+ $ pip install certifi
39
+
40
+ Usage
41
+ -----
42
+
43
+ To reference the installed certificate authority (CA) bundle, you can use the
44
+ built-in function::
45
+
46
+ >>> import certifi
47
+
48
+ >>> certifi.where()
49
+ '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
50
+
51
+ Or from the command line::
52
+
53
+ $ python -m certifi
54
+ /usr/local/lib/python3.7/site-packages/certifi/cacert.pem
55
+
56
+ Enjoy!
57
+
58
+ .. _`Requests`: https://requests.readthedocs.io/en/master/
59
+
60
+ Addition/Removal of Certificates
61
+ --------------------------------
62
+
63
+ Certifi does not support any addition/removal or other modification of the
64
+ CA trust store content. This project is intended to provide a reliable and
65
+ highly portable root of trust to python deployments. Look to upstream projects
66
+ for methods to use alternate trust.
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ certifi-2023.11.17.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ certifi-2023.11.17.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
3
+ certifi-2023.11.17.dist-info/METADATA,sha256=P7BMxvbKUZTP20mLy_wc2atkEPFNVqElEzV6Mhaj3Zc,2172
4
+ certifi-2023.11.17.dist-info/RECORD,,
5
+ certifi-2023.11.17.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
6
+ certifi-2023.11.17.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
7
+ certifi/__init__.py,sha256=oYZVbNEJ66LQQamFRyuICe6FoYDmkY4j4fKEyO9D96c,94
8
+ certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
9
+ certifi/__pycache__/__init__.cpython-311.pyc,,
10
+ certifi/__pycache__/__main__.cpython-311.pyc,,
11
+ certifi/__pycache__/core.cpython-311.pyc,,
12
+ certifi/cacert.pem,sha256=z503-oFAev4R3MDXD-YCVhQiqiNEcIwyTkUE24xsV0g,290282
13
+ certifi/core.py,sha256=lhewz0zFb2b4ULsQurElmloYwQoecjWzPqY67P8T7iM,4219
14
+ certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.41.3)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ certifi
lib/python3.11/site-packages/certifi/__pycache__/core.cpython-311.pyc ADDED
Binary file (3.37 kB). View file
 
lib/python3.11/site-packages/certifi/cacert.pem ADDED
The diff for this file is too large to render. See raw diff
 
lib/python3.11/site-packages/certifi/core.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ certifi.py
3
+ ~~~~~~~~~~
4
+
5
+ This module returns the installation location of cacert.pem or its contents.
6
+ """
7
+ import sys
8
+
9
+
10
+ if sys.version_info >= (3, 11):
11
+
12
+ from importlib.resources import as_file, files
13
+
14
+ _CACERT_CTX = None
15
+ _CACERT_PATH = None
16
+
17
+ def where() -> str:
18
+ # This is slightly terrible, but we want to delay extracting the file
19
+ # in cases where we're inside of a zipimport situation until someone
20
+ # actually calls where(), but we don't want to re-extract the file
21
+ # on every call of where(), so we'll do it once then store it in a
22
+ # global variable.
23
+ global _CACERT_CTX
24
+ global _CACERT_PATH
25
+ if _CACERT_PATH is None:
26
+ # This is slightly janky, the importlib.resources API wants you to
27
+ # manage the cleanup of this file, so it doesn't actually return a
28
+ # path, it returns a context manager that will give you the path
29
+ # when you enter it and will do any cleanup when you leave it. In
30
+ # the common case of not needing a temporary file, it will just
31
+ # return the file system location and the __exit__() is a no-op.
32
+ #
33
+ # We also have to hold onto the actual context manager, because
34
+ # it will do the cleanup whenever it gets garbage collected, so
35
+ # we will also store that at the global level as well.
36
+ _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
37
+ _CACERT_PATH = str(_CACERT_CTX.__enter__())
38
+
39
+ return _CACERT_PATH
40
+
41
+ def contents() -> str:
42
+ return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
43
+
44
+ elif sys.version_info >= (3, 7):
45
+
46
+ from importlib.resources import path as get_path, read_text
47
+
48
+ _CACERT_CTX = None
49
+ _CACERT_PATH = None
50
+
51
+ def where() -> str:
52
+ # This is slightly terrible, but we want to delay extracting the
53
+ # file in cases where we're inside of a zipimport situation until
54
+ # someone actually calls where(), but we don't want to re-extract
55
+ # the file on every call of where(), so we'll do it once then store
56
+ # it in a global variable.
57
+ global _CACERT_CTX
58
+ global _CACERT_PATH
59
+ if _CACERT_PATH is None:
60
+ # This is slightly janky, the importlib.resources API wants you
61
+ # to manage the cleanup of this file, so it doesn't actually
62
+ # return a path, it returns a context manager that will give
63
+ # you the path when you enter it and will do any cleanup when
64
+ # you leave it. In the common case of not needing a temporary
65
+ # file, it will just return the file system location and the
66
+ # __exit__() is a no-op.
67
+ #
68
+ # We also have to hold onto the actual context manager, because
69
+ # it will do the cleanup whenever it gets garbage collected, so
70
+ # we will also store that at the global level as well.
71
+ _CACERT_CTX = get_path("certifi", "cacert.pem")
72
+ _CACERT_PATH = str(_CACERT_CTX.__enter__())
73
+
74
+ return _CACERT_PATH
75
+
76
+ def contents() -> str:
77
+ return read_text("certifi", "cacert.pem", encoding="ascii")
78
+
79
+ else:
80
+ import os
81
+ import types
82
+ from typing import Union
83
+
84
+ Package = Union[types.ModuleType, str]
85
+ Resource = Union[str, "os.PathLike"]
86
+
87
+ # This fallback will work for Python versions prior to 3.7 that lack the
88
+ # importlib.resources module but relies on the existing `where` function
89
+ # so won't address issues with environments like PyOxidizer that don't set
90
+ # __file__ on modules.
91
+ def read_text(
92
+ package: Package,
93
+ resource: Resource,
94
+ encoding: str = 'utf-8',
95
+ errors: str = 'strict'
96
+ ) -> str:
97
+ with open(where(), encoding=encoding) as data:
98
+ return data.read()
99
+
100
+ # If we don't have importlib.resources, then we will just do the old logic
101
+ # of assuming we're on the filesystem and munge the path directly.
102
+ def where() -> str:
103
+ f = os.path.dirname(__file__)
104
+
105
+ return os.path.join(f, "cacert.pem")
106
+
107
+ def contents() -> str:
108
+ return read_text("certifi", "cacert.pem", encoding="ascii")
lib/python3.11/site-packages/certifi/py.typed ADDED
File without changes
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2019 TAHRI Ahmed R.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/METADATA ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: charset-normalizer
3
+ Version: 3.3.2
4
+ Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
5
+ Home-page: https://github.com/Ousret/charset_normalizer
6
+ Author: Ahmed TAHRI
7
+ Author-email: [email protected]
8
+ License: MIT
9
+ Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
10
+ Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
11
+ Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.7
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
26
+ Classifier: Topic :: Text Processing :: Linguistic
27
+ Classifier: Topic :: Utilities
28
+ Classifier: Typing :: Typed
29
+ Requires-Python: >=3.7.0
30
+ Description-Content-Type: text/markdown
31
+ License-File: LICENSE
32
+ Provides-Extra: unicode_backport
33
+
34
+ <h1 align="center">Charset Detection, for Everyone 👋</h1>
35
+
36
+ <p align="center">
37
+ <sup>The Real First Universal Charset Detector</sup><br>
38
+ <a href="https://pypi.org/project/charset-normalizer">
39
+ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
40
+ </a>
41
+ <a href="https://pepy.tech/project/charset-normalizer/">
42
+ <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
43
+ </a>
44
+ <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
45
+ <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
46
+ </a>
47
+ </p>
48
+ <p align="center">
49
+ <sup><i>Featured Packages</i></sup><br>
50
+ <a href="https://github.com/jawah/niquests">
51
+ <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan">
52
+ </a>
53
+ <a href="https://github.com/jawah/wassima">
54
+ <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
55
+ </a>
56
+ </p>
57
+ <p align="center">
58
+ <sup><i>In other language (unofficial port - by the community)</i></sup><br>
59
+ <a href="https://github.com/nickspring/charset-normalizer-rs">
60
+ <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
61
+ </a>
62
+ </p>
63
+
64
+ > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
65
+ > I'm trying to resolve the issue by taking a new approach.
66
+ > All IANA character set names for which the Python core library provides codecs are supported.
67
+
68
+ <p align="center">
69
+ >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
70
+ </p>
71
+
72
+ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
73
+
74
+ | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
75
+ |--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
76
+ | `Fast` | ❌ | ✅ | ✅ |
77
+ | `Universal**` | ❌ | ✅ | ❌ |
78
+ | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
79
+ | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
80
+ | `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
81
+ | `Native Python` | ✅ | ✅ | ❌ |
82
+ | `Detect spoken language` | ❌ | ✅ | N/A |
83
+ | `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
84
+ | `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
85
+ | `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
86
+
87
+ <p align="center">
88
+ <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
89
+ </p>
90
+
91
+ *\*\* : They are clearly using specific code for a specific encoding even if covering most of the used ones*<br>
92
+ Did you get here because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
93
+
94
+ ## ⚡ Performance
95
+
96
+ This package offers better performance than its counterpart Chardet. Here are some numbers.
97
+
98
+ | Package | Accuracy | Mean per file (ms) | File per sec (est) |
99
+ |-----------------------------------------------|:--------:|:------------------:|:------------------:|
100
+ | [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
101
+ | charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
102
+
103
+ | Package | 99th percentile | 95th percentile | 50th percentile |
104
+ |-----------------------------------------------|:---------------:|:---------------:|:---------------:|
105
+ | [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
106
+ | charset-normalizer | 100 ms | 50 ms | 5 ms |
107
+
108
+ Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
109
+
110
+ > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
111
+ > And yes, these results might change at any time. The dataset can be updated to include more files.
112
+ > The actual delays depend heavily on your CPU capabilities. The factors should remain the same.
113
+ > Keep in mind that the stats are generous and that Chardet's accuracy vs ours is measured using Chardet's initial capability
114
+ > (eg. Supported Encoding) Challenge-them if you want.
115
+
116
+ ## ✨ Installation
117
+
118
+ Using pip:
119
+
120
+ ```sh
121
+ pip install charset-normalizer -U
122
+ ```
123
+
124
+ ## 🚀 Basic Usage
125
+
126
+ ### CLI
127
+ This package comes with a CLI.
128
+
129
+ ```
130
+ usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
131
+ file [file ...]
132
+
133
+ The Real First Universal Charset Detector. Discover originating encoding used
134
+ on text file. Normalize text to unicode.
135
+
136
+ positional arguments:
137
+ files File(s) to be analysed
138
+
139
+ optional arguments:
140
+ -h, --help show this help message and exit
141
+ -v, --verbose Display complementary information about file if any.
142
+ Stdout will contain logs about the detection process.
143
+ -a, --with-alternative
144
+ Output complementary possibilities if any. Top-level
145
+ JSON WILL be a list.
146
+ -n, --normalize Permit to normalize input file. If not set, program
147
+ does not write anything.
148
+ -m, --minimal Only output the charset detected to STDOUT. Disabling
149
+ JSON output.
150
+ -r, --replace Replace file when trying to normalize it instead of
151
+ creating a new one.
152
+ -f, --force Replace file without asking if you are sure, use this
153
+ flag with caution.
154
+ -t THRESHOLD, --threshold THRESHOLD
155
+ Define a custom maximum amount of chaos allowed in
156
+ decoded content. 0. <= chaos <= 1.
157
+ --version Show version information and exit.
158
+ ```
159
+
160
+ ```bash
161
+ normalizer ./data/sample.1.fr.srt
162
+ ```
163
+
164
+ or
165
+
166
+ ```bash
167
+ python -m charset_normalizer ./data/sample.1.fr.srt
168
+ ```
169
+
170
+ 🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in JSON format.
171
+
172
+ ```json
173
+ {
174
+ "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
175
+ "encoding": "cp1252",
176
+ "encoding_aliases": [
177
+ "1252",
178
+ "windows_1252"
179
+ ],
180
+ "alternative_encodings": [
181
+ "cp1254",
182
+ "cp1256",
183
+ "cp1258",
184
+ "iso8859_14",
185
+ "iso8859_15",
186
+ "iso8859_16",
187
+ "iso8859_3",
188
+ "iso8859_9",
189
+ "latin_1",
190
+ "mbcs"
191
+ ],
192
+ "language": "French",
193
+ "alphabets": [
194
+ "Basic Latin",
195
+ "Latin-1 Supplement"
196
+ ],
197
+ "has_sig_or_bom": false,
198
+ "chaos": 0.149,
199
+ "coherence": 97.152,
200
+ "unicode_path": null,
201
+ "is_preferred": true
202
+ }
203
+ ```
204
+
205
+ ### Python
206
+ *Just print out normalized text*
207
+ ```python
208
+ from charset_normalizer import from_path
209
+
210
+ results = from_path('./my_subtitle.srt')
211
+
212
+ print(str(results.best()))
213
+ ```
214
+
215
+ *Upgrade your code without effort*
216
+ ```python
217
+ from charset_normalizer import detect
218
+ ```
219
+
220
+ The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
221
+
222
+ See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
223
+
224
+ ## 😇 Why
225
+
226
+ When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
227
+ reliable alternative using a completely different method. Also! I never back down on a good challenge!
228
+
229
+ I **don't care** about the **originating charset** encoding, because **two different tables** can
230
+ produce **two identical rendered string.**
231
+ What I want is to get readable text, the best I can.
232
+
233
+ In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
234
+
235
+ Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
236
+
237
+ ## 🍰 How
238
+
239
+ - Discard all charset encoding table that could not fit the binary content.
240
+ - Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
241
+ - Extract matches with the lowest mess detected.
242
+ - Additionally, we measure coherence / probe for a language.
243
+
244
+ **Wait a minute**, what is noise/mess and coherence according to **YOU ?**
245
+
246
+ *Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
247
+ **I established** some ground rules about **what is obvious** when **it seems like** a mess.
248
+ I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
249
+ improve or rewrite it.
250
+
251
+ *Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
252
+ that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
253
+
254
+ ## ⚡ Known limitations
255
+
256
+ - Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
257
+ - Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
258
+
259
+ ## ⚠️ About Python EOLs
260
+
261
+ **If you are running:**
262
+
263
+ - Python >=2.7,<3.5: Unsupported
264
+ - Python 3.5: charset-normalizer < 2.1
265
+ - Python 3.6: charset-normalizer < 3.1
266
+ - Python 3.7: charset-normalizer < 4.0
267
+
268
+ Upgrade your Python interpreter as soon as possible.
269
+
270
+ ## 👤 Contributing
271
+
272
+ Contributions, issues and feature requests are very much welcome.<br />
273
+ Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
274
+
275
+ ## 📝 License
276
+
277
+ Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
278
+ This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
279
+
280
+ Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
281
+
282
+ ## 💼 For Enterprise
283
+
284
+ Professional support for charset-normalizer is available as part of the [Tidelift
285
+ Subscription][1]. Tidelift gives software development teams a single source for
286
+ purchasing and maintaining their software, with professional grade assurances
287
+ from the experts who know it best, while seamlessly integrating with existing
288
+ tools.
289
+
290
+ [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
291
+
292
+ # Changelog
293
+ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
294
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
295
+
296
+ ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
297
+
298
+ ### Fixed
299
+ - Unintentional memory usage regression when using large payloads that match several encodings (#376)
300
+ - Regression on some detection case showcased in the documentation (#371)
301
+
302
+ ### Added
303
+ - Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
304
+
305
+ ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
306
+
307
+ ### Changed
308
+ - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
309
+ - Improved the general detection reliability based on reports from the community
310
+
311
+ ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
312
+
313
+ ### Added
314
+ - Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
315
+ - Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
316
+
317
+ ### Removed
318
+ - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
319
+ - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
320
+
321
+ ### Changed
322
+ - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
323
+ - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
324
+
325
+ ### Fixed
326
+ - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
327
+
328
+ ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
329
+
330
+ ### Changed
331
+ - Typehint for function `from_path` no longer enforces `PathLike` as its first argument
332
+ - Minor improvement over the global detection reliability
333
+
334
+ ### Added
335
+ - Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
336
+ - Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
337
+ - Explicit support for Python 3.12
338
+
339
+ ### Fixed
340
+ - Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
341
+
342
+ ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
343
+
344
+ ### Added
345
+ - Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
346
+
347
+ ### Removed
348
+ - Support for Python 3.6 (PR #260)
349
+
350
+ ### Changed
351
+ - Optional speedup provided by mypy/c 1.0.1
352
+
353
+ ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
354
+
355
+ ### Fixed
356
+ - Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
357
+
358
+ ### Changed
359
+ - Speedup provided by mypy/c 0.990 on Python >= 3.7
360
+
361
+ ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
362
+
363
+ ### Added
364
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
365
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
366
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
367
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
368
+
369
+ ### Changed
370
+ - Build with static metadata using 'build' frontend
371
+ - Make the language detection stricter
372
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
373
+
374
+ ### Fixed
375
+ - CLI with opt --normalize fail when using full path for files
376
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
377
+ - Sphinx warnings when generating the documentation
378
+
379
+ ### Removed
380
+ - Coherence detector no longer return 'Simple English' instead return 'English'
381
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
382
+ - Breaking: Method `first()` and `best()` from CharsetMatch
383
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
384
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
385
+ - Breaking: Top-level function `normalize`
386
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
387
+ - Support for the backport `unicodedata2`
388
+
389
+ ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
390
+
391
+ ### Added
392
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
393
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
394
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
395
+
396
+ ### Changed
397
+ - Build with static metadata using 'build' frontend
398
+ - Make the language detection stricter
399
+
400
+ ### Fixed
401
+ - CLI with opt --normalize fail when using full path for files
402
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
403
+
404
+ ### Removed
405
+ - Coherence detector no longer return 'Simple English' instead return 'English'
406
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
407
+
408
+ ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
409
+
410
+ ### Added
411
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
412
+
413
+ ### Removed
414
+ - Breaking: Method `first()` and `best()` from CharsetMatch
415
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
416
+
417
+ ### Fixed
418
+ - Sphinx warnings when generating the documentation
419
+
420
+ ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
421
+
422
+ ### Changed
423
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
424
+
425
+ ### Removed
426
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
427
+ - Breaking: Top-level function `normalize`
428
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
429
+ - Support for the backport `unicodedata2`
430
+
431
+ ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
432
+
433
+ ### Deprecated
434
+ - Function `normalize` scheduled for removal in 3.0
435
+
436
+ ### Changed
437
+ - Removed useless call to decode in fn is_unprintable (#206)
438
+
439
+ ### Fixed
440
+ - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
441
+
442
+ ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
443
+
444
+ ### Added
445
+ - Output the Unicode table version when running the CLI with `--version` (PR #194)
446
+
447
+ ### Changed
448
+ - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
449
+ - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
450
+
451
+ ### Fixed
452
+ - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
453
+ - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
454
+
455
+ ### Removed
456
+ - Support for Python 3.5 (PR #192)
457
+
458
+ ### Deprecated
459
+ - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
460
+
461
+ ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
462
+
463
+ ### Fixed
464
+ - ASCII miss-detection on rare cases (PR #170)
465
+
466
+ ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
467
+
468
+ ### Added
469
+ - Explicit support for Python 3.11 (PR #164)
470
+
471
+ ### Changed
472
+ - The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
473
+
474
+ ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
475
+
476
+ ### Fixed
477
+ - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
478
+
479
+ ### Changed
480
+ - Skipping the language-detection (CD) on ASCII (PR #155)
481
+
482
+ ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
483
+
484
+ ### Changed
485
+ - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
486
+
487
+ ### Fixed
488
+ - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
489
+
490
+ ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
491
+ ### Changed
492
+ - Improvement over Vietnamese detection (PR #126)
493
+ - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
494
+ - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
495
+ - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
496
+ - Code style as refactored by Sourcery-AI (PR #131)
497
+ - Minor adjustment on the MD around european words (PR #133)
498
+ - Remove and replace SRTs from assets / tests (PR #139)
499
+ - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
500
+ - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
501
+
502
+ ### Fixed
503
+ - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
504
+ - Avoid using too insignificant chunk (PR #137)
505
+
506
+ ### Added
507
+ - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
508
+ - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
509
+
510
+ ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
511
+ ### Added
512
+ - Add support for Kazakh (Cyrillic) language detection (PR #109)
513
+
514
+ ### Changed
515
+ - Further, improve inferring the language from a given single-byte code page (PR #112)
516
+ - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
517
+ - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
518
+ - Various detection improvement (MD+CD) (PR #117)
519
+
520
+ ### Removed
521
+ - Remove redundant logging entry about detected language(s) (PR #115)
522
+
523
+ ### Fixed
524
+ - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
525
+
526
+ ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
527
+ ### Fixed
528
+ - Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
529
+ - Fix CLI crash when using --minimal output in certain cases (PR #103)
530
+
531
+ ### Changed
532
+ - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
533
+
534
+ ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
535
+ ### Changed
536
+ - The project now complies with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
537
+ - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
538
+ - The Unicode detection is slightly improved (PR #93)
539
+ - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
540
+
541
+ ### Removed
542
+ - The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
543
+
544
+ ### Fixed
545
+ - In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
546
+ - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
547
+ - The MANIFEST.in was not exhaustive (PR #78)
548
+
549
+ ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
550
+ ### Fixed
551
+ - The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
552
+ - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
553
+ - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
554
+ - Submatch factoring could be wrong in rare edge cases (PR #72)
555
+ - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
556
+ - Fix line endings from CRLF to LF for certain project files (PR #67)
557
+
558
+ ### Changed
559
+ - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
560
+ - Allow fallback on specified encoding if any (PR #71)
561
+
562
+ ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
563
+ ### Changed
564
+ - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
565
+ - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
566
+
567
+ ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
568
+ ### Fixed
569
+ - Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
570
+
571
+ ### Changed
572
+ - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
573
+
574
+ ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
575
+ ### Fixed
576
+ - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
577
+ - Using explain=False permanently disabled the verbose output in the current runtime (PR #47)
578
+ - One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
579
+ - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
580
+
581
+ ### Changed
582
+ - Public function normalize default args values were not aligned with from_bytes (PR #53)
583
+
584
+ ### Added
585
+ - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
586
+
587
+ ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
588
+ ### Changed
589
+ - 4 to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
590
+ - Emphasis has been put on UTF-8 detection; it should be nearly instantaneous.
591
+ - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
592
+ - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
593
+ - The program has been rewritten to ease the readability and maintainability. (+Using static typing)
594
+ - utf_7 detection has been reinstated.
595
+
596
+ ### Removed
597
+ - This package no longer require anything when used with Python 3.5 (Dropped cached_property)
598
+ - Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
599
+ - The exception hook on UnicodeDecodeError has been removed.
600
+
601
+ ### Deprecated
602
+ - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
603
+
604
+ ### Fixed
605
+ - The CLI output used the relative path of the file(s). Should be absolute.
606
+
607
+ ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
608
+ ### Fixed
609
+ - Logger configuration/usage no longer conflict with others (PR #44)
610
+
611
+ ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
612
+ ### Removed
613
+ - Using standard logging instead of using the package loguru.
614
+ - Dropping nose test framework in favor of the maintained pytest.
615
+ - Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
616
+ - Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
617
+ - Stop support for UTF-7 that does not contain a SIG.
618
+ - Dropping PrettyTable, replaced with pure JSON output in CLI.
619
+
620
+ ### Fixed
621
+ - BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
622
+ - Not searching properly for the BOM when trying utf32/16 parent codec.
623
+
624
+ ### Changed
625
+ - Improving the package final size by compressing frequencies.json.
626
+ - Huge improvement on the largest payloads.
627
+
628
+ ### Added
629
+ - CLI now produces JSON consumable output.
630
+ - Return ASCII if given sequences fit. Given reasonable confidence.
631
+
632
+ ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
633
+
634
+ ### Fixed
635
+ - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
636
+
637
+ ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
638
+
639
+ ### Fixed
640
+ - Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
641
+
642
+ ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
643
+
644
+ ### Fixed
645
+ - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
646
+
647
+ ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
648
+
649
+ ### Changed
650
+ - Amend the previous release to allow prettytable 2.0 (PR #35)
651
+
652
+ ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
653
+
654
+ ### Fixed
655
+ - Fix error while using the package with a python pre-release interpreter (PR #33)
656
+
657
+ ### Changed
658
+ - Dependencies refactoring, constraints revised.
659
+
660
+ ### Added
661
+ - Add python 3.9 and 3.10 to the supported interpreters
662
+
663
+ MIT License
664
+
665
+ Copyright (c) 2019 TAHRI Ahmed R.
666
+
667
+ Permission is hereby granted, free of charge, to any person obtaining a copy
668
+ of this software and associated documentation files (the "Software"), to deal
669
+ in the Software without restriction, including without limitation the rights
670
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
671
+ copies of the Software, and to permit persons to whom the Software is
672
+ furnished to do so, subject to the following conditions:
673
+
674
+ The above copyright notice and this permission notice shall be included in all
675
+ copies or substantial portions of the Software.
676
+
677
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
678
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
679
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
680
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
681
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
682
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
683
+ SOFTWARE.
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../../bin/normalizer,sha256=YnnjWFnBo-5ncwqWx_Z70rELAOBcQEMlV7bxTgbUYVY,296
2
+ charset_normalizer-3.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
3
+ charset_normalizer-3.3.2.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
4
+ charset_normalizer-3.3.2.dist-info/METADATA,sha256=cfLhl5A6SI-F0oclm8w8ux9wshL1nipdeCdVnYb4AaA,33550
5
+ charset_normalizer-3.3.2.dist-info/RECORD,,
6
+ charset_normalizer-3.3.2.dist-info/WHEEL,sha256=eaDTbMedWofVq8IZjew9qeAkoA5Sw2MOU2ppdIRr1Jg,110
7
+ charset_normalizer-3.3.2.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
8
+ charset_normalizer-3.3.2.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
9
+ charset_normalizer/__init__.py,sha256=UzI3xC8PhmcLRMzSgPb6minTmRq0kWznnCBJ8ZCc2XI,1577
10
+ charset_normalizer/__main__.py,sha256=JxY8bleaENOFlLRb9HfoeZCzAMnn2A1oGR5Xm2eyqg0,73
11
+ charset_normalizer/__pycache__/__init__.cpython-311.pyc,,
12
+ charset_normalizer/__pycache__/__main__.cpython-311.pyc,,
13
+ charset_normalizer/__pycache__/api.cpython-311.pyc,,
14
+ charset_normalizer/__pycache__/cd.cpython-311.pyc,,
15
+ charset_normalizer/__pycache__/constant.cpython-311.pyc,,
16
+ charset_normalizer/__pycache__/legacy.cpython-311.pyc,,
17
+ charset_normalizer/__pycache__/md.cpython-311.pyc,,
18
+ charset_normalizer/__pycache__/models.cpython-311.pyc,,
19
+ charset_normalizer/__pycache__/utils.cpython-311.pyc,,
20
+ charset_normalizer/__pycache__/version.cpython-311.pyc,,
21
+ charset_normalizer/api.py,sha256=WOlWjy6wT8SeMYFpaGbXZFN1TMXa-s8vZYfkL4G29iQ,21097
22
+ charset_normalizer/cd.py,sha256=xwZliZcTQFA3jU0c00PRiu9MNxXTFxQkFLWmMW24ZzI,12560
23
+ charset_normalizer/cli/__init__.py,sha256=D5ERp8P62llm2FuoMzydZ7d9rs8cvvLXqE-1_6oViPc,100
24
+ charset_normalizer/cli/__main__.py,sha256=2F-xURZJzo063Ye-2RLJ2wcmURpbKeAzKwpiws65dAs,9744
25
+ charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc,,
26
+ charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc,,
27
+ charset_normalizer/constant.py,sha256=p0IsOVcEbPWYPOdWhnhRbjK1YVBy6fs05C5vKC-zoxU,40481
28
+ charset_normalizer/legacy.py,sha256=T-QuVMsMeDiQEk8WSszMrzVJg_14AMeSkmHdRYhdl1k,2071
29
+ charset_normalizer/md.cpython-311-darwin.so,sha256=zbs-p3GrSygP9-4v4GVAUcyRpreXASFbQqgK9rvFoKw,50117
30
+ charset_normalizer/md.py,sha256=NkSuVLK13_a8c7BxZ4cGIQ5vOtGIWOdh22WZEvjp-7U,19624
31
+ charset_normalizer/md__mypyc.cpython-311-darwin.so,sha256=5u-KvFhpxi_WDpF0bB0tfYS2z7PzQ08aO8DAOMiMAXI,232636
32
+ charset_normalizer/models.py,sha256=I5i0s4aKCCgLPY2tUY3pwkgFA-BUbbNxQ7hVkVTt62s,11624
33
+ charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ charset_normalizer/utils.py,sha256=teiosMqzKjXyAHXnGdjSBOgnBZwx-SkBbCLrx0UXy8M,11894
35
+ charset_normalizer/version.py,sha256=iHKUfHD3kDRSyrh_BN2ojh43TA5-UZQjvbVIEFfpHDs,79
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.41.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp311-cp311-macosx_11_0_arm64
5
+
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ normalizer = charset_normalizer.cli:cli_detect
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ charset_normalizer
lib/python3.11/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Charset-Normalizer
4
+ ~~~~~~~~~~~~~~
5
+ The Real First Universal Charset Detector.
6
+ A library that helps you read text from an unknown charset encoding.
7
+ Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
8
+ All IANA character set names for which the Python core library provides codecs are supported.
9
+
10
+ Basic usage:
11
+ >>> from charset_normalizer import from_bytes
12
+ >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
13
+ >>> best_guess = results.best()
14
+ >>> str(best_guess)
15
+ 'Bсеки човек има право на образование. Oбразованието!'
16
+
17
+ Others methods and usages are available - see the full documentation
18
+ at <https://github.com/Ousret/charset_normalizer>.
19
+ :copyright: (c) 2021 by Ahmed TAHRI
20
+ :license: MIT, see LICENSE for more details.
21
+ """
22
+ import logging
23
+
24
+ from .api import from_bytes, from_fp, from_path, is_binary
25
+ from .legacy import detect
26
+ from .models import CharsetMatch, CharsetMatches
27
+ from .utils import set_logging_handler
28
+ from .version import VERSION, __version__
29
+
30
+ __all__ = (
31
+ "from_fp",
32
+ "from_path",
33
+ "from_bytes",
34
+ "is_binary",
35
+ "detect",
36
+ "CharsetMatch",
37
+ "CharsetMatches",
38
+ "__version__",
39
+ "VERSION",
40
+ "set_logging_handler",
41
+ )
42
+
43
+ # Attach a NullHandler to the top level logger by default
44
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
45
+
46
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
lib/python3.11/site-packages/charset_normalizer/__main__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .cli import cli_detect
2
+
3
+ if __name__ == "__main__":
4
+ cli_detect()
lib/python3.11/site-packages/charset_normalizer/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.88 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/__main__.cpython-311.pyc ADDED
Binary file (369 Bytes). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/api.cpython-311.pyc ADDED
Binary file (20.5 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/cd.cpython-311.pyc ADDED
Binary file (16.2 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/constant.cpython-311.pyc ADDED
Binary file (43.7 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/legacy.cpython-311.pyc ADDED
Binary file (2.8 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/md.cpython-311.pyc ADDED
Binary file (27.4 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/models.cpython-311.pyc ADDED
Binary file (18.1 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/utils.cpython-311.pyc ADDED
Binary file (16.4 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/__pycache__/version.cpython-311.pyc ADDED
Binary file (375 Bytes). View file
 
lib/python3.11/site-packages/charset_normalizer/api.py ADDED
@@ -0,0 +1,626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from os import PathLike
3
+ from typing import BinaryIO, List, Optional, Set, Union
4
+
5
+ from .cd import (
6
+ coherence_ratio,
7
+ encoding_languages,
8
+ mb_encoding_languages,
9
+ merge_coherence_ratios,
10
+ )
11
+ from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
12
+ from .md import mess_ratio
13
+ from .models import CharsetMatch, CharsetMatches
14
+ from .utils import (
15
+ any_specified_encoding,
16
+ cut_sequence_chunks,
17
+ iana_name,
18
+ identify_sig_or_bom,
19
+ is_cp_similar,
20
+ is_multi_byte_encoding,
21
+ should_strip_sig_or_bom,
22
+ )
23
+
24
+ # Will most likely be controversial
25
+ # logging.addLevelName(TRACE, "TRACE")
26
+ logger = logging.getLogger("charset_normalizer")
27
+ explain_handler = logging.StreamHandler()
28
+ explain_handler.setFormatter(
29
+ logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
30
+ )
31
+
32
+
33
+ def from_bytes(
34
+ sequences: Union[bytes, bytearray],
35
+ steps: int = 5,
36
+ chunk_size: int = 512,
37
+ threshold: float = 0.2,
38
+ cp_isolation: Optional[List[str]] = None,
39
+ cp_exclusion: Optional[List[str]] = None,
40
+ preemptive_behaviour: bool = True,
41
+ explain: bool = False,
42
+ language_threshold: float = 0.1,
43
+ enable_fallback: bool = True,
44
+ ) -> CharsetMatches:
45
+ """
46
+ Given a raw bytes sequence, return the best possibles charset usable to render str objects.
47
+ If there is no results, it is a strong indicator that the source is binary/not text.
48
+ By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
49
+ And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
50
+
51
+ The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
52
+ but never take it for granted. Can improve the performance.
53
+
54
+ You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
55
+ purpose.
56
+
57
+ This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
58
+ By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
59
+ toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
60
+ Custom logging format and handler can be set manually.
61
+ """
62
+
63
+ if not isinstance(sequences, (bytearray, bytes)):
64
+ raise TypeError(
65
+ "Expected object of type bytes or bytearray, got: {0}".format(
66
+ type(sequences)
67
+ )
68
+ )
69
+
70
+ if explain:
71
+ previous_logger_level: int = logger.level
72
+ logger.addHandler(explain_handler)
73
+ logger.setLevel(TRACE)
74
+
75
+ length: int = len(sequences)
76
+
77
+ if length == 0:
78
+ logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
79
+ if explain:
80
+ logger.removeHandler(explain_handler)
81
+ logger.setLevel(previous_logger_level or logging.WARNING)
82
+ return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
83
+
84
+ if cp_isolation is not None:
85
+ logger.log(
86
+ TRACE,
87
+ "cp_isolation is set. use this flag for debugging purpose. "
88
+ "limited list of encoding allowed : %s.",
89
+ ", ".join(cp_isolation),
90
+ )
91
+ cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
92
+ else:
93
+ cp_isolation = []
94
+
95
+ if cp_exclusion is not None:
96
+ logger.log(
97
+ TRACE,
98
+ "cp_exclusion is set. use this flag for debugging purpose. "
99
+ "limited list of encoding excluded : %s.",
100
+ ", ".join(cp_exclusion),
101
+ )
102
+ cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
103
+ else:
104
+ cp_exclusion = []
105
+
106
+ if length <= (chunk_size * steps):
107
+ logger.log(
108
+ TRACE,
109
+ "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
110
+ steps,
111
+ chunk_size,
112
+ length,
113
+ )
114
+ steps = 1
115
+ chunk_size = length
116
+
117
+ if steps > 1 and length / steps < chunk_size:
118
+ chunk_size = int(length / steps)
119
+
120
+ is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
121
+ is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
122
+
123
+ if is_too_small_sequence:
124
+ logger.log(
125
+ TRACE,
126
+ "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
127
+ length
128
+ ),
129
+ )
130
+ elif is_too_large_sequence:
131
+ logger.log(
132
+ TRACE,
133
+ "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
134
+ length
135
+ ),
136
+ )
137
+
138
+ prioritized_encodings: List[str] = []
139
+
140
+ specified_encoding: Optional[str] = (
141
+ any_specified_encoding(sequences) if preemptive_behaviour else None
142
+ )
143
+
144
+ if specified_encoding is not None:
145
+ prioritized_encodings.append(specified_encoding)
146
+ logger.log(
147
+ TRACE,
148
+ "Detected declarative mark in sequence. Priority +1 given for %s.",
149
+ specified_encoding,
150
+ )
151
+
152
+ tested: Set[str] = set()
153
+ tested_but_hard_failure: List[str] = []
154
+ tested_but_soft_failure: List[str] = []
155
+
156
+ fallback_ascii: Optional[CharsetMatch] = None
157
+ fallback_u8: Optional[CharsetMatch] = None
158
+ fallback_specified: Optional[CharsetMatch] = None
159
+
160
+ results: CharsetMatches = CharsetMatches()
161
+
162
+ sig_encoding, sig_payload = identify_sig_or_bom(sequences)
163
+
164
+ if sig_encoding is not None:
165
+ prioritized_encodings.append(sig_encoding)
166
+ logger.log(
167
+ TRACE,
168
+ "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
169
+ len(sig_payload),
170
+ sig_encoding,
171
+ )
172
+
173
+ prioritized_encodings.append("ascii")
174
+
175
+ if "utf_8" not in prioritized_encodings:
176
+ prioritized_encodings.append("utf_8")
177
+
178
+ for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
179
+ if cp_isolation and encoding_iana not in cp_isolation:
180
+ continue
181
+
182
+ if cp_exclusion and encoding_iana in cp_exclusion:
183
+ continue
184
+
185
+ if encoding_iana in tested:
186
+ continue
187
+
188
+ tested.add(encoding_iana)
189
+
190
+ decoded_payload: Optional[str] = None
191
+ bom_or_sig_available: bool = sig_encoding == encoding_iana
192
+ strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
193
+ encoding_iana
194
+ )
195
+
196
+ if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
197
+ logger.log(
198
+ TRACE,
199
+ "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
200
+ encoding_iana,
201
+ )
202
+ continue
203
+ if encoding_iana in {"utf_7"} and not bom_or_sig_available:
204
+ logger.log(
205
+ TRACE,
206
+ "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
207
+ encoding_iana,
208
+ )
209
+ continue
210
+
211
+ try:
212
+ is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
213
+ except (ModuleNotFoundError, ImportError):
214
+ logger.log(
215
+ TRACE,
216
+ "Encoding %s does not provide an IncrementalDecoder",
217
+ encoding_iana,
218
+ )
219
+ continue
220
+
221
+ try:
222
+ if is_too_large_sequence and is_multi_byte_decoder is False:
223
+ str(
224
+ sequences[: int(50e4)]
225
+ if strip_sig_or_bom is False
226
+ else sequences[len(sig_payload) : int(50e4)],
227
+ encoding=encoding_iana,
228
+ )
229
+ else:
230
+ decoded_payload = str(
231
+ sequences
232
+ if strip_sig_or_bom is False
233
+ else sequences[len(sig_payload) :],
234
+ encoding=encoding_iana,
235
+ )
236
+ except (UnicodeDecodeError, LookupError) as e:
237
+ if not isinstance(e, LookupError):
238
+ logger.log(
239
+ TRACE,
240
+ "Code page %s does not fit given bytes sequence at ALL. %s",
241
+ encoding_iana,
242
+ str(e),
243
+ )
244
+ tested_but_hard_failure.append(encoding_iana)
245
+ continue
246
+
247
+ similar_soft_failure_test: bool = False
248
+
249
+ for encoding_soft_failed in tested_but_soft_failure:
250
+ if is_cp_similar(encoding_iana, encoding_soft_failed):
251
+ similar_soft_failure_test = True
252
+ break
253
+
254
+ if similar_soft_failure_test:
255
+ logger.log(
256
+ TRACE,
257
+ "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
258
+ encoding_iana,
259
+ encoding_soft_failed,
260
+ )
261
+ continue
262
+
263
+ r_ = range(
264
+ 0 if not bom_or_sig_available else len(sig_payload),
265
+ length,
266
+ int(length / steps),
267
+ )
268
+
269
+ multi_byte_bonus: bool = (
270
+ is_multi_byte_decoder
271
+ and decoded_payload is not None
272
+ and len(decoded_payload) < length
273
+ )
274
+
275
+ if multi_byte_bonus:
276
+ logger.log(
277
+ TRACE,
278
+ "Code page %s is a multi byte encoding table and it appear that at least one character "
279
+ "was encoded using n-bytes.",
280
+ encoding_iana,
281
+ )
282
+
283
+ max_chunk_gave_up: int = int(len(r_) / 4)
284
+
285
+ max_chunk_gave_up = max(max_chunk_gave_up, 2)
286
+ early_stop_count: int = 0
287
+ lazy_str_hard_failure = False
288
+
289
+ md_chunks: List[str] = []
290
+ md_ratios = []
291
+
292
+ try:
293
+ for chunk in cut_sequence_chunks(
294
+ sequences,
295
+ encoding_iana,
296
+ r_,
297
+ chunk_size,
298
+ bom_or_sig_available,
299
+ strip_sig_or_bom,
300
+ sig_payload,
301
+ is_multi_byte_decoder,
302
+ decoded_payload,
303
+ ):
304
+ md_chunks.append(chunk)
305
+
306
+ md_ratios.append(
307
+ mess_ratio(
308
+ chunk,
309
+ threshold,
310
+ explain is True and 1 <= len(cp_isolation) <= 2,
311
+ )
312
+ )
313
+
314
+ if md_ratios[-1] >= threshold:
315
+ early_stop_count += 1
316
+
317
+ if (early_stop_count >= max_chunk_gave_up) or (
318
+ bom_or_sig_available and strip_sig_or_bom is False
319
+ ):
320
+ break
321
+ except (
322
+ UnicodeDecodeError
323
+ ) as e: # Lazy str loading may have missed something there
324
+ logger.log(
325
+ TRACE,
326
+ "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
327
+ encoding_iana,
328
+ str(e),
329
+ )
330
+ early_stop_count = max_chunk_gave_up
331
+ lazy_str_hard_failure = True
332
+
333
+ # We might want to check the sequence again with the whole content
334
+ # Only if initial MD tests passes
335
+ if (
336
+ not lazy_str_hard_failure
337
+ and is_too_large_sequence
338
+ and not is_multi_byte_decoder
339
+ ):
340
+ try:
341
+ sequences[int(50e3) :].decode(encoding_iana, errors="strict")
342
+ except UnicodeDecodeError as e:
343
+ logger.log(
344
+ TRACE,
345
+ "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
346
+ encoding_iana,
347
+ str(e),
348
+ )
349
+ tested_but_hard_failure.append(encoding_iana)
350
+ continue
351
+
352
+ mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
353
+ if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
354
+ tested_but_soft_failure.append(encoding_iana)
355
+ logger.log(
356
+ TRACE,
357
+ "%s was excluded because of initial chaos probing. Gave up %i time(s). "
358
+ "Computed mean chaos is %f %%.",
359
+ encoding_iana,
360
+ early_stop_count,
361
+ round(mean_mess_ratio * 100, ndigits=3),
362
+ )
363
+ # Preparing those fallbacks in case we got nothing.
364
+ if (
365
+ enable_fallback
366
+ and encoding_iana in ["ascii", "utf_8", specified_encoding]
367
+ and not lazy_str_hard_failure
368
+ ):
369
+ fallback_entry = CharsetMatch(
370
+ sequences, encoding_iana, threshold, False, [], decoded_payload
371
+ )
372
+ if encoding_iana == specified_encoding:
373
+ fallback_specified = fallback_entry
374
+ elif encoding_iana == "ascii":
375
+ fallback_ascii = fallback_entry
376
+ else:
377
+ fallback_u8 = fallback_entry
378
+ continue
379
+
380
+ logger.log(
381
+ TRACE,
382
+ "%s passed initial chaos probing. Mean measured chaos is %f %%",
383
+ encoding_iana,
384
+ round(mean_mess_ratio * 100, ndigits=3),
385
+ )
386
+
387
+ if not is_multi_byte_decoder:
388
+ target_languages: List[str] = encoding_languages(encoding_iana)
389
+ else:
390
+ target_languages = mb_encoding_languages(encoding_iana)
391
+
392
+ if target_languages:
393
+ logger.log(
394
+ TRACE,
395
+ "{} should target any language(s) of {}".format(
396
+ encoding_iana, str(target_languages)
397
+ ),
398
+ )
399
+
400
+ cd_ratios = []
401
+
402
+ # We shall skip the CD when its about ASCII
403
+ # Most of the time its not relevant to run "language-detection" on it.
404
+ if encoding_iana != "ascii":
405
+ for chunk in md_chunks:
406
+ chunk_languages = coherence_ratio(
407
+ chunk,
408
+ language_threshold,
409
+ ",".join(target_languages) if target_languages else None,
410
+ )
411
+
412
+ cd_ratios.append(chunk_languages)
413
+
414
+ cd_ratios_merged = merge_coherence_ratios(cd_ratios)
415
+
416
+ if cd_ratios_merged:
417
+ logger.log(
418
+ TRACE,
419
+ "We detected language {} using {}".format(
420
+ cd_ratios_merged, encoding_iana
421
+ ),
422
+ )
423
+
424
+ results.append(
425
+ CharsetMatch(
426
+ sequences,
427
+ encoding_iana,
428
+ mean_mess_ratio,
429
+ bom_or_sig_available,
430
+ cd_ratios_merged,
431
+ decoded_payload,
432
+ )
433
+ )
434
+
435
+ if (
436
+ encoding_iana in [specified_encoding, "ascii", "utf_8"]
437
+ and mean_mess_ratio < 0.1
438
+ ):
439
+ logger.debug(
440
+ "Encoding detection: %s is most likely the one.", encoding_iana
441
+ )
442
+ if explain:
443
+ logger.removeHandler(explain_handler)
444
+ logger.setLevel(previous_logger_level)
445
+ return CharsetMatches([results[encoding_iana]])
446
+
447
+ if encoding_iana == sig_encoding:
448
+ logger.debug(
449
+ "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
450
+ "the beginning of the sequence.",
451
+ encoding_iana,
452
+ )
453
+ if explain:
454
+ logger.removeHandler(explain_handler)
455
+ logger.setLevel(previous_logger_level)
456
+ return CharsetMatches([results[encoding_iana]])
457
+
458
+ if len(results) == 0:
459
+ if fallback_u8 or fallback_ascii or fallback_specified:
460
+ logger.log(
461
+ TRACE,
462
+ "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
463
+ )
464
+
465
+ if fallback_specified:
466
+ logger.debug(
467
+ "Encoding detection: %s will be used as a fallback match",
468
+ fallback_specified.encoding,
469
+ )
470
+ results.append(fallback_specified)
471
+ elif (
472
+ (fallback_u8 and fallback_ascii is None)
473
+ or (
474
+ fallback_u8
475
+ and fallback_ascii
476
+ and fallback_u8.fingerprint != fallback_ascii.fingerprint
477
+ )
478
+ or (fallback_u8 is not None)
479
+ ):
480
+ logger.debug("Encoding detection: utf_8 will be used as a fallback match")
481
+ results.append(fallback_u8)
482
+ elif fallback_ascii:
483
+ logger.debug("Encoding detection: ascii will be used as a fallback match")
484
+ results.append(fallback_ascii)
485
+
486
+ if results:
487
+ logger.debug(
488
+ "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
489
+ results.best().encoding, # type: ignore
490
+ len(results) - 1,
491
+ )
492
+ else:
493
+ logger.debug("Encoding detection: Unable to determine any suitable charset.")
494
+
495
+ if explain:
496
+ logger.removeHandler(explain_handler)
497
+ logger.setLevel(previous_logger_level)
498
+
499
+ return results
500
+
501
+
502
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Run from_bytes on everything that can be read from an already opened binary stream.
    The stream is read with a single read() call and is never closed here.
    """
    payload = fp.read()

    # Every remaining option is handed to from_bytes untouched, in declaration order.
    return from_bytes(
        payload,
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
        explain,
        language_threshold,
        enable_fallback,
    )
530
+
531
+
532
def from_path(
    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open the file found at the given path in binary mode and delegate the detection to from_fp.
    The file is closed when the detection returns. Can raise IOError.
    """
    with open(path, "rb") as binary_stream:
        return from_fp(
            binary_stream,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
561
+
562
+
563
def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw payload or opened binary stream) looks like binary content,
    that is, content for which the detection does not yield any charset match.
    Shares the keyword arguments of the from_* functions; the only difference is that enable_fallback
    defaults to False here.
    """
    detection_options = {
        "steps": steps,
        "chunk_size": chunk_size,
        "threshold": threshold,
        "cp_isolation": cp_isolation,
        "cp_exclusion": cp_exclusion,
        "preemptive_behaviour": preemptive_behaviour,
        "explain": explain,
        "language_threshold": language_threshold,
        "enable_fallback": enable_fallback,
    }

    # Pick the entry point matching the kind of input; anything that is neither
    # a path nor a bytes-like payload is handled as a readable binary stream.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        detect = from_path
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        detect = from_bytes
    else:
        detect = from_fp

    guesses = detect(fp_or_path_or_payload, **detection_options)

    return not guesses
lib/python3.11/site-packages/charset_normalizer/cd.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ from codecs import IncrementalDecoder
3
+ from collections import Counter
4
+ from functools import lru_cache
5
+ from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
6
+
7
+ from .constant import (
8
+ FREQUENCIES,
9
+ KO_NAMES,
10
+ LANGUAGE_SUPPORTED_COUNT,
11
+ TOO_SMALL_SEQUENCE,
12
+ ZH_NAMES,
13
+ )
14
+ from .md import is_suspiciously_successive_range
15
+ from .models import CoherenceMatches
16
+ from .utils import (
17
+ is_accentuated,
18
+ is_latin,
19
+ is_multi_byte_encoding,
20
+ is_unicode_range_secondary,
21
+ unicode_range,
22
+ )
23
+
24
+
25
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List, sorted by name, the unicode ranges that a single byte code page mostly decodes to.
    Bytes 0x40 up to 0xFE are decoded one at a time; a range is kept when it is not flagged as
    secondary and accounts for at least 15% of the counted characters.
    Raises IOError when the code page is a multi-byte one.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    occurrences: Dict[str, int] = {}
    counted: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        # Undecodable bytes come back empty because of errors="ignore"
        if not decoded:
            continue

        detected_range: Optional[str] = unicode_range(decoded)

        if detected_range is None:
            continue

        if is_unicode_range_secondary(detected_range) is not False:
            continue

        occurrences[detected_range] = occurrences.get(detected_range, 0) + 1
        counted += 1

    return sorted(
        range_name
        for range_name, hits in occurrences.items()
        if hits / counted >= 0.15
    )
62
+
63
+
64
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List every language of FREQUENCIES owning at least one character that belongs to the given unicode range.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
77
+
78
+
79
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Language association for single-byte code pages.
    The first unicode range of the code page whose name does not contain "Latin" decides the languages;
    when there is no such range the generic ["Latin Based"] marker is returned.
    """
    first_non_latin_range: Optional[str] = next(
        (
            range_name
            for range_name in encoding_unicode_range(iana_name)
            if "Latin" not in range_name
        ),
        None,
    )

    if first_non_latin_range is None:
        return ["Latin Based"]

    return unicode_range_languages(first_non_latin_range)
97
+
98
+
99
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Language association for multi-byte code pages, decided from the code page name alone.
    Returns a single-item list (Japanese, Chinese or Korean) or an empty list when the name is not recognized.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name == "cp932" or iana_name.startswith(japanese_prefixes):
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
118
+
119
+
120
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Summarize a supported language with two flags: whether any of its characters is accentuated,
    and whether all of its characters are Latin.
    """
    alphabet = FREQUENCIES[language]

    has_accents: bool = any(is_accentuated(character) for character in alphabet)
    # A single character reported as non-Latin is enough to lose the "pure Latin" flag
    pure_latin: bool = not any(is_latin(character) is False for character in alphabet)

    return has_accents, pure_latin
135
+
136
+
137
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages whose alphabet is sufficiently covered by the given characters.

    A language is kept when at least 20% of its FREQUENCIES characters appear in
    `characters`. Languages are skipped when `ignore_non_latin` is set and the
    language is not pure Latin, or when the given characters contain accents while
    the language has none. The result is ordered from the best covered language
    to the least covered one.
    """
    source_have_accents = any(is_accentuated(character) for character in characters)

    # Built once: the membership test below runs for every character of every
    # language, so a hash lookup avoids rescanning the list each time.
    observed_characters = frozenset(characters)

    languages: List[Tuple[str, float]] = []

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = sum(
            1 for c in language_characters if c in observed_characters
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    # Stable sort: languages with equal ratios keep their FREQUENCIES order
    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
170
+
171
+
172
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    An empty `ordered_characters` yields 0. since there is nothing to compare.
    Raises ValueError when the language is not available in FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    # Without any character the final ratio would be a division by zero
    if ordered_characters_count == 0:
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank of the first occurrence of each character, i.e. what list.index() would
    # answer, computed once instead of scanning the list on every iteration.
    language_ranks: Dict[str, int] = {}
    for rank, language_character in enumerate(language_characters):
        language_ranks.setdefault(language_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor projecting a rank of the source onto the language ranking
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language = language_ranks.get(character)

        # Character unknown to the language
        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a character too far from its expected rank is rejected
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: being within a third of the alphabet is good enough
        if (
            large_alphabet is True
            and rank_distance < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # These two guards also protect the divisions below from an empty slice
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
250
+
251
+
252
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split a decoded text into layers, one str per unicode range / alphabet, keeping letters only (lowercased).
    Ex. English/Latin text mixed with a bit of Hebrew gives two items: the latin letters and the hebrew ones.
    """
    layers: Dict[str, List[str]] = {}

    for character in decoded_sequence:
        if not character.isalpha():
            continue

        own_range: Optional[str] = unicode_range(character)

        if own_range is None:
            continue

        # Join the first known layer that is not suspicious next to this range,
        # otherwise the character goes to the layer of its own range.
        target_range: str = next(
            (
                known_range
                for known_range in layers
                if is_suspiciously_successive_range(known_range, own_range) is False
            ),
            own_range,
        )

        layers.setdefault(target_range, []).append(character.lower())

    return ["".join(letters) for letters in layers.values()]
289
+
290
+
291
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several outputs of coherence_ratio into a single one of the same type.
    Each language gets the mean of its ratios (rounded to 4 digits), best ratio first.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for coherence_matches in results:
        for language, ratio in coherence_matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]
    averaged.sort(key=lambda entry: entry[1], reverse=True)

    return averaged
317
+
318
+
319
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    "English—" is an alternative of "English" and both must not be returned side by side.
    When a language shows up more than once after removing the em-dash, only its best ratio is kept
    under the name without em-dash; otherwise the given results are returned untouched.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for language, ratio in results:
        ratios_by_language.setdefault(language.replace("—", ""), []).append(ratio)

    # No language collided with its alternative: nothing to filter
    if all(len(ratios) <= 1 for ratios in ratios_by_language.values()):
        return results

    return [
        (language, max(ratios)) for language, ratios in ratios_by_language.items()
    ]
344
+
345
+
346
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Identify ANY language present in the given sequence. The sequence is analysed layer by layer,
    a layer being the letters extracted for one alphabet/range.
    lg_inclusion is a comma separated list of languages to test instead of guessing them from the alphabet;
    the special "Latin Based" item only restricts the guess to pure Latin languages.
    """
    matches: List[Tuple[str, float]] = []
    strong_match_count: int = 0

    forced_languages = [] if lg_inclusion is None else lg_inclusion.split(",")
    latin_only: bool = "Latin Based" in forced_languages

    if latin_only:
        forced_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        ranked = Counter(layer).most_common()

        # Not enough letters in this layer to say anything meaningful
        if sum(occurrences for _, occurrences in ranked) <= TOO_SMALL_SEQUENCE:
            continue

        characters_by_popularity: List[str] = [character for character, _ in ranked]

        candidates = forced_languages or alphabet_languages(
            characters_by_popularity, latin_only
        )

        for language in candidates:
            ratio: float = characters_popularity_compare(
                language, characters_by_popularity
            )

            if ratio < threshold:
                continue

            if ratio >= 0.8:
                strong_match_count += 1

            matches.append((language, round(ratio, 4)))

            # The counter is shared by all layers and never reset
            if strong_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(matches), key=lambda x: x[1], reverse=True
    )
lib/python3.11/site-packages/charset_normalizer/cli/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .__main__ import cli_detect, query_yes_no
2
+
3
+ __all__ = (
4
+ "cli_detect",
5
+ "query_yes_no",
6
+ )
lib/python3.11/site-packages/charset_normalizer/cli/__main__.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import sys
3
+ from json import dumps
4
+ from os.path import abspath, basename, dirname, join, realpath
5
+ from platform import python_version
6
+ from typing import List, Optional
7
+ from unicodedata import unidata_version
8
+
9
+ import charset_normalizer.md as md_module
10
+ from charset_normalizer import from_fp
11
+ from charset_normalizer.models import CliDetectionResult
12
+ from charset_normalizer.version import __version__
13
+
14
+
15
def query_yes_no(question: str, default: Optional[str] = "yes") -> bool:
    """Ask a yes/no question via input() and return their answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
            It must be "yes" (the default), "no" or None (meaning
            an answer is required of the user).

    The "answer" return value is True for "yes" or False for "no".
    The question is asked again until a valid answer is given.
    Raises ValueError when "default" is none of the accepted values.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # The capital letter in the prompt hints at the answer used on a bare <Enter>
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()

        if default is not None and choice == "":
            return valid[default]

        if choice in valid:
            return valid[choice]

        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
46
+
47
+
48
def _cli_result_from_match(path, match, is_preferred):
    """Build the CliDetectionResult describing one detection ``match`` of ``path``."""
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        # Every other charset that decodes to the same output, minus the match itself.
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,  # unicode_path: only filled in once a normalized copy is written
        is_preferred,
    )


def cli_detect(argv: Optional[List[str]] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    Detect the encoding of every given file, optionally write a UTF-8
    normalized copy (``--normalize``), then print the results either as JSON
    or, with ``--minimal``, as one line of encoding name(s) per input file.

    :param argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
    :return: 0 if everything is fine, anything else equal trouble
             (1: invalid option combination or threshold out of [0, 1],
             2: the normalized output could not be written).
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # argparse.FileType has already opened every input at this point, so each
    # exit path below (including the early returns) must release the handles.
    try:
        if args.replace is True and args.normalize is False:
            print("Use --replace in addition of --normalize only.", file=sys.stderr)
            return 1

        if args.force is True and args.replace is False:
            print("Use --force in addition of --replace only.", file=sys.stderr)
            return 1

        if args.threshold < 0.0 or args.threshold > 1.0:
            print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
            return 1

        x_ = []

        for my_file in args.files:
            # The context manager closes the handle on every way out of this
            # iteration (normal end, ``continue`` or ``return``).
            with my_file:
                matches = from_fp(
                    my_file, threshold=args.threshold, explain=args.verbose
                )

                best_guess = matches.best()

                if best_guess is None:
                    print(
                        'Unable to identify originating encoding for "{}". {}'.format(
                            my_file.name,
                            "Maybe try increasing maximum amount of chaos."
                            if args.threshold < 1.0
                            else "",
                        ),
                        file=sys.stderr,
                    )
                    # Placeholder entry so the file still shows up in the output.
                    x_.append(
                        CliDetectionResult(
                            abspath(my_file.name),
                            None,
                            [],
                            [],
                            "Unknown",
                            [],
                            False,
                            1.0,
                            0.0,
                            None,
                            True,
                        )
                    )
                    continue

                # Keep a handle on THIS file's preferred result: the normalized
                # path must be reported on it, not on the first result overall.
                preferred = _cli_result_from_match(
                    abspath(my_file.name), best_guess, True
                )
                x_.append(preferred)

                if len(matches) > 1 and args.alternatives:
                    for el in matches:
                        if el != best_guess:
                            x_.append(
                                _cli_result_from_match(
                                    abspath(my_file.name), el, False
                                )
                            )

                if args.normalize is False:
                    continue

                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: List[str] = file_name.split(".")

                if args.replace is False:
                    # New sibling file: insert the detected encoding before the
                    # last dot-separated component of the file name.
                    o_.insert(-1, best_guess.encoding)
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    continue

                # Release the input before writing: with --replace the output
                # path is the very same file.
                my_file.close()

                try:
                    preferred.unicode_path = join(dir_path, ".".join(o_))

                    with open(preferred.unicode_path, "w", encoding="utf-8") as fp:
                        fp.write(str(best_guess))
                except IOError as e:
                    print(str(e), file=sys.stderr)
                    return 2

        if args.minimal is False:
            print(
                dumps(
                    [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                    ensure_ascii=True,
                    indent=4,
                )
            )
        else:
            # One line per input file, listing every encoding reported for it.
            for my_file in args.files:
                print(
                    ", ".join(
                        [
                            el.encoding or "undefined"
                            for el in x_
                            if el.path == abspath(my_file.name)
                        ]
                    )
                )

        return 0
    finally:
        # Covers the inputs that were never reached because of an early return.
        for pending in args.files:
            if pending.closed is False:
                pending.close()
293
+
294
+
295
if __name__ == "__main__":
    # Hand cli_detect()'s status code to the shell; calling it bare discarded
    # the value, so a failed run still exited with status 0.
    sys.exit(cli_detect())
lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (341 Bytes). View file
 
lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc ADDED
Binary file (11.7 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/constant.py ADDED
@@ -0,0 +1,1995 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
3
+ from encodings.aliases import aliases
4
+ from re import IGNORECASE, compile as re_compile
5
+ from typing import Dict, List, Set, Union
6
+
7
# For each eligible encoding, the byte signature (SIG/BOM) that marks it:
# either a single bytes value or a list of alternative signatures.
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = dict(
    utf_8=BOM_UTF8,
    # ASCII rendering of the five entries: "+/v8", "+/v9", "+/v+", "+/v/", "+/v8-"
    utf_7=[
        b"\x2b\x2f\x76\x38",
        b"\x2b\x2f\x76\x39",
        b"\x2b\x2f\x76\x2b",
        b"\x2b\x2f\x76\x2f",
        b"\x2b\x2f\x76\x38\x2d",
    ],
    gb18030=b"\x84\x31\x95\x33",
    utf_32=[BOM_UTF32_BE, BOM_UTF32_LE],
    utf_16=[BOM_UTF16_BE, BOM_UTF16_LE],
)
21
+
22
# Size thresholds, in the unit used by the callers of these constants.
# NOTE(review): presumably byte counts of the analysed payload - confirm at the call sites.
TOO_SMALL_SEQUENCE: int = 32
TOO_BIG_SEQUENCE: int = 10_000_000

# Numerically equal to 0x110000 - 0x800.
# NOTE(review): looks like the count of Unicode code points minus the
# surrogate range - confirm intent.
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
26
+
27
# Up-to-date Unicode ucd/15.0.0
# Maps a Unicode block name to its code points as a half-open range:
# range(start, stop) contains start but not stop.
# Entries are listed in ascending code-point order and do not cover every
# code point: some intervals between consecutive entries are absent
# (for example 12256..12271, between "Kangxi Radicals" and
# "Ideographic Description Characters").
UNICODE_RANGES_COMBINED: Dict[str, range] = {
    "Control character": range(32),
    "Basic Latin": range(32, 128),
    "Latin-1 Supplement": range(128, 256),
    "Latin Extended-A": range(256, 384),
    "Latin Extended-B": range(384, 592),
    "IPA Extensions": range(592, 688),
    "Spacing Modifier Letters": range(688, 768),
    "Combining Diacritical Marks": range(768, 880),
    "Greek and Coptic": range(880, 1024),
    "Cyrillic": range(1024, 1280),
    "Cyrillic Supplement": range(1280, 1328),
    "Armenian": range(1328, 1424),
    "Hebrew": range(1424, 1536),
    "Arabic": range(1536, 1792),
    "Syriac": range(1792, 1872),
    "Arabic Supplement": range(1872, 1920),
    "Thaana": range(1920, 1984),
    "NKo": range(1984, 2048),
    "Samaritan": range(2048, 2112),
    "Mandaic": range(2112, 2144),
    "Syriac Supplement": range(2144, 2160),
    "Arabic Extended-B": range(2160, 2208),
    "Arabic Extended-A": range(2208, 2304),
    "Devanagari": range(2304, 2432),
    "Bengali": range(2432, 2560),
    "Gurmukhi": range(2560, 2688),
    "Gujarati": range(2688, 2816),
    "Oriya": range(2816, 2944),
    "Tamil": range(2944, 3072),
    "Telugu": range(3072, 3200),
    "Kannada": range(3200, 3328),
    "Malayalam": range(3328, 3456),
    "Sinhala": range(3456, 3584),
    "Thai": range(3584, 3712),
    "Lao": range(3712, 3840),
    "Tibetan": range(3840, 4096),
    "Myanmar": range(4096, 4256),
    "Georgian": range(4256, 4352),
    "Hangul Jamo": range(4352, 4608),
    "Ethiopic": range(4608, 4992),
    "Ethiopic Supplement": range(4992, 5024),
    "Cherokee": range(5024, 5120),
    "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
    "Ogham": range(5760, 5792),
    "Runic": range(5792, 5888),
    "Tagalog": range(5888, 5920),
    "Hanunoo": range(5920, 5952),
    "Buhid": range(5952, 5984),
    "Tagbanwa": range(5984, 6016),
    "Khmer": range(6016, 6144),
    "Mongolian": range(6144, 6320),
    "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
    "Limbu": range(6400, 6480),
    "Tai Le": range(6480, 6528),
    "New Tai Lue": range(6528, 6624),
    "Khmer Symbols": range(6624, 6656),
    "Buginese": range(6656, 6688),
    "Tai Tham": range(6688, 6832),
    "Combining Diacritical Marks Extended": range(6832, 6912),
    "Balinese": range(6912, 7040),
    "Sundanese": range(7040, 7104),
    "Batak": range(7104, 7168),
    "Lepcha": range(7168, 7248),
    "Ol Chiki": range(7248, 7296),
    "Cyrillic Extended-C": range(7296, 7312),
    "Georgian Extended": range(7312, 7360),
    "Sundanese Supplement": range(7360, 7376),
    "Vedic Extensions": range(7376, 7424),
    "Phonetic Extensions": range(7424, 7552),
    "Phonetic Extensions Supplement": range(7552, 7616),
    "Combining Diacritical Marks Supplement": range(7616, 7680),
    "Latin Extended Additional": range(7680, 7936),
    "Greek Extended": range(7936, 8192),
    "General Punctuation": range(8192, 8304),
    "Superscripts and Subscripts": range(8304, 8352),
    "Currency Symbols": range(8352, 8400),
    "Combining Diacritical Marks for Symbols": range(8400, 8448),
    "Letterlike Symbols": range(8448, 8528),
    "Number Forms": range(8528, 8592),
    "Arrows": range(8592, 8704),
    "Mathematical Operators": range(8704, 8960),
    "Miscellaneous Technical": range(8960, 9216),
    "Control Pictures": range(9216, 9280),
    "Optical Character Recognition": range(9280, 9312),
    "Enclosed Alphanumerics": range(9312, 9472),
    "Box Drawing": range(9472, 9600),
    "Block Elements": range(9600, 9632),
    "Geometric Shapes": range(9632, 9728),
    "Miscellaneous Symbols": range(9728, 9984),
    "Dingbats": range(9984, 10176),
    "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
    "Supplemental Arrows-A": range(10224, 10240),
    "Braille Patterns": range(10240, 10496),
    "Supplemental Arrows-B": range(10496, 10624),
    "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
    "Supplemental Mathematical Operators": range(10752, 11008),
    "Miscellaneous Symbols and Arrows": range(11008, 11264),
    "Glagolitic": range(11264, 11360),
    "Latin Extended-C": range(11360, 11392),
    "Coptic": range(11392, 11520),
    "Georgian Supplement": range(11520, 11568),
    "Tifinagh": range(11568, 11648),
    "Ethiopic Extended": range(11648, 11744),
    "Cyrillic Extended-A": range(11744, 11776),
    "Supplemental Punctuation": range(11776, 11904),
    "CJK Radicals Supplement": range(11904, 12032),
    "Kangxi Radicals": range(12032, 12256),
    "Ideographic Description Characters": range(12272, 12288),
    "CJK Symbols and Punctuation": range(12288, 12352),
    "Hiragana": range(12352, 12448),
    "Katakana": range(12448, 12544),
    "Bopomofo": range(12544, 12592),
    "Hangul Compatibility Jamo": range(12592, 12688),
    "Kanbun": range(12688, 12704),
    "Bopomofo Extended": range(12704, 12736),
    "CJK Strokes": range(12736, 12784),
    "Katakana Phonetic Extensions": range(12784, 12800),
    "Enclosed CJK Letters and Months": range(12800, 13056),
    "CJK Compatibility": range(13056, 13312),
    "CJK Unified Ideographs Extension A": range(13312, 19904),
    "Yijing Hexagram Symbols": range(19904, 19968),
    "CJK Unified Ideographs": range(19968, 40960),
    "Yi Syllables": range(40960, 42128),
    "Yi Radicals": range(42128, 42192),
    "Lisu": range(42192, 42240),
    "Vai": range(42240, 42560),
    "Cyrillic Extended-B": range(42560, 42656),
    "Bamum": range(42656, 42752),
    "Modifier Tone Letters": range(42752, 42784),
    "Latin Extended-D": range(42784, 43008),
    "Syloti Nagri": range(43008, 43056),
    "Common Indic Number Forms": range(43056, 43072),
    "Phags-pa": range(43072, 43136),
    "Saurashtra": range(43136, 43232),
    "Devanagari Extended": range(43232, 43264),
    "Kayah Li": range(43264, 43312),
    "Rejang": range(43312, 43360),
    "Hangul Jamo Extended-A": range(43360, 43392),
    "Javanese": range(43392, 43488),
    "Myanmar Extended-B": range(43488, 43520),
    "Cham": range(43520, 43616),
    "Myanmar Extended-A": range(43616, 43648),
    "Tai Viet": range(43648, 43744),
    "Meetei Mayek Extensions": range(43744, 43776),
    "Ethiopic Extended-A": range(43776, 43824),
    "Latin Extended-E": range(43824, 43888),
    "Cherokee Supplement": range(43888, 43968),
    "Meetei Mayek": range(43968, 44032),
    "Hangul Syllables": range(44032, 55216),
    "Hangul Jamo Extended-B": range(55216, 55296),
    "High Surrogates": range(55296, 56192),
    "High Private Use Surrogates": range(56192, 56320),
    "Low Surrogates": range(56320, 57344),
    "Private Use Area": range(57344, 63744),
    "CJK Compatibility Ideographs": range(63744, 64256),
    "Alphabetic Presentation Forms": range(64256, 64336),
    "Arabic Presentation Forms-A": range(64336, 65024),
    "Variation Selectors": range(65024, 65040),
    "Vertical Forms": range(65040, 65056),
    "Combining Half Marks": range(65056, 65072),
    "CJK Compatibility Forms": range(65072, 65104),
    "Small Form Variants": range(65104, 65136),
    "Arabic Presentation Forms-B": range(65136, 65280),
    "Halfwidth and Fullwidth Forms": range(65280, 65520),
    "Specials": range(65520, 65536),
    "Linear B Syllabary": range(65536, 65664),
    "Linear B Ideograms": range(65664, 65792),
    "Aegean Numbers": range(65792, 65856),
    "Ancient Greek Numbers": range(65856, 65936),
    "Ancient Symbols": range(65936, 66000),
    "Phaistos Disc": range(66000, 66048),
    "Lycian": range(66176, 66208),
    "Carian": range(66208, 66272),
    "Coptic Epact Numbers": range(66272, 66304),
    "Old Italic": range(66304, 66352),
    "Gothic": range(66352, 66384),
    "Old Permic": range(66384, 66432),
    "Ugaritic": range(66432, 66464),
    "Old Persian": range(66464, 66528),
    "Deseret": range(66560, 66640),
    "Shavian": range(66640, 66688),
    "Osmanya": range(66688, 66736),
    "Osage": range(66736, 66816),
    "Elbasan": range(66816, 66864),
    "Caucasian Albanian": range(66864, 66928),
    "Vithkuqi": range(66928, 67008),
    "Linear A": range(67072, 67456),
    "Latin Extended-F": range(67456, 67520),
    "Cypriot Syllabary": range(67584, 67648),
    "Imperial Aramaic": range(67648, 67680),
    "Palmyrene": range(67680, 67712),
    "Nabataean": range(67712, 67760),
    "Hatran": range(67808, 67840),
    "Phoenician": range(67840, 67872),
    "Lydian": range(67872, 67904),
    "Meroitic Hieroglyphs": range(67968, 68000),
    "Meroitic Cursive": range(68000, 68096),
    "Kharoshthi": range(68096, 68192),
    "Old South Arabian": range(68192, 68224),
    "Old North Arabian": range(68224, 68256),
    "Manichaean": range(68288, 68352),
    "Avestan": range(68352, 68416),
    "Inscriptional Parthian": range(68416, 68448),
    "Inscriptional Pahlavi": range(68448, 68480),
    "Psalter Pahlavi": range(68480, 68528),
    "Old Turkic": range(68608, 68688),
    "Old Hungarian": range(68736, 68864),
    "Hanifi Rohingya": range(68864, 68928),
    "Rumi Numeral Symbols": range(69216, 69248),
    "Yezidi": range(69248, 69312),
    "Arabic Extended-C": range(69312, 69376),
    "Old Sogdian": range(69376, 69424),
    "Sogdian": range(69424, 69488),
    "Old Uyghur": range(69488, 69552),
    "Chorasmian": range(69552, 69600),
    "Elymaic": range(69600, 69632),
    "Brahmi": range(69632, 69760),
    "Kaithi": range(69760, 69840),
    "Sora Sompeng": range(69840, 69888),
    "Chakma": range(69888, 69968),
    "Mahajani": range(69968, 70016),
    "Sharada": range(70016, 70112),
    "Sinhala Archaic Numbers": range(70112, 70144),
    "Khojki": range(70144, 70224),
    "Multani": range(70272, 70320),
    "Khudawadi": range(70320, 70400),
    "Grantha": range(70400, 70528),
    "Newa": range(70656, 70784),
    "Tirhuta": range(70784, 70880),
    "Siddham": range(71040, 71168),
    "Modi": range(71168, 71264),
    "Mongolian Supplement": range(71264, 71296),
    "Takri": range(71296, 71376),
    "Ahom": range(71424, 71504),
    "Dogra": range(71680, 71760),
    "Warang Citi": range(71840, 71936),
    "Dives Akuru": range(71936, 72032),
    "Nandinagari": range(72096, 72192),
    "Zanabazar Square": range(72192, 72272),
    "Soyombo": range(72272, 72368),
    "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
    "Pau Cin Hau": range(72384, 72448),
    "Devanagari Extended-A": range(72448, 72544),
    "Bhaiksuki": range(72704, 72816),
    "Marchen": range(72816, 72896),
    "Masaram Gondi": range(72960, 73056),
    "Gunjala Gondi": range(73056, 73136),
    "Makasar": range(73440, 73472),
    "Kawi": range(73472, 73568),
    "Lisu Supplement": range(73648, 73664),
    "Tamil Supplement": range(73664, 73728),
    "Cuneiform": range(73728, 74752),
    "Cuneiform Numbers and Punctuation": range(74752, 74880),
    "Early Dynastic Cuneiform": range(74880, 75088),
    "Cypro-Minoan": range(77712, 77824),
    "Egyptian Hieroglyphs": range(77824, 78896),
    "Egyptian Hieroglyph Format Controls": range(78896, 78944),
    "Anatolian Hieroglyphs": range(82944, 83584),
    "Bamum Supplement": range(92160, 92736),
    "Mro": range(92736, 92784),
    "Tangsa": range(92784, 92880),
    "Bassa Vah": range(92880, 92928),
    "Pahawh Hmong": range(92928, 93072),
    "Medefaidrin": range(93760, 93856),
    "Miao": range(93952, 94112),
    "Ideographic Symbols and Punctuation": range(94176, 94208),
    "Tangut": range(94208, 100352),
    "Tangut Components": range(100352, 101120),
    "Khitan Small Script": range(101120, 101632),
    "Tangut Supplement": range(101632, 101760),
    "Kana Extended-B": range(110576, 110592),
    "Kana Supplement": range(110592, 110848),
    "Kana Extended-A": range(110848, 110896),
    "Small Kana Extension": range(110896, 110960),
    "Nushu": range(110960, 111360),
    "Duployan": range(113664, 113824),
    "Shorthand Format Controls": range(113824, 113840),
    "Znamenny Musical Notation": range(118528, 118736),
    "Byzantine Musical Symbols": range(118784, 119040),
    "Musical Symbols": range(119040, 119296),
    "Ancient Greek Musical Notation": range(119296, 119376),
    "Kaktovik Numerals": range(119488, 119520),
    "Mayan Numerals": range(119520, 119552),
    "Tai Xuan Jing Symbols": range(119552, 119648),
    "Counting Rod Numerals": range(119648, 119680),
    "Mathematical Alphanumeric Symbols": range(119808, 120832),
    "Sutton SignWriting": range(120832, 121520),
    "Latin Extended-G": range(122624, 122880),
    "Glagolitic Supplement": range(122880, 122928),
    "Cyrillic Extended-D": range(122928, 123024),
    "Nyiakeng Puachue Hmong": range(123136, 123216),
    "Toto": range(123536, 123584),
    "Wancho": range(123584, 123648),
    "Nag Mundari": range(124112, 124160),
    "Ethiopic Extended-B": range(124896, 124928),
    "Mende Kikakui": range(124928, 125152),
    "Adlam": range(125184, 125280),
    "Indic Siyaq Numbers": range(126064, 126144),
    "Ottoman Siyaq Numbers": range(126208, 126288),
    "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
    "Mahjong Tiles": range(126976, 127024),
    "Domino Tiles": range(127024, 127136),
    "Playing Cards": range(127136, 127232),
    "Enclosed Alphanumeric Supplement": range(127232, 127488),
    "Enclosed Ideographic Supplement": range(127488, 127744),
    "Miscellaneous Symbols and Pictographs": range(127744, 128512),
    # NOTE(review): the "range(" inside this key looks like a find/replace
    # artifact. It is a runtime dictionary key, so it is deliberately left
    # as-is here - confirm no caller matches the exact name before renaming.
    "Emoticons range(Emoji)": range(128512, 128592),
    "Ornamental Dingbats": range(128592, 128640),
    "Transport and Map Symbols": range(128640, 128768),
    "Alchemical Symbols": range(128768, 128896),
    "Geometric Shapes Extended": range(128896, 129024),
    "Supplemental Arrows-C": range(129024, 129280),
    "Supplemental Symbols and Pictographs": range(129280, 129536),
    "Chess Symbols": range(129536, 129648),
    "Symbols and Pictographs Extended-A": range(129648, 129792),
    "Symbols for Legacy Computing": range(129792, 130048),
    "CJK Unified Ideographs Extension B": range(131072, 173792),
    "CJK Unified Ideographs Extension C": range(173824, 177984),
    "CJK Unified Ideographs Extension D": range(177984, 178208),
    "CJK Unified Ideographs Extension E": range(178208, 183984),
    "CJK Unified Ideographs Extension F": range(183984, 191472),
    "CJK Compatibility Ideographs Supplement": range(194560, 195104),
    "CJK Unified Ideographs Extension G": range(196608, 201552),
    "CJK Unified Ideographs Extension H": range(201552, 205744),
    "Tags": range(917504, 917632),
    "Variation Selectors Supplement": range(917760, 918000),
    "Supplementary Private Use Area-A": range(983040, 1048576),
    "Supplementary Private Use Area-B": range(1048576, 1114112),
}
358
+
359
+
360
# Single-word markers, kept in their original order.
# NOTE(review): presumably matched against Unicode range names to flag a
# range as "secondary" - confirm against the code that consumes this list.
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = (
    "Supplement Extended Extensions Modifier Marks "
    "Punctuation Symbols Forms Operators Miscellaneous "
    "Drawing Block Shapes Supplemental Tags"
).split()
377
+
378
# Case-insensitive pattern spotting an in-text encoding declaration such as
# `charset="utf-8"` or `coding: latin_1`; group 1 captures the declared label.
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))"  # introducing keyword
    r"(?:[\:= ]{1,10})"  # 1 to 10 separators: colon, equals sign or space
    r"(?:[\"\']?)"  # optional opening quote
    r"([a-zA-Z0-9\-_]+)"  # group 1: the label itself
    r"(?:[\"\']?)",  # optional closing quote
    flags=IGNORECASE,
)
382
+
383
# Encoding names added to the supported list on top of the alias targets
# found in `encodings.aliases`.
IANA_NO_ALIASES = [
    "cp720",
    "cp737",
    "cp856",
    "cp874",
    "cp875",
    "cp1006",
    "koi8_r",
    "koi8_t",
    "koi8_u",
]

# Sorted names of every candidate encoding: all alias targets plus the
# additions above, without the "*_codec" entries and three excluded names.
# The two sources are merged as a set so that a name present in both is
# listed (and counted) only once.
IANA_SUPPORTED: List[str] = sorted(
    name
    for name in set(aliases.values()).union(IANA_NO_ALIASES)
    if not name.endswith("_codec") and name not in {"rot_13", "tactis", "mbcs"}
)

IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
404
+
405
# Pre-computed table of similar code pages, produced with the function
# cp_similarity. Each key is an encoding name and its value lists the
# encodings considered similar to it. The table is static data: it is not
# recomputed when this module is imported.
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
    "cp850": ["cp437", "cp857", "cp858", "cp865"],
    "cp857": ["cp850", "cp858", "cp865"],
    "cp858": ["cp437", "cp850", "cp857", "cp865"],
    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
    "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
    "cp866": ["cp1125"],
    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
    "iso8859_11": ["tis_620"],
    "iso8859_13": ["cp1257"],
    "iso8859_14": [
        "iso8859_10",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_15": [
        "cp1252",
        "cp1254",
        "iso8859_10",
        "iso8859_14",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_16": [
        "iso8859_14",
        "iso8859_15",
        "iso8859_2",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
    "iso8859_7": ["cp1253"],
    "iso8859_9": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "latin_1",
    ],
    "kz1048": ["cp1251", "ptcp154"],
    "latin_1": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "iso8859_9",
    ],
    "mac_iceland": ["mac_roman", "mac_turkish"],
    "mac_roman": ["mac_iceland", "mac_turkish"],
    "mac_turkish": ["mac_iceland", "mac_roman"],
    "ptcp154": ["cp1251", "kz1048"],
    "tis_620": ["iso8859_11"],
}
493
+
494
+
495
# Maps an underscore-style encoding name (key) to a display label (value).
# NOTE(review): judging by the constant's name, the labels are presumably the
# spellings reported by chardet - confirm against the caller. The mixed
# casing of the labels ("utf-8", "Windows-1252", "windows-1250", ...) is
# runtime data and must be preserved exactly.
CHARDET_CORRESPONDENCE: Dict[str, str] = {
    "iso2022_kr": "ISO-2022-KR",
    "iso2022_jp": "ISO-2022-JP",
    "euc_kr": "EUC-KR",
    "tis_620": "TIS-620",
    "utf_32": "UTF-32",
    "euc_jp": "EUC-JP",
    "koi8_r": "KOI8-R",
    "iso8859_1": "ISO-8859-1",
    "iso8859_2": "ISO-8859-2",
    "iso8859_5": "ISO-8859-5",
    "iso8859_6": "ISO-8859-6",
    "iso8859_7": "ISO-8859-7",
    "iso8859_8": "ISO-8859-8",
    "utf_16": "UTF-16",
    "cp855": "IBM855",
    "mac_cyrillic": "MacCyrillic",
    "gb2312": "GB2312",
    "gb18030": "GB18030",
    "cp932": "CP932",
    "cp866": "IBM866",
    "utf_8": "utf-8",
    "utf_8_sig": "UTF-8-SIG",
    "shift_jis": "SHIFT_JIS",
    "big5": "Big5",
    "cp1250": "windows-1250",
    "cp1251": "windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "windows-1253",
    "cp1255": "windows-1255",
    "cp1256": "windows-1256",
    "cp1254": "Windows-1254",
    "cp949": "CP949",
}
529
+
530
+
531
# Fifteen ASCII punctuation characters, one set member per character.
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = set('<>=:/&;{}[],|"-')


# Encoding names grouped by the constant's language prefix.
KO_NAMES: Set[str] = set(("johab", "cp949", "euc_kr"))
ZH_NAMES: Set[str] = set(("big5", "cp950", "big5hkscs", "hz"))

# Custom logging level, numerically below logging.DEBUG (which is 10).
TRACE: int = 5
555
+
556
+
557
+ # Language label that contain the em dash "—"
558
+ # character are to be considered alternative seq to origin
559
+ FREQUENCIES: Dict[str, List[str]] = {
560
+ "English": [
561
+ "e",
562
+ "a",
563
+ "t",
564
+ "i",
565
+ "o",
566
+ "n",
567
+ "s",
568
+ "r",
569
+ "h",
570
+ "l",
571
+ "d",
572
+ "c",
573
+ "u",
574
+ "m",
575
+ "f",
576
+ "p",
577
+ "g",
578
+ "w",
579
+ "y",
580
+ "b",
581
+ "v",
582
+ "k",
583
+ "x",
584
+ "j",
585
+ "z",
586
+ "q",
587
+ ],
588
+ "English—": [
589
+ "e",
590
+ "a",
591
+ "t",
592
+ "i",
593
+ "o",
594
+ "n",
595
+ "s",
596
+ "r",
597
+ "h",
598
+ "l",
599
+ "d",
600
+ "c",
601
+ "m",
602
+ "u",
603
+ "f",
604
+ "p",
605
+ "g",
606
+ "w",
607
+ "b",
608
+ "y",
609
+ "v",
610
+ "k",
611
+ "j",
612
+ "x",
613
+ "z",
614
+ "q",
615
+ ],
616
+ "German": [
617
+ "e",
618
+ "n",
619
+ "i",
620
+ "r",
621
+ "s",
622
+ "t",
623
+ "a",
624
+ "d",
625
+ "h",
626
+ "u",
627
+ "l",
628
+ "g",
629
+ "o",
630
+ "c",
631
+ "m",
632
+ "b",
633
+ "f",
634
+ "k",
635
+ "w",
636
+ "z",
637
+ "p",
638
+ "v",
639
+ "ü",
640
+ "ä",
641
+ "ö",
642
+ "j",
643
+ ],
644
+ "French": [
645
+ "e",
646
+ "a",
647
+ "s",
648
+ "n",
649
+ "i",
650
+ "t",
651
+ "r",
652
+ "l",
653
+ "u",
654
+ "o",
655
+ "d",
656
+ "c",
657
+ "p",
658
+ "m",
659
+ "é",
660
+ "v",
661
+ "g",
662
+ "f",
663
+ "b",
664
+ "h",
665
+ "q",
666
+ "à",
667
+ "x",
668
+ "è",
669
+ "y",
670
+ "j",
671
+ ],
672
+ "Dutch": [
673
+ "e",
674
+ "n",
675
+ "a",
676
+ "i",
677
+ "r",
678
+ "t",
679
+ "o",
680
+ "d",
681
+ "s",
682
+ "l",
683
+ "g",
684
+ "h",
685
+ "v",
686
+ "m",
687
+ "u",
688
+ "k",
689
+ "c",
690
+ "p",
691
+ "b",
692
+ "w",
693
+ "j",
694
+ "z",
695
+ "f",
696
+ "y",
697
+ "x",
698
+ "ë",
699
+ ],
700
+ "Italian": [
701
+ "e",
702
+ "i",
703
+ "a",
704
+ "o",
705
+ "n",
706
+ "l",
707
+ "t",
708
+ "r",
709
+ "s",
710
+ "c",
711
+ "d",
712
+ "u",
713
+ "p",
714
+ "m",
715
+ "g",
716
+ "v",
717
+ "f",
718
+ "b",
719
+ "z",
720
+ "h",
721
+ "q",
722
+ "è",
723
+ "à",
724
+ "k",
725
+ "y",
726
+ "ò",
727
+ ],
728
+ "Polish": [
729
+ "a",
730
+ "i",
731
+ "o",
732
+ "e",
733
+ "n",
734
+ "r",
735
+ "z",
736
+ "w",
737
+ "s",
738
+ "c",
739
+ "t",
740
+ "k",
741
+ "y",
742
+ "d",
743
+ "p",
744
+ "m",
745
+ "u",
746
+ "l",
747
+ "j",
748
+ "ł",
749
+ "g",
750
+ "b",
751
+ "h",
752
+ "ą",
753
+ "ę",
754
+ "ó",
755
+ ],
756
+ "Spanish": [
757
+ "e",
758
+ "a",
759
+ "o",
760
+ "n",
761
+ "s",
762
+ "r",
763
+ "i",
764
+ "l",
765
+ "d",
766
+ "t",
767
+ "c",
768
+ "u",
769
+ "m",
770
+ "p",
771
+ "b",
772
+ "g",
773
+ "v",
774
+ "f",
775
+ "y",
776
+ "ó",
777
+ "h",
778
+ "q",
779
+ "í",
780
+ "j",
781
+ "z",
782
+ "á",
783
+ ],
784
+ "Russian": [
785
+ "о",
786
+ "а",
787
+ "е",
788
+ "и",
789
+ "н",
790
+ "с",
791
+ "т",
792
+ "р",
793
+ "в",
794
+ "л",
795
+ "к",
796
+ "м",
797
+ "д",
798
+ "п",
799
+ "у",
800
+ "г",
801
+ "я",
802
+ "ы",
803
+ "з",
804
+ "б",
805
+ "й",
806
+ "ь",
807
+ "ч",
808
+ "х",
809
+ "ж",
810
+ "ц",
811
+ ],
812
+ # Jap-Kanji
813
+ "Japanese": [
814
+ "人",
815
+ "一",
816
+ "大",
817
+ "亅",
818
+ "丁",
819
+ "丨",
820
+ "竹",
821
+ "笑",
822
+ "口",
823
+ "日",
824
+ "今",
825
+ "二",
826
+ "彳",
827
+ "行",
828
+ "十",
829
+ "土",
830
+ "丶",
831
+ "寸",
832
+ "寺",
833
+ "時",
834
+ "乙",
835
+ "丿",
836
+ "乂",
837
+ "气",
838
+ "気",
839
+ "冂",
840
+ "巾",
841
+ "亠",
842
+ "市",
843
+ "目",
844
+ "儿",
845
+ "見",
846
+ "八",
847
+ "小",
848
+ "凵",
849
+ "県",
850
+ "月",
851
+ "彐",
852
+ "門",
853
+ "間",
854
+ "木",
855
+ "東",
856
+ "山",
857
+ "出",
858
+ "本",
859
+ "中",
860
+ "刀",
861
+ "分",
862
+ "耳",
863
+ "又",
864
+ "取",
865
+ "最",
866
+ "言",
867
+ "田",
868
+ "心",
869
+ "思",
870
+ "刂",
871
+ "前",
872
+ "京",
873
+ "尹",
874
+ "事",
875
+ "生",
876
+ "厶",
877
+ "云",
878
+ "会",
879
+ "未",
880
+ "来",
881
+ "白",
882
+ "冫",
883
+ "楽",
884
+ "灬",
885
+ "馬",
886
+ "尸",
887
+ "尺",
888
+ "駅",
889
+ "明",
890
+ "耂",
891
+ "者",
892
+ "了",
893
+ "阝",
894
+ "都",
895
+ "高",
896
+ "卜",
897
+ "占",
898
+ "厂",
899
+ "广",
900
+ "店",
901
+ "子",
902
+ "申",
903
+ "奄",
904
+ "亻",
905
+ "俺",
906
+ "上",
907
+ "方",
908
+ "冖",
909
+ "学",
910
+ "衣",
911
+ "艮",
912
+ "食",
913
+ "自",
914
+ ],
915
+ # Jap-Katakana
916
+ "Japanese—": [
917
+ "ー",
918
+ "ン",
919
+ "ス",
920
+ "・",
921
+ "ル",
922
+ "ト",
923
+ "リ",
924
+ "イ",
925
+ "ア",
926
+ "ラ",
927
+ "ッ",
928
+ "ク",
929
+ "ド",
930
+ "シ",
931
+ "レ",
932
+ "ジ",
933
+ "タ",
934
+ "フ",
935
+ "ロ",
936
+ "カ",
937
+ "テ",
938
+ "マ",
939
+ "ィ",
940
+ "グ",
941
+ "バ",
942
+ "ム",
943
+ "プ",
944
+ "オ",
945
+ "コ",
946
+ "デ",
947
+ "ニ",
948
+ "ウ",
949
+ "メ",
950
+ "サ",
951
+ "ビ",
952
+ "ナ",
953
+ "ブ",
954
+ "ャ",
955
+ "エ",
956
+ "ュ",
957
+ "チ",
958
+ "キ",
959
+ "ズ",
960
+ "ダ",
961
+ "パ",
962
+ "ミ",
963
+ "ェ",
964
+ "ョ",
965
+ "ハ",
966
+ "セ",
967
+ "ベ",
968
+ "ガ",
969
+ "モ",
970
+ "ツ",
971
+ "ネ",
972
+ "ボ",
973
+ "ソ",
974
+ "ノ",
975
+ "ァ",
976
+ "ヴ",
977
+ "ワ",
978
+ "ポ",
979
+ "ペ",
980
+ "ピ",
981
+ "ケ",
982
+ "ゴ",
983
+ "ギ",
984
+ "ザ",
985
+ "ホ",
986
+ "ゲ",
987
+ "ォ",
988
+ "ヤ",
989
+ "ヒ",
990
+ "ユ",
991
+ "ヨ",
992
+ "ヘ",
993
+ "ゼ",
994
+ "ヌ",
995
+ "ゥ",
996
+ "ゾ",
997
+ "ヶ",
998
+ "ヂ",
999
+ "ヲ",
1000
+ "ヅ",
1001
+ "ヵ",
1002
+ "ヱ",
1003
+ "ヰ",
1004
+ "ヮ",
1005
+ "ヽ",
1006
+ "゠",
1007
+ "ヾ",
1008
+ "ヷ",
1009
+ "ヿ",
1010
+ "ヸ",
1011
+ "ヹ",
1012
+ "ヺ",
1013
+ ],
1014
+ # Jap-Hiragana
1015
+ "Japanese——": [
1016
+ "の",
1017
+ "に",
1018
+ "る",
1019
+ "た",
1020
+ "と",
1021
+ "は",
1022
+ "し",
1023
+ "い",
1024
+ "を",
1025
+ "で",
1026
+ "て",
1027
+ "が",
1028
+ "な",
1029
+ "れ",
1030
+ "か",
1031
+ "ら",
1032
+ "さ",
1033
+ "っ",
1034
+ "り",
1035
+ "す",
1036
+ "あ",
1037
+ "も",
1038
+ "こ",
1039
+ "ま",
1040
+ "う",
1041
+ "く",
1042
+ "よ",
1043
+ "き",
1044
+ "ん",
1045
+ "め",
1046
+ "お",
1047
+ "け",
1048
+ "そ",
1049
+ "つ",
1050
+ "だ",
1051
+ "や",
1052
+ "え",
1053
+ "ど",
1054
+ "わ",
1055
+ "ち",
1056
+ "み",
1057
+ "せ",
1058
+ "じ",
1059
+ "ば",
1060
+ "へ",
1061
+ "び",
1062
+ "ず",
1063
+ "ろ",
1064
+ "ほ",
1065
+ "げ",
1066
+ "む",
1067
+ "べ",
1068
+ "ひ",
1069
+ "ょ",
1070
+ "ゆ",
1071
+ "ぶ",
1072
+ "ご",
1073
+ "ゃ",
1074
+ "ね",
1075
+ "ふ",
1076
+ "ぐ",
1077
+ "ぎ",
1078
+ "ぼ",
1079
+ "ゅ",
1080
+ "づ",
1081
+ "ざ",
1082
+ "ぞ",
1083
+ "ぬ",
1084
+ "ぜ",
1085
+ "ぱ",
1086
+ "ぽ",
1087
+ "ぷ",
1088
+ "ぴ",
1089
+ "ぃ",
1090
+ "ぁ",
1091
+ "ぇ",
1092
+ "ぺ",
1093
+ "ゞ",
1094
+ "ぢ",
1095
+ "ぉ",
1096
+ "ぅ",
1097
+ "ゐ",
1098
+ "ゝ",
1099
+ "ゑ",
1100
+ "゛",
1101
+ "゜",
1102
+ "ゎ",
1103
+ "ゔ",
1104
+ "゚",
1105
+ "ゟ",
1106
+ "゙",
1107
+ "ゕ",
1108
+ "ゖ",
1109
+ ],
1110
+ "Portuguese": [
1111
+ "a",
1112
+ "e",
1113
+ "o",
1114
+ "s",
1115
+ "i",
1116
+ "r",
1117
+ "d",
1118
+ "n",
1119
+ "t",
1120
+ "m",
1121
+ "u",
1122
+ "c",
1123
+ "l",
1124
+ "p",
1125
+ "g",
1126
+ "v",
1127
+ "b",
1128
+ "f",
1129
+ "h",
1130
+ "ã",
1131
+ "q",
1132
+ "é",
1133
+ "ç",
1134
+ "á",
1135
+ "z",
1136
+ "í",
1137
+ ],
1138
+ "Swedish": [
1139
+ "e",
1140
+ "a",
1141
+ "n",
1142
+ "r",
1143
+ "t",
1144
+ "s",
1145
+ "i",
1146
+ "l",
1147
+ "d",
1148
+ "o",
1149
+ "m",
1150
+ "k",
1151
+ "g",
1152
+ "v",
1153
+ "h",
1154
+ "f",
1155
+ "u",
1156
+ "p",
1157
+ "ä",
1158
+ "c",
1159
+ "b",
1160
+ "ö",
1161
+ "å",
1162
+ "y",
1163
+ "j",
1164
+ "x",
1165
+ ],
1166
+ "Chinese": [
1167
+ "的",
1168
+ "一",
1169
+ "是",
1170
+ "不",
1171
+ "了",
1172
+ "在",
1173
+ "人",
1174
+ "有",
1175
+ "我",
1176
+ "他",
1177
+ "这",
1178
+ "个",
1179
+ "们",
1180
+ "中",
1181
+ "来",
1182
+ "上",
1183
+ "大",
1184
+ "为",
1185
+ "和",
1186
+ "国",
1187
+ "地",
1188
+ "到",
1189
+ "以",
1190
+ "说",
1191
+ "时",
1192
+ "要",
1193
+ "就",
1194
+ "出",
1195
+ "会",
1196
+ "可",
1197
+ "也",
1198
+ "你",
1199
+ "对",
1200
+ "生",
1201
+ "能",
1202
+ "而",
1203
+ "子",
1204
+ "那",
1205
+ "得",
1206
+ "于",
1207
+ "着",
1208
+ "下",
1209
+ "自",
1210
+ "之",
1211
+ "年",
1212
+ "过",
1213
+ "发",
1214
+ "后",
1215
+ "作",
1216
+ "里",
1217
+ "用",
1218
+ "道",
1219
+ "行",
1220
+ "所",
1221
+ "然",
1222
+ "家",
1223
+ "种",
1224
+ "事",
1225
+ "成",
1226
+ "方",
1227
+ "多",
1228
+ "经",
1229
+ "么",
1230
+ "去",
1231
+ "法",
1232
+ "学",
1233
+ "如",
1234
+ "都",
1235
+ "同",
1236
+ "现",
1237
+ "当",
1238
+ "没",
1239
+ "动",
1240
+ "面",
1241
+ "起",
1242
+ "看",
1243
+ "定",
1244
+ "天",
1245
+ "分",
1246
+ "还",
1247
+ "进",
1248
+ "好",
1249
+ "小",
1250
+ "部",
1251
+ "其",
1252
+ "些",
1253
+ "主",
1254
+ "样",
1255
+ "理",
1256
+ "心",
1257
+ "她",
1258
+ "本",
1259
+ "前",
1260
+ "开",
1261
+ "但",
1262
+ "因",
1263
+ "只",
1264
+ "从",
1265
+ "想",
1266
+ "实",
1267
+ ],
1268
+ "Ukrainian": [
1269
+ "о",
1270
+ "а",
1271
+ "н",
1272
+ "і",
1273
+ "и",
1274
+ "р",
1275
+ "в",
1276
+ "т",
1277
+ "е",
1278
+ "с",
1279
+ "к",
1280
+ "л",
1281
+ "у",
1282
+ "д",
1283
+ "м",
1284
+ "п",
1285
+ "з",
1286
+ "я",
1287
+ "ь",
1288
+ "б",
1289
+ "г",
1290
+ "й",
1291
+ "ч",
1292
+ "х",
1293
+ "ц",
1294
+ "ї",
1295
+ ],
1296
+ "Norwegian": [
1297
+ "e",
1298
+ "r",
1299
+ "n",
1300
+ "t",
1301
+ "a",
1302
+ "s",
1303
+ "i",
1304
+ "o",
1305
+ "l",
1306
+ "d",
1307
+ "g",
1308
+ "k",
1309
+ "m",
1310
+ "v",
1311
+ "f",
1312
+ "p",
1313
+ "u",
1314
+ "b",
1315
+ "h",
1316
+ "å",
1317
+ "y",
1318
+ "j",
1319
+ "ø",
1320
+ "c",
1321
+ "æ",
1322
+ "w",
1323
+ ],
1324
+ "Finnish": [
1325
+ "a",
1326
+ "i",
1327
+ "n",
1328
+ "t",
1329
+ "e",
1330
+ "s",
1331
+ "l",
1332
+ "o",
1333
+ "u",
1334
+ "k",
1335
+ "ä",
1336
+ "m",
1337
+ "r",
1338
+ "v",
1339
+ "j",
1340
+ "h",
1341
+ "p",
1342
+ "y",
1343
+ "d",
1344
+ "ö",
1345
+ "g",
1346
+ "c",
1347
+ "b",
1348
+ "f",
1349
+ "w",
1350
+ "z",
1351
+ ],
1352
+ "Vietnamese": [
1353
+ "n",
1354
+ "h",
1355
+ "t",
1356
+ "i",
1357
+ "c",
1358
+ "g",
1359
+ "a",
1360
+ "o",
1361
+ "u",
1362
+ "m",
1363
+ "l",
1364
+ "r",
1365
+ "à",
1366
+ "đ",
1367
+ "s",
1368
+ "e",
1369
+ "v",
1370
+ "p",
1371
+ "b",
1372
+ "y",
1373
+ "ư",
1374
+ "d",
1375
+ "á",
1376
+ "k",
1377
+ "ộ",
1378
+ "ế",
1379
+ ],
1380
+ "Czech": [
1381
+ "o",
1382
+ "e",
1383
+ "a",
1384
+ "n",
1385
+ "t",
1386
+ "s",
1387
+ "i",
1388
+ "l",
1389
+ "v",
1390
+ "r",
1391
+ "k",
1392
+ "d",
1393
+ "u",
1394
+ "m",
1395
+ "p",
1396
+ "í",
1397
+ "c",
1398
+ "h",
1399
+ "z",
1400
+ "á",
1401
+ "y",
1402
+ "j",
1403
+ "b",
1404
+ "ě",
1405
+ "é",
1406
+ "ř",
1407
+ ],
1408
+ "Hungarian": [
1409
+ "e",
1410
+ "a",
1411
+ "t",
1412
+ "l",
1413
+ "s",
1414
+ "n",
1415
+ "k",
1416
+ "r",
1417
+ "i",
1418
+ "o",
1419
+ "z",
1420
+ "á",
1421
+ "é",
1422
+ "g",
1423
+ "m",
1424
+ "b",
1425
+ "y",
1426
+ "v",
1427
+ "d",
1428
+ "h",
1429
+ "u",
1430
+ "p",
1431
+ "j",
1432
+ "ö",
1433
+ "f",
1434
+ "c",
1435
+ ],
1436
+ "Korean": [
1437
+ "이",
1438
+ "다",
1439
+ "에",
1440
+ "의",
1441
+ "는",
1442
+ "로",
1443
+ "하",
1444
+ "을",
1445
+ "가",
1446
+ "고",
1447
+ "지",
1448
+ "서",
1449
+ "한",
1450
+ "은",
1451
+ "기",
1452
+ "으",
1453
+ "년",
1454
+ "대",
1455
+ "사",
1456
+ "시",
1457
+ "를",
1458
+ "리",
1459
+ "도",
1460
+ "인",
1461
+ "스",
1462
+ "일",
1463
+ ],
1464
+ "Indonesian": [
1465
+ "a",
1466
+ "n",
1467
+ "e",
1468
+ "i",
1469
+ "r",
1470
+ "t",
1471
+ "u",
1472
+ "s",
1473
+ "d",
1474
+ "k",
1475
+ "m",
1476
+ "l",
1477
+ "g",
1478
+ "p",
1479
+ "b",
1480
+ "o",
1481
+ "h",
1482
+ "y",
1483
+ "j",
1484
+ "c",
1485
+ "w",
1486
+ "f",
1487
+ "v",
1488
+ "z",
1489
+ "x",
1490
+ "q",
1491
+ ],
1492
+ "Turkish": [
1493
+ "a",
1494
+ "e",
1495
+ "i",
1496
+ "n",
1497
+ "r",
1498
+ "l",
1499
+ "ı",
1500
+ "k",
1501
+ "d",
1502
+ "t",
1503
+ "s",
1504
+ "m",
1505
+ "y",
1506
+ "u",
1507
+ "o",
1508
+ "b",
1509
+ "ü",
1510
+ "ş",
1511
+ "v",
1512
+ "g",
1513
+ "z",
1514
+ "h",
1515
+ "c",
1516
+ "p",
1517
+ "ç",
1518
+ "ğ",
1519
+ ],
1520
+ "Romanian": [
1521
+ "e",
1522
+ "i",
1523
+ "a",
1524
+ "r",
1525
+ "n",
1526
+ "t",
1527
+ "u",
1528
+ "l",
1529
+ "o",
1530
+ "c",
1531
+ "s",
1532
+ "d",
1533
+ "p",
1534
+ "m",
1535
+ "ă",
1536
+ "f",
1537
+ "v",
1538
+ "î",
1539
+ "g",
1540
+ "b",
1541
+ "ș",
1542
+ "ț",
1543
+ "z",
1544
+ "h",
1545
+ "â",
1546
+ "j",
1547
+ ],
1548
+ "Farsi": [
1549
+ "ا",
1550
+ "ی",
1551
+ "ر",
1552
+ "د",
1553
+ "ن",
1554
+ "ه",
1555
+ "و",
1556
+ "م",
1557
+ "ت",
1558
+ "ب",
1559
+ "س",
1560
+ "ل",
1561
+ "ک",
1562
+ "ش",
1563
+ "ز",
1564
+ "ف",
1565
+ "گ",
1566
+ "ع",
1567
+ "خ",
1568
+ "ق",
1569
+ "ج",
1570
+ "آ",
1571
+ "پ",
1572
+ "ح",
1573
+ "ط",
1574
+ "ص",
1575
+ ],
1576
+ "Arabic": [
1577
+ "ا",
1578
+ "ل",
1579
+ "ي",
1580
+ "م",
1581
+ "و",
1582
+ "ن",
1583
+ "ر",
1584
+ "ت",
1585
+ "ب",
1586
+ "ة",
1587
+ "ع",
1588
+ "د",
1589
+ "س",
1590
+ "ف",
1591
+ "ه",
1592
+ "ك",
1593
+ "ق",
1594
+ "أ",
1595
+ "ح",
1596
+ "ج",
1597
+ "ش",
1598
+ "ط",
1599
+ "ص",
1600
+ "ى",
1601
+ "خ",
1602
+ "إ",
1603
+ ],
1604
+ "Danish": [
1605
+ "e",
1606
+ "r",
1607
+ "n",
1608
+ "t",
1609
+ "a",
1610
+ "i",
1611
+ "s",
1612
+ "d",
1613
+ "l",
1614
+ "o",
1615
+ "g",
1616
+ "m",
1617
+ "k",
1618
+ "f",
1619
+ "v",
1620
+ "u",
1621
+ "b",
1622
+ "h",
1623
+ "p",
1624
+ "å",
1625
+ "y",
1626
+ "ø",
1627
+ "æ",
1628
+ "c",
1629
+ "j",
1630
+ "w",
1631
+ ],
1632
+ "Serbian": [
1633
+ "а",
1634
+ "и",
1635
+ "о",
1636
+ "е",
1637
+ "н",
1638
+ "р",
1639
+ "с",
1640
+ "у",
1641
+ "т",
1642
+ "к",
1643
+ "ј",
1644
+ "в",
1645
+ "д",
1646
+ "м",
1647
+ "п",
1648
+ "л",
1649
+ "г",
1650
+ "з",
1651
+ "б",
1652
+ "a",
1653
+ "i",
1654
+ "e",
1655
+ "o",
1656
+ "n",
1657
+ "ц",
1658
+ "ш",
1659
+ ],
1660
+ "Lithuanian": [
1661
+ "i",
1662
+ "a",
1663
+ "s",
1664
+ "o",
1665
+ "r",
1666
+ "e",
1667
+ "t",
1668
+ "n",
1669
+ "u",
1670
+ "k",
1671
+ "m",
1672
+ "l",
1673
+ "p",
1674
+ "v",
1675
+ "d",
1676
+ "j",
1677
+ "g",
1678
+ "ė",
1679
+ "b",
1680
+ "y",
1681
+ "ų",
1682
+ "š",
1683
+ "ž",
1684
+ "c",
1685
+ "ą",
1686
+ "į",
1687
+ ],
1688
+ "Slovene": [
1689
+ "e",
1690
+ "a",
1691
+ "i",
1692
+ "o",
1693
+ "n",
1694
+ "r",
1695
+ "s",
1696
+ "l",
1697
+ "t",
1698
+ "j",
1699
+ "v",
1700
+ "k",
1701
+ "d",
1702
+ "p",
1703
+ "m",
1704
+ "u",
1705
+ "z",
1706
+ "b",
1707
+ "g",
1708
+ "h",
1709
+ "č",
1710
+ "c",
1711
+ "š",
1712
+ "ž",
1713
+ "f",
1714
+ "y",
1715
+ ],
1716
+ "Slovak": [
1717
+ "o",
1718
+ "a",
1719
+ "e",
1720
+ "n",
1721
+ "i",
1722
+ "r",
1723
+ "v",
1724
+ "t",
1725
+ "s",
1726
+ "l",
1727
+ "k",
1728
+ "d",
1729
+ "m",
1730
+ "p",
1731
+ "u",
1732
+ "c",
1733
+ "h",
1734
+ "j",
1735
+ "b",
1736
+ "z",
1737
+ "á",
1738
+ "y",
1739
+ "ý",
1740
+ "í",
1741
+ "č",
1742
+ "é",
1743
+ ],
1744
+ "Hebrew": [
1745
+ "י",
1746
+ "ו",
1747
+ "ה",
1748
+ "ל",
1749
+ "ר",
1750
+ "ב",
1751
+ "ת",
1752
+ "מ",
1753
+ "א",
1754
+ "ש",
1755
+ "נ",
1756
+ "ע",
1757
+ "ם",
1758
+ "ד",
1759
+ "ק",
1760
+ "ח",
1761
+ "פ",
1762
+ "ס",
1763
+ "כ",
1764
+ "ג",
1765
+ "ט",
1766
+ "צ",
1767
+ "ן",
1768
+ "ז",
1769
+ "ך",
1770
+ ],
1771
+ "Bulgarian": [
1772
+ "а",
1773
+ "и",
1774
+ "о",
1775
+ "е",
1776
+ "н",
1777
+ "т",
1778
+ "р",
1779
+ "с",
1780
+ "в",
1781
+ "л",
1782
+ "к",
1783
+ "д",
1784
+ "п",
1785
+ "м",
1786
+ "з",
1787
+ "г",
1788
+ "я",
1789
+ "ъ",
1790
+ "у",
1791
+ "б",
1792
+ "ч",
1793
+ "ц",
1794
+ "й",
1795
+ "ж",
1796
+ "щ",
1797
+ "х",
1798
+ ],
1799
+ "Croatian": [
1800
+ "a",
1801
+ "i",
1802
+ "o",
1803
+ "e",
1804
+ "n",
1805
+ "r",
1806
+ "j",
1807
+ "s",
1808
+ "t",
1809
+ "u",
1810
+ "k",
1811
+ "l",
1812
+ "v",
1813
+ "d",
1814
+ "m",
1815
+ "p",
1816
+ "g",
1817
+ "z",
1818
+ "b",
1819
+ "c",
1820
+ "č",
1821
+ "h",
1822
+ "š",
1823
+ "ž",
1824
+ "ć",
1825
+ "f",
1826
+ ],
1827
+ "Hindi": [
1828
+ "क",
1829
+ "र",
1830
+ "स",
1831
+ "न",
1832
+ "त",
1833
+ "म",
1834
+ "ह",
1835
+ "प",
1836
+ "य",
1837
+ "ल",
1838
+ "व",
1839
+ "ज",
1840
+ "द",
1841
+ "ग",
1842
+ "ब",
1843
+ "श",
1844
+ "ट",
1845
+ "अ",
1846
+ "ए",
1847
+ "थ",
1848
+ "भ",
1849
+ "ड",
1850
+ "च",
1851
+ "ध",
1852
+ "ष",
1853
+ "इ",
1854
+ ],
1855
+ "Estonian": [
1856
+ "a",
1857
+ "i",
1858
+ "e",
1859
+ "s",
1860
+ "t",
1861
+ "l",
1862
+ "u",
1863
+ "n",
1864
+ "o",
1865
+ "k",
1866
+ "r",
1867
+ "d",
1868
+ "m",
1869
+ "v",
1870
+ "g",
1871
+ "p",
1872
+ "j",
1873
+ "h",
1874
+ "ä",
1875
+ "b",
1876
+ "õ",
1877
+ "ü",
1878
+ "f",
1879
+ "c",
1880
+ "ö",
1881
+ "y",
1882
+ ],
1883
+ "Thai": [
1884
+ "า",
1885
+ "น",
1886
+ "ร",
1887
+ "อ",
1888
+ "ก",
1889
+ "เ",
1890
+ "ง",
1891
+ "ม",
1892
+ "ย",
1893
+ "ล",
1894
+ "ว",
1895
+ "ด",
1896
+ "ท",
1897
+ "ส",
1898
+ "ต",
1899
+ "ะ",
1900
+ "ป",
1901
+ "บ",
1902
+ "ค",
1903
+ "ห",
1904
+ "แ",
1905
+ "จ",
1906
+ "พ",
1907
+ "ช",
1908
+ "ข",
1909
+ "ใ",
1910
+ ],
1911
+ "Greek": [
1912
+ "α",
1913
+ "τ",
1914
+ "ο",
1915
+ "ι",
1916
+ "ε",
1917
+ "ν",
1918
+ "ρ",
1919
+ "σ",
1920
+ "κ",
1921
+ "η",
1922
+ "π",
1923
+ "ς",
1924
+ "υ",
1925
+ "μ",
1926
+ "λ",
1927
+ "ί",
1928
+ "ό",
1929
+ "ά",
1930
+ "γ",
1931
+ "έ",
1932
+ "δ",
1933
+ "ή",
1934
+ "ω",
1935
+ "χ",
1936
+ "θ",
1937
+ "ύ",
1938
+ ],
1939
+ "Tamil": [
1940
+ "க",
1941
+ "த",
1942
+ "ப",
1943
+ "ட",
1944
+ "ர",
1945
+ "ம",
1946
+ "ல",
1947
+ "ன",
1948
+ "வ",
1949
+ "ற",
1950
+ "ய",
1951
+ "ள",
1952
+ "ச",
1953
+ "ந",
1954
+ "இ",
1955
+ "ண",
1956
+ "அ",
1957
+ "ஆ",
1958
+ "ழ",
1959
+ "ங",
1960
+ "எ",
1961
+ "உ",
1962
+ "ஒ",
1963
+ "ஸ",
1964
+ ],
1965
+ "Kazakh": [
1966
+ "а",
1967
+ "ы",
1968
+ "е",
1969
+ "н",
1970
+ "т",
1971
+ "р",
1972
+ "л",
1973
+ "і",
1974
+ "д",
1975
+ "с",
1976
+ "м",
1977
+ "қ",
1978
+ "к",
1979
+ "о",
1980
+ "б",
1981
+ "и",
1982
+ "у",
1983
+ "ғ",
1984
+ "ж",
1985
+ "ң",
1986
+ "з",
1987
+ "ш",
1988
+ "й",
1989
+ "п",
1990
+ "г",
1991
+ "ө",
1992
+ ],
1993
+ }
1994
+
1995
+ LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
lib/python3.11/site-packages/charset_normalizer/legacy.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, Optional, Union
2
+ from warnings import warn
3
+
4
+ from .api import from_bytes
5
+ from .constant import CHARDET_CORRESPONDENCE
6
+
7
+
8
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (bytes or bytearray).
    :param should_rename_legacy: When left to False, the detected name is translated
        through CHARDET_CORRESPONDENCE (if listed there). Any other value keeps the
        name as detected.
    :param kwargs: Ignored. A warning listing the ignored names is emitted.
    :return: A dict with the keys "encoding", "language" and "confidence".
        Without any match: encoding is None, language is "" and confidence is None.
    :raises TypeError: If byte_str is neither bytes nor bytearray.
    """
    if kwargs:
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_match = from_bytes(byte_str).best()

    # No plausible match: nothing to rename, every field takes its "empty" value.
    if best_match is None:
        return {
            "encoding": None,
            "language": "",
            "confidence": None,
        }

    encoding = best_match.encoding
    language = best_match.language if best_match.language != "Unknown" else ""
    confidence = 1.0 - best_match.chaos

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if encoding == "utf_8" and best_match.bom:
        encoding += "_sig"

    # Identity check kept on purpose: only an explicit False triggers the translation.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
lib/python3.11/site-packages/charset_normalizer/md.cpython-311-darwin.so ADDED
Binary file (50.1 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/md.py ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+ from logging import getLogger
3
+ from typing import List, Optional
4
+
5
+ from .constant import (
6
+ COMMON_SAFE_ASCII_CHARACTERS,
7
+ TRACE,
8
+ UNICODE_SECONDARY_RANGE_KEYWORD,
9
+ )
10
+ from .utils import (
11
+ is_accentuated,
12
+ is_arabic,
13
+ is_arabic_isolated_form,
14
+ is_case_variable,
15
+ is_cjk,
16
+ is_emoticon,
17
+ is_hangul,
18
+ is_hiragana,
19
+ is_katakana,
20
+ is_latin,
21
+ is_punctuation,
22
+ is_separator,
23
+ is_symbol,
24
+ is_thai,
25
+ is_unprintable,
26
+ remove_accent,
27
+ unicode_range,
28
+ )
29
+
30
+
31
class MessDetectorPlugin:
    """
    Common contract of the mess detection plugins.
    Every member below only raises NotImplementedError here; a detector
    extends this class and overrides all of them.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be handed to feed().
        """
        raise NotImplementedError()  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Main routine, executed on one character at a time.
        Holds the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError()  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError()

    @property
    def ratio(self) -> float:
        """
        Chaos ratio computed from what feed() has seen so far.
        Must NOT be lower than 0.; no upper restriction.
        """
        raise NotImplementedError()  # pragma: nocover
63
+
64
+
65
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Detect content made of too much punctuation and too many symbols.
    A punctuation sign weighs 1 and a symbol weighs 2; the ratio is only
    reported once it reaches 30% of the printable characters seen.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """Only printable characters are examined."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, then weigh it when it is neither an immediate
        repetition of the previous character nor a common safe ASCII sign.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols (digits and emoticons excluded) weigh twice as much.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """Restore every field to the value set by __init__."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Without clearing these, the first character fed after a reset could be
        # mistaken for a repetition of the last character of the previous run.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Weighted share of punctuation/symbols, or 0.0 when below 0.3."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
110
+
111
+
112
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Detect an abnormal share of accentuated letters among alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters are examined."""
        return character.isalpha()

    def feed(self, character: str) -> None:
        """Count the letter and, separately, whether it is accentuated."""
        self._character_count += 1
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Forget everything counted so far."""
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of accentuated letters. Reported as 0.0 while fewer than
        8 letters were seen or while the share stays under 0.35.
        """
        seen = self._character_count
        if seen >= 8:
            share: float = self._accentuated_count / seen
            if share >= 0.35:
                return share
        return 0.0
137
+
138
+
139
class UnprintablePlugin(MessDetectorPlugin):
    """
    Detect unprintable characters. Each one weighs 8 times a regular
    character in the resulting ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is examined."""
        return True

    def feed(self, character: str) -> None:
        """Count the character and, separately, whether it is unprintable."""
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Restore every field to the value set by __init__."""
        self._unprintable_count = 0
        # The total must be cleared too, otherwise characters seen before the
        # reset keep diluting the ratio of the next run.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Weighted share of unprintable characters (0.0 when nothing was fed)."""
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
161
+
162
+
163
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Detect two accentuated Latin letters following each other.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only alphabetic Latin characters are examined."""
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """
        Compare the letter with the previously fed one and add a penalty
        when both are accentuated.
        """
        self._character_count += 1

        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            penalty = 0
            if character.isupper() and previous.isupper():
                penalty += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(previous):
                penalty += 1
            self._successive_count += penalty

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """Forget everything seen so far."""
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """Twice the penalty total over the letters seen (0.0 when none)."""
        seen = self._character_count
        if seen == 0:
            return 0.0

        return (self._successive_count * 2) / seen
198
+
199
+
200
class SuspiciousRange(MessDetectorPlugin):
    """
    Detect adjacent printable characters whose unicode ranges are judged
    suspicious by is_suspiciously_successive_range().
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only printable characters are examined."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Compare the unicode range of the character with the one of the
        previously kept character.
        """
        self._character_count += 1

        breaks_the_pair = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_the_pair:
            # Nothing is compared across whitespace, punctuation or safe ASCII signs.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is not None and is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """Forget everything seen so far."""
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Twice the suspicious pair count over the characters seen.
        Reported as 0.0 until more than 24 characters were fed.
        """
        seen = self._character_count
        if seen <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / seen
247
+
248
+
249
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Detect unlikely words. Characters are buffered until a whitespace,
    punctuation or separator closes the word; the closed word is then judged
    and, when bad, its length is added to the bad character total.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is examined."""
        return True

    def feed(self, character: str) -> None:
        """
        Extend the current word with the character, or close and judge the
        word when the character is a whitespace, punctuation or separator.
        """
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Watch the word when it holds a letter that is non-Latin or accentuated
            # and belongs to none of the CJK, Hangul, Katakana, Hiragana or Thai sets.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # Anything below only matters once a word has been started.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and not all(letter.isupper() for letter in self._buffer)
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if buffer_length >= 24 and self._foreign_long_watch:
                # A long watched word is tolerated only when it looks camel cased:
                # at least one upper case letter, at most 30% of them.
                upper_count = sum(1 for letter in self._buffer if letter.isupper())
                probable_camel_cased: bool = (
                    upper_count > 0 and upper_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            # Start over for the next word.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol inside a word taints it and is kept as part of the word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """Restore every field to the value set by __init__."""
        self._buffer = ""
        # The accent counter belongs to the buffer: clear both together so the
        # first word after a reset is not judged with stale accents.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of characters that belong to bad words. Reported as 0.0 while
        at most 10 words were closed and no foreign-long word was found.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
352
+
353
+
354
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the
    content does not fit, which is easy to spot: look for the overuse of
    '丅' and '丄' compared to the other CJK characters.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is examined."""
        return True

    def feed(self, character: str) -> None:
        """Count wrong stops; any other CJK character goes to the CJK total."""
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Forget everything counted so far."""
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    @property
    def ratio(self) -> float:
        """
        Wrong stops over the other CJK characters. Reported as 0.0 while
        fewer than 16 of the latter were seen.
        """
        cjk_total = self._cjk_character_count
        return self._wrong_stop_count / cjk_total if cjk_total >= 16 else 0.0
383
+
384
+
385
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect chunks of letters that keep alternating between upper and lower
    case. A chunk is closed by a character that is not a case-variable letter.
    """

    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """Every character is examined."""
        return True

    def feed(self, character: str) -> None:
        """
        Either close the current chunk (keeping or dropping its score) or
        compare the case of the character with the previously fed one.
        """
        cased_letter = character.isalpha() and is_case_variable(character)

        if not cased_letter and self._character_count_since_last_sep > 0:
            # The chunk score is kept only for chunks of at most 64 characters,
            # not closed by a digit, that held at least one non-ASCII character.
            keep_chunk_score = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and not self._current_ascii_only
            )
            if keep_chunk_score:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a new chunk; the closing character itself is still counted.
            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._current_ascii_only = True
            self._last_alpha_seen = None
            self._buf = False

            return

        if not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if not case_flipped:
                self._buf = False
            elif self._buf:
                # Second flip in a row: score it and wait for a new first flip.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                # First flip: only remembered.
                self._buf = True

        self._last_alpha_seen = character
        self._character_count_since_last_sep += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Restore every field to the value set by __init__."""
        self._buf = False
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count = 0
        self._last_alpha_seen = None
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Kept chunk scores over the characters seen (0.0 when none)."""
        seen = self._character_count
        if seen == 0:
            return 0.0

        return self._successive_upper_lower_count_final / seen
459
+
460
+
461
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Detector measuring how many Arabic characters are isolated forms."""

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only Arabic characters are fed to this plugin."""
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """Count the character, and count it again if it is an isolated form."""
        self._character_count += 1
        self._isolated_form_count += 1 if is_arabic_isolated_form(character) else 0

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._isolated_form_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Isolated forms per fed character; 0.0 below 8 characters."""
        if self._character_count >= 8:
            isolated_form_usage: float = (
                self._isolated_form_count / self._character_count
            )
            return isolated_form_usage

        return 0.0
487
+
488
+
489
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether seeing two Unicode ranges side by side should be considered suspicious.

    An unknown range (None) on either side is always suspicious. Otherwise a
    series of exceptions (same range, Latin pairs, emoticons, combining marks,
    shared primary keyword, Japanese/Korean/Chinese mixes) clears the pair.
    """
    range_a, range_b = unicode_range_a, unicode_range_b

    if range_a is None or range_b is None:
        return True
    if range_a == range_b:
        return False

    def in_either(keyword: str) -> bool:
        return keyword in range_a or keyword in range_b

    latin_in_a = "Latin" in range_a
    latin_in_b = "Latin" in range_b

    if latin_in_a and latin_in_b:
        return False
    if in_either("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_in_a or latin_in_b) and in_either("Combining"):
        return False

    # Two ranges sharing a non-secondary keyword belong together.
    words_b = range_b.split(" ")
    if any(
        word in words_b
        for word in range_a.split(" ")
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD
    ):
        return False

    # Japanese Exception
    kana_ranges = ("Hiragana", "Katakana")
    a_is_kana = range_a in kana_ranges
    b_is_kana = range_b in kana_ranges
    has_cjk = in_either("CJK")
    touches_basic_latin = "Basic Latin" in (range_a, range_b)

    if (a_is_kana or b_is_kana) and has_cjk:
        return False
    if a_is_kana and b_is_kana:
        return False

    if in_either("Hangul") and (has_cjk or touches_basic_latin):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if has_cjk or (a_is_kana and b_is_kana):
        if in_either("Punctuation") or in_either("Forms") or touches_basic_latin:
            return False

    return True
560
+
561
+
562
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of a decoded sequence.

    One instance of every MessDetectorPlugin subclass is fed each character of
    the sequence (plus a trailing newline). The summed plugin ratios are
    re-evaluated at regular checkpoints and the scan stops early as soon as
    the sum reaches maximum_threshold. The result is rounded to 3 decimals.
    """
    detectors: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    mean_mess_ratio: float = 0.0

    # Checkpoint spacing grows with the size of the input.
    checkpoint_step: int
    if length < 512:
        checkpoint_step = 32
    elif length <= 1024:
        checkpoint_step = 64
    else:
        checkpoint_step = 128

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_step == 0
        if not (at_checkpoint or index == last_index):
            continue

        mean_mess_ratio = sum(dt.ratio for dt in detectors)
        if mean_mess_ratio >= maximum_threshold:
            break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
lib/python3.11/site-packages/charset_normalizer/md__mypyc.cpython-311-darwin.so ADDED
Binary file (233 kB). View file
 
lib/python3.11/site-packages/charset_normalizer/models.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from encodings.aliases import aliases
2
+ from hashlib import sha256
3
+ from json import dumps
4
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
5
+
6
+ from .constant import TOO_BIG_SEQUENCE
7
+ from .utils import iana_name, is_multi_byte_encoding, unicode_range
8
+
9
+
10
class CharsetMatch:
    """
    One candidate decoding of a bytes payload.

    Holds the raw payload, the guessed encoding, the mess ("chaos") ratio, the
    language coherence matches and, lazily, the decoded string. Other matches
    that were registered through ``add_submatch`` are kept as leaves.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: original, untouched bytes.
        :param guessed_encoding: name of the encoding used to decode the payload.
        :param mean_mess_ratio: value exposed by the ``chaos`` property.
        :param has_sig_or_bom: value exposed by ``bom`` / ``byte_order_mark``.
        :param languages: (language, score) pairs; the first one is the reference.
        :param decoded_payload: already decoded text, when available. When None,
            the payload is decoded on first ``str()`` call.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last target encoding and its re-encoded bytes.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both encoding and fingerprint are equal.

        :raises TypeError: when ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: when ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio of decoded characters to raw bytes.

        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a leaf of this one.

        :raises ValueError: when ``other`` is not a CharsetMatch or equals self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Name of the guessed encoding."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Value of the ``has_sig_or_bom`` constructor argument."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as ``bom``."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """Mean mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Score of the first language match, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """``chaos`` times 100, rounded to 3 decimals."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """``coherence`` times 100, rounded to 3 decimals."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Matches registered through ``add_submatch``."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """True when at least one submatch was registered."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """Sorted, de-duplicated Unicode range names of the decoded text (cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the
        encoder (error handler "replace"); they are neither dropped nor raised on.
        The result is cached for the last requested encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
223
+
224
+
225
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        self._results: List[CharsetMatch] = []
        if results:
            self._results = sorted(results)

    def __iter__(self) -> Iterator[CharsetMatch]:
        for match in self._results:
            yield match

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        An out-of-range position raises IndexError (from the underlying list);
        an encoding not present in results, or any other key type, raises KeyError.
        """
        if isinstance(item, int):
            return self._results[item]

        if isinstance(item, str):
            wanted = iana_name(item, False)
            found = next(
                (
                    candidate
                    for candidate in self._results
                    if wanted in candidate.could_be_from_charset
                ),
                None,
            )
            if found is not None:
                return found

        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )

        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        submatch_factoring_allowed = len(item.raw) <= TOO_BIG_SEQUENCE
        if submatch_factoring_allowed:
            for existing in self._results:
                same_text = existing.fingerprint == item.fingerprint
                if same_text and existing.chaos == item.chaos:
                    existing.add_submatch(item)
                    return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
290
+
291
+
292
# One language match: (language name, score). CharsetMatch reads index 0 as
# the language and index 1 as the coherence value.
CoherenceMatch = Tuple[str, float]
# List of language matches; CharsetMatch treats the first entry as the reference.
CoherenceMatches = List[CoherenceMatch]
294
+
295
+
296
class CliDetectionResult:
    """Plain record describing one detection result, serialisable to JSON."""

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        # File identification.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path

        # Encoding findings.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.has_sig_or_bom: bool = has_sig_or_bom

        # Content findings.
        self.language: str = language
        self.alphabets: List[str] = alphabets

        # Scores and ranking flag.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """Mapping of every field, in the fixed order used for JSON output."""
        ordered_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in ordered_fields}

    def to_json(self) -> str:
        """Serialise the field mapping as ASCII-only JSON indented by 4 spaces."""
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
lib/python3.11/site-packages/charset_normalizer/py.typed ADDED
File without changes
lib/python3.11/site-packages/charset_normalizer/utils.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import logging
3
+ import unicodedata
4
+ from codecs import IncrementalDecoder
5
+ from encodings.aliases import aliases
6
+ from functools import lru_cache
7
+ from re import findall
8
+ from typing import Generator, List, Optional, Set, Tuple, Union
9
+
10
+ from _multibytecodec import MultibyteIncrementalDecoder
11
+
12
+ from .constant import (
13
+ ENCODING_MARKS,
14
+ IANA_SUPPORTED_SIMILAR,
15
+ RE_POSSIBLE_ENCODING_INDICATION,
16
+ UNICODE_RANGES_COMBINED,
17
+ UNICODE_SECONDARY_RANGE_KEYWORD,
18
+ UTF8_MAXIMAL_ALLOCATION,
19
+ )
20
+
21
+
22
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """Tell whether the character's Unicode name mentions one of the known accents."""
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        # Unnamed code point.
        return False

    accent_markers = (
        "WITH GRAVE",
        "WITH ACUTE",
        "WITH CEDILLA",
        "WITH DIAERESIS",
        "WITH CIRCUMFLEX",
        "WITH TILDE",
        "WITH MACRON",
        "WITH RING ABOVE",
    )
    return any(marker in description for marker in accent_markers)
38
+
39
+
40
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the first code point of the character's Unicode decomposition.

    A character without decomposition is returned unchanged.
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    # Compatibility decompositions are prefixed with a formatting tag such as
    # "<compat>" or "<super>"; the tag is not a hexadecimal code point and
    # would make int(..., 16) raise ValueError, so it is skipped.
    codes: List[str] = [
        code for code in decomposed.split(" ") if code and not code.startswith("<")
    ]
    if not codes:
        return character

    return chr(int(codes[0], 16))
49
+
50
+
51
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Look up the name of the Unicode range containing a single character.

    Returns None when no entry of UNICODE_RANGES_COMBINED contains it.
    """
    code_point: int = ord(character)

    return next(
        (
            range_name
            for range_name, ord_range in UNICODE_RANGES_COMBINED.items()
            if code_point in ord_range
        ),
        None,
    )
63
+
64
+
65
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """True when the character's Unicode name contains "LATIN"."""
    # The empty default stands in for unnamed code points, which never match.
    return "LATIN" in unicodedata.name(character, "")
72
+
73
+
74
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    True when the general category contains "P", or when the character's
    Unicode range name contains "Punctuation".
    """
    if "P" in unicodedata.category(character):
        return True

    block_name: Optional[str] = unicode_range(character)

    return block_name is not None and "Punctuation" in block_name
87
+
88
+
89
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    True when the general category contains "S" or "N", or when the character
    sits in a "Forms" range without being of category "Lo".
    """
    general_category: str = unicodedata.category(character)

    if {"S", "N"} & set(general_category):
        return True

    block_name: Optional[str] = unicode_range(character)
    if block_name is None:
        return False

    return general_category != "Lo" and "Forms" in block_name
102
+
103
+
104
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """True when the character's range name contains "Emoticons" or "Pictographs"."""
    block_name: Optional[str] = unicode_range(character)

    if block_name is None:
        return False
    if "Emoticons" in block_name:
        return True
    return "Pictographs" in block_name
112
+
113
+
114
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    True for whitespace, for one of | + < >, for any "Z" general category and
    for the punctuation categories Po, Pd and Pc.
    """
    if character.isspace():
        return True
    if character in {"|", "+", "<", ">"}:
        return True

    general_category: str = unicodedata.category(character)

    if general_category in {"Po", "Pd", "Pc"}:
        return True
    return "Z" in general_category
122
+
123
+
124
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """True when exactly one of islower() / isupper() holds for the character."""
    lower, upper = character.islower(), character.isupper()
    # XOR of two bools: differs from True only when both agree.
    return lower ^ upper
127
+
128
+
129
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """True when the character's Unicode name contains "CJK"."""
    # The empty default stands in for unnamed code points, which never match.
    return "CJK" in unicodedata.name(character, "")
137
+
138
+
139
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """True when the character's Unicode name contains "HIRAGANA"."""
    # The empty default stands in for unnamed code points, which never match.
    return "HIRAGANA" in unicodedata.name(character, "")
147
+
148
+
149
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """True when the character's Unicode name contains "KATAKANA"."""
    # The empty default stands in for unnamed code points, which never match.
    return "KATAKANA" in unicodedata.name(character, "")
157
+
158
+
159
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """True when the character's Unicode name contains "HANGUL"."""
    # The empty default stands in for unnamed code points, which never match.
    return "HANGUL" in unicodedata.name(character, "")
167
+
168
+
169
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """True when the character's Unicode name contains "THAI"."""
    # The empty default stands in for unnamed code points, which never match.
    return "THAI" in unicodedata.name(character, "")
177
+
178
+
179
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """True when the character's Unicode name contains "ARABIC"."""
    # The empty default stands in for unnamed code points, which never match.
    return "ARABIC" in unicodedata.name(character, "")
187
+
188
+
189
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """True when the Unicode name contains both "ARABIC" and "ISOLATED FORM"."""
    # The empty default stands in for unnamed code points, which never match.
    character_name = unicodedata.name(character, "")

    if "ARABIC" not in character_name:
        return False
    return "ISOLATED FORM" in character_name
197
+
198
+
199
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """True when the range name contains any of the secondary-range keywords."""
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True
    return False
202
+
203
+
204
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    True for characters that are neither whitespace nor printable, with two
    explicit exceptions listed below.
    """
    # Whitespace includes \n \t \r \v
    if character.isspace() or character.isprintable():
        return False

    # "\x1A": Why? Its the ASCII substitute character.
    # "\ufeff": bug discovered in Python,
    # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    return character not in ("\x1A", "\ufeff")
213
+
214
+
215
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Search the first search_zone bytes, decoded as ASCII with errors ignored,
    for an encoding declaration and return the matching codec name.

    Returns None when nothing declared maps to a known alias or codec name.
    Raises TypeError when sequence is not bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    head: str = sequence[: min(len(sequence), search_zone)].decode(
        "ascii", errors="ignore"
    )
    declared: List[str] = findall(RE_POSSIBLE_ENCODING_INDICATION, head)

    for raw_name in declared:
        candidate = raw_name.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if candidate in (encoding_alias, encoding_iana):
                return encoding_iana

    return None
245
+
246
+
247
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether the encoding with this codec module name is a multi byte one.

    The UTF family is listed explicitly; any other name is resolved to its
    ``encodings`` module and checked against MultibyteIncrementalDecoder.
    """
    unicode_transformation_formats = {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }
    if name in unicode_transformation_formats:
        return True

    codec_module = importlib.import_module("encodings.{}".format(name))
    return issubclass(codec_module.IncrementalDecoder, MultibyteIncrementalDecoder)
266
+
267
+
268
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Find which known SIG/BOM, if any, the sequence starts with.

    Returns (encoding name, mark bytes), or (None, b"") when nothing matches.
    """
    for iana_encoding, registered in ENCODING_MARKS.items():
        # An entry is either a single mark or a list of alternative marks.
        candidates: List[bytes] = (
            [registered] if isinstance(registered, bytes) else registered
        )

        for mark in candidates:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
284
+
285
+
286
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """False for "utf_16" and "utf_32", True for every other encoding name."""
    return iana_encoding != "utf_16" and iana_encoding != "utf_32"
288
+
289
+
290
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Resolve a code page name or alias to the codec name from encodings.aliases.

    The name is lower-cased and dashes become underscores before the lookup.
    When nothing matches, the normalized name is returned as-is unless strict
    is set, in which case ValueError is raised.
    """
    normalized = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if normalized == encoding_alias or normalized == encoding_iana:
            return encoding_iana

    if not strict:
        return normalized

    raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))
304
+
305
+
306
+ def range_scan(decoded_sequence: str) -> List[str]:
307
+ ranges: Set[str] = set()
308
+
309
+ for character in decoded_sequence:
310
+ character_range: Optional[str] = unicode_range(character)
311
+
312
+ if character_range is None:
313
+ continue
314
+
315
+ ranges.add(character_range)
316
+
317
+ return list(ranges)
318
+
319
+
320
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Fraction of the 256 single-byte values that both code pages decode identically.

    Returns 0.0 as soon as either encoding is a multi byte one. The result is
    always within [0.0, 1.0]; two identical code pages score exactly 1.0.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    # Undecodable bytes are dropped rather than raising.
    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    # Every possible byte value is compared (0x00..0xFF) and the divisor
    # matches the number of values tested, so the ratio cannot exceed 1.0.
    byte_value_count: int = 256
    character_match_count: int = 0

    for i in range(byte_value_count):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / byte_value_count
342
+
343
+
344
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False

    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
353
+
354
+
355
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Set the level of the named logger and attach a new stream handler to it.

    Each call adds one more handler using the given format string.
    """
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))

    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)
    target_logger.addHandler(stream_handler)
366
+
367
+
368
def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Yield decoded text chunks of the payload, one per offset.

    :param sequences: raw bytes to cut.
    :param encoding_iana: codec name used to decode each cut.
    :param offsets: start positions of the chunks.
    :param chunk_size: length of each cut.
    :param bom_or_sig_available: together with ``strip_sig_or_bom`` being False,
        makes ``sig_payload`` get prepended to every cut before decoding.
    :param strip_sig_or_bom: see ``bom_or_sig_available``.
    :param sig_payload: bytes prepended to a cut when the two flags above require it.
    :param is_multi_byte_decoder: selects lenient decoding and the realignment logic.
    :param decoded_payload: already decoded text, when available.
    """
    # Fast path: with a decoded payload and a single-byte decoder, the chunks
    # are sliced straight from the decoded text.
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            # Skip offsets whose chunk would end more than 8 bytes past the payload.
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            # Multi-byte decoders ignore errors (a cut may split a character);
            # single-byte decoders stay strict.
            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                # If the head of the chunk cannot be found in the decoded
                # payload, retry with the start moved back by 0 to 3 bytes
                # until the head is found. When no retry succeeds, the chunk
                # from the last retry is the one yielded.
                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
lib/python3.11/site-packages/charset_normalizer/version.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ Expose version
3
+ """
4
+
5
+ __version__ = "3.3.2"
6
+ VERSION = __version__.split(".")
lib/python3.11/site-packages/distutils-precedence.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2638ce9e2500e572a5e0de7faed6661eb569d1b696fcba07b0dd223da5f5d224
3
+ size 151
lib/python3.11/site-packages/filelock/__init__.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A platform independent file lock that supports the with-statement.
3
+
4
+ .. autodata:: filelock.__version__
5
+ :no-value:
6
+
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import sys
11
+ import warnings
12
+ from typing import TYPE_CHECKING
13
+
14
+ from ._api import AcquireReturnProxy, BaseFileLock
15
+ from ._error import Timeout
16
+ from ._soft import SoftFileLock
17
+ from ._unix import UnixFileLock, has_fcntl
18
+ from ._windows import WindowsFileLock
19
+ from .version import version
20
+
21
+ #: version of the project as a string
22
+ __version__: str = version
23
+
24
+
25
+ if sys.platform == "win32": # pragma: win32 cover
26
+ _FileLock: type[BaseFileLock] = WindowsFileLock
27
+ else: # pragma: win32 no cover # noqa: PLR5501
28
+ if has_fcntl:
29
+ _FileLock: type[BaseFileLock] = UnixFileLock
30
+ else:
31
+ _FileLock = SoftFileLock
32
+ if warnings is not None:
33
+ warnings.warn("only soft file lock is available", stacklevel=2)
34
+
35
+ if TYPE_CHECKING:
36
+ FileLock = SoftFileLock
37
+ else:
38
+ #: Alias for the lock, which should be used for the current platform.
39
+ FileLock = _FileLock
40
+
41
+
42
+ __all__ = [
43
+ "__version__",
44
+ "FileLock",
45
+ "SoftFileLock",
46
+ "Timeout",
47
+ "UnixFileLock",
48
+ "WindowsFileLock",
49
+ "BaseFileLock",
50
+ "AcquireReturnProxy",
51
+ ]
lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.45 kB). View file
 
lib/python3.11/site-packages/filelock/__pycache__/_api.cpython-311.pyc ADDED
Binary file (14.6 kB). View file
 
lib/python3.11/site-packages/filelock/__pycache__/_error.cpython-311.pyc ADDED
Binary file (1.98 kB). View file
 
lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc ADDED
Binary file (2.73 kB). View file