6ffebc3d3f8eb79ecf86f0d3426ce9146bd32b3e8e6f4553883490b0176b1deb
Browse files- lib/python3.11/site-packages/certifi-2023.11.17.dist-info/INSTALLER +1 -0
- lib/python3.11/site-packages/certifi-2023.11.17.dist-info/LICENSE +20 -0
- lib/python3.11/site-packages/certifi-2023.11.17.dist-info/METADATA +66 -0
- lib/python3.11/site-packages/certifi-2023.11.17.dist-info/RECORD +14 -0
- lib/python3.11/site-packages/certifi-2023.11.17.dist-info/WHEEL +5 -0
- lib/python3.11/site-packages/certifi-2023.11.17.dist-info/top_level.txt +1 -0
- lib/python3.11/site-packages/certifi/__pycache__/core.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/certifi/cacert.pem +0 -0
- lib/python3.11/site-packages/certifi/core.py +108 -0
- lib/python3.11/site-packages/certifi/py.typed +0 -0
- lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/INSTALLER +1 -0
- lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/LICENSE +21 -0
- lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/METADATA +683 -0
- lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/RECORD +35 -0
- lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/WHEEL +5 -0
- lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/entry_points.txt +2 -0
- lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/top_level.txt +1 -0
- lib/python3.11/site-packages/charset_normalizer/__init__.py +46 -0
- lib/python3.11/site-packages/charset_normalizer/__main__.py +4 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/__init__.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/__main__.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/api.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/cd.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/constant.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/legacy.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/md.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/models.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/utils.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/__pycache__/version.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/api.py +626 -0
- lib/python3.11/site-packages/charset_normalizer/cd.py +395 -0
- lib/python3.11/site-packages/charset_normalizer/cli/__init__.py +6 -0
- lib/python3.11/site-packages/charset_normalizer/cli/__main__.py +296 -0
- lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/charset_normalizer/constant.py +1995 -0
- lib/python3.11/site-packages/charset_normalizer/legacy.py +54 -0
- lib/python3.11/site-packages/charset_normalizer/md.cpython-311-darwin.so +0 -0
- lib/python3.11/site-packages/charset_normalizer/md.py +615 -0
- lib/python3.11/site-packages/charset_normalizer/md__mypyc.cpython-311-darwin.so +0 -0
- lib/python3.11/site-packages/charset_normalizer/models.py +340 -0
- lib/python3.11/site-packages/charset_normalizer/py.typed +0 -0
- lib/python3.11/site-packages/charset_normalizer/utils.py +421 -0
- lib/python3.11/site-packages/charset_normalizer/version.py +6 -0
- lib/python3.11/site-packages/distutils-precedence.pth +3 -0
- lib/python3.11/site-packages/filelock/__init__.py +51 -0
- lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/filelock/__pycache__/_api.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/filelock/__pycache__/_error.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc +0 -0
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pip
|
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
This package contains a modified version of ca-bundle.crt:
|
2 |
+
|
3 |
+
ca-bundle.crt -- Bundle of CA Root Certificates
|
4 |
+
|
5 |
+
This is a bundle of X.509 certificates of public Certificate Authorities
|
6 |
+
(CA). These were automatically extracted from Mozilla's root certificates
|
7 |
+
file (certdata.txt). This file can be found in the mozilla source tree:
|
8 |
+
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
9 |
+
It contains the certificates in PEM format and therefore
|
10 |
+
can be directly used with curl / libcurl / php_curl, or with
|
11 |
+
an Apache+mod_ssl webserver for SSL client authentication.
|
12 |
+
Just configure this file as the SSLCACertificateFile.#
|
13 |
+
|
14 |
+
***** BEGIN LICENSE BLOCK *****
|
15 |
+
This Source Code Form is subject to the terms of the Mozilla Public License,
|
16 |
+
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
17 |
+
one at http://mozilla.org/MPL/2.0/.
|
18 |
+
|
19 |
+
***** END LICENSE BLOCK *****
|
20 |
+
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/METADATA
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: certifi
|
3 |
+
Version: 2023.11.17
|
4 |
+
Summary: Python package for providing Mozilla's CA Bundle.
|
5 |
+
Home-page: https://github.com/certifi/python-certifi
|
6 |
+
Author: Kenneth Reitz
|
7 |
+
Author-email: [email protected]
|
8 |
+
License: MPL-2.0
|
9 |
+
Project-URL: Source, https://github.com/certifi/python-certifi
|
10 |
+
Classifier: Development Status :: 5 - Production/Stable
|
11 |
+
Classifier: Intended Audience :: Developers
|
12 |
+
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
13 |
+
Classifier: Natural Language :: English
|
14 |
+
Classifier: Programming Language :: Python
|
15 |
+
Classifier: Programming Language :: Python :: 3
|
16 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
17 |
+
Classifier: Programming Language :: Python :: 3.6
|
18 |
+
Classifier: Programming Language :: Python :: 3.7
|
19 |
+
Classifier: Programming Language :: Python :: 3.8
|
20 |
+
Classifier: Programming Language :: Python :: 3.9
|
21 |
+
Classifier: Programming Language :: Python :: 3.10
|
22 |
+
Classifier: Programming Language :: Python :: 3.11
|
23 |
+
Requires-Python: >=3.6
|
24 |
+
License-File: LICENSE
|
25 |
+
|
26 |
+
Certifi: Python SSL Certificates
|
27 |
+
================================
|
28 |
+
|
29 |
+
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
30 |
+
validating the trustworthiness of SSL certificates while verifying the identity
|
31 |
+
of TLS hosts. It has been extracted from the `Requests`_ project.
|
32 |
+
|
33 |
+
Installation
|
34 |
+
------------
|
35 |
+
|
36 |
+
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
37 |
+
|
38 |
+
$ pip install certifi
|
39 |
+
|
40 |
+
Usage
|
41 |
+
-----
|
42 |
+
|
43 |
+
To reference the installed certificate authority (CA) bundle, you can use the
|
44 |
+
built-in function::
|
45 |
+
|
46 |
+
>>> import certifi
|
47 |
+
|
48 |
+
>>> certifi.where()
|
49 |
+
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
50 |
+
|
51 |
+
Or from the command line::
|
52 |
+
|
53 |
+
$ python -m certifi
|
54 |
+
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
55 |
+
|
56 |
+
Enjoy!
|
57 |
+
|
58 |
+
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
59 |
+
|
60 |
+
Addition/Removal of Certificates
|
61 |
+
--------------------------------
|
62 |
+
|
63 |
+
Certifi does not support any addition/removal or other modification of the
|
64 |
+
CA trust store content. This project is intended to provide a reliable and
|
65 |
+
highly portable root of trust to python deployments. Look to upstream projects
|
66 |
+
for methods to use alternate trust.
|
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
certifi-2023.11.17.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
2 |
+
certifi-2023.11.17.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
3 |
+
certifi-2023.11.17.dist-info/METADATA,sha256=P7BMxvbKUZTP20mLy_wc2atkEPFNVqElEzV6Mhaj3Zc,2172
|
4 |
+
certifi-2023.11.17.dist-info/RECORD,,
|
5 |
+
certifi-2023.11.17.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
|
6 |
+
certifi-2023.11.17.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
7 |
+
certifi/__init__.py,sha256=oYZVbNEJ66LQQamFRyuICe6FoYDmkY4j4fKEyO9D96c,94
|
8 |
+
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
9 |
+
certifi/__pycache__/__init__.cpython-311.pyc,,
|
10 |
+
certifi/__pycache__/__main__.cpython-311.pyc,,
|
11 |
+
certifi/__pycache__/core.cpython-311.pyc,,
|
12 |
+
certifi/cacert.pem,sha256=z503-oFAev4R3MDXD-YCVhQiqiNEcIwyTkUE24xsV0g,290282
|
13 |
+
certifi/core.py,sha256=lhewz0zFb2b4ULsQurElmloYwQoecjWzPqY67P8T7iM,4219
|
14 |
+
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Wheel-Version: 1.0
|
2 |
+
Generator: bdist_wheel (0.41.3)
|
3 |
+
Root-Is-Purelib: true
|
4 |
+
Tag: py3-none-any
|
5 |
+
|
lib/python3.11/site-packages/certifi-2023.11.17.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
certifi
|
lib/python3.11/site-packages/certifi/__pycache__/core.cpython-311.pyc
ADDED
Binary file (3.37 kB). View file
|
|
lib/python3.11/site-packages/certifi/cacert.pem
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lib/python3.11/site-packages/certifi/core.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
certifi.py
|
3 |
+
~~~~~~~~~~
|
4 |
+
|
5 |
+
This module returns the installation location of cacert.pem or its contents.
|
6 |
+
"""
|
7 |
+
import sys
|
8 |
+
|
9 |
+
|
10 |
+
if sys.version_info >= (3, 11):
|
11 |
+
|
12 |
+
from importlib.resources import as_file, files
|
13 |
+
|
14 |
+
_CACERT_CTX = None
|
15 |
+
_CACERT_PATH = None
|
16 |
+
|
17 |
+
def where() -> str:
|
18 |
+
# This is slightly terrible, but we want to delay extracting the file
|
19 |
+
# in cases where we're inside of a zipimport situation until someone
|
20 |
+
# actually calls where(), but we don't want to re-extract the file
|
21 |
+
# on every call of where(), so we'll do it once then store it in a
|
22 |
+
# global variable.
|
23 |
+
global _CACERT_CTX
|
24 |
+
global _CACERT_PATH
|
25 |
+
if _CACERT_PATH is None:
|
26 |
+
# This is slightly janky, the importlib.resources API wants you to
|
27 |
+
# manage the cleanup of this file, so it doesn't actually return a
|
28 |
+
# path, it returns a context manager that will give you the path
|
29 |
+
# when you enter it and will do any cleanup when you leave it. In
|
30 |
+
# the common case of not needing a temporary file, it will just
|
31 |
+
# return the file system location and the __exit__() is a no-op.
|
32 |
+
#
|
33 |
+
# We also have to hold onto the actual context manager, because
|
34 |
+
# it will do the cleanup whenever it gets garbage collected, so
|
35 |
+
# we will also store that at the global level as well.
|
36 |
+
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
37 |
+
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
38 |
+
|
39 |
+
return _CACERT_PATH
|
40 |
+
|
41 |
+
def contents() -> str:
|
42 |
+
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
43 |
+
|
44 |
+
elif sys.version_info >= (3, 7):
|
45 |
+
|
46 |
+
from importlib.resources import path as get_path, read_text
|
47 |
+
|
48 |
+
_CACERT_CTX = None
|
49 |
+
_CACERT_PATH = None
|
50 |
+
|
51 |
+
def where() -> str:
|
52 |
+
# This is slightly terrible, but we want to delay extracting the
|
53 |
+
# file in cases where we're inside of a zipimport situation until
|
54 |
+
# someone actually calls where(), but we don't want to re-extract
|
55 |
+
# the file on every call of where(), so we'll do it once then store
|
56 |
+
# it in a global variable.
|
57 |
+
global _CACERT_CTX
|
58 |
+
global _CACERT_PATH
|
59 |
+
if _CACERT_PATH is None:
|
60 |
+
# This is slightly janky, the importlib.resources API wants you
|
61 |
+
# to manage the cleanup of this file, so it doesn't actually
|
62 |
+
# return a path, it returns a context manager that will give
|
63 |
+
# you the path when you enter it and will do any cleanup when
|
64 |
+
# you leave it. In the common case of not needing a temporary
|
65 |
+
# file, it will just return the file system location and the
|
66 |
+
# __exit__() is a no-op.
|
67 |
+
#
|
68 |
+
# We also have to hold onto the actual context manager, because
|
69 |
+
# it will do the cleanup whenever it gets garbage collected, so
|
70 |
+
# we will also store that at the global level as well.
|
71 |
+
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
72 |
+
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
73 |
+
|
74 |
+
return _CACERT_PATH
|
75 |
+
|
76 |
+
def contents() -> str:
|
77 |
+
return read_text("certifi", "cacert.pem", encoding="ascii")
|
78 |
+
|
79 |
+
else:
|
80 |
+
import os
|
81 |
+
import types
|
82 |
+
from typing import Union
|
83 |
+
|
84 |
+
Package = Union[types.ModuleType, str]
|
85 |
+
Resource = Union[str, "os.PathLike"]
|
86 |
+
|
87 |
+
# This fallback will work for Python versions prior to 3.7 that lack the
|
88 |
+
# importlib.resources module but relies on the existing `where` function
|
89 |
+
# so won't address issues with environments like PyOxidizer that don't set
|
90 |
+
# __file__ on modules.
|
91 |
+
def read_text(
|
92 |
+
package: Package,
|
93 |
+
resource: Resource,
|
94 |
+
encoding: str = 'utf-8',
|
95 |
+
errors: str = 'strict'
|
96 |
+
) -> str:
|
97 |
+
with open(where(), encoding=encoding) as data:
|
98 |
+
return data.read()
|
99 |
+
|
100 |
+
# If we don't have importlib.resources, then we will just do the old logic
|
101 |
+
# of assuming we're on the filesystem and munge the path directly.
|
102 |
+
def where() -> str:
|
103 |
+
f = os.path.dirname(__file__)
|
104 |
+
|
105 |
+
return os.path.join(f, "cacert.pem")
|
106 |
+
|
107 |
+
def contents() -> str:
|
108 |
+
return read_text("certifi", "cacert.pem", encoding="ascii")
|
lib/python3.11/site-packages/certifi/py.typed
ADDED
File without changes
|
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pip
|
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2019 TAHRI Ahmed R.
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/METADATA
ADDED
@@ -0,0 +1,683 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: charset-normalizer
|
3 |
+
Version: 3.3.2
|
4 |
+
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
5 |
+
Home-page: https://github.com/Ousret/charset_normalizer
|
6 |
+
Author: Ahmed TAHRI
|
7 |
+
Author-email: [email protected]
|
8 |
+
License: MIT
|
9 |
+
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
|
10 |
+
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
|
11 |
+
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
12 |
+
Classifier: Development Status :: 5 - Production/Stable
|
13 |
+
Classifier: License :: OSI Approved :: MIT License
|
14 |
+
Classifier: Intended Audience :: Developers
|
15 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
16 |
+
Classifier: Operating System :: OS Independent
|
17 |
+
Classifier: Programming Language :: Python
|
18 |
+
Classifier: Programming Language :: Python :: 3
|
19 |
+
Classifier: Programming Language :: Python :: 3.7
|
20 |
+
Classifier: Programming Language :: Python :: 3.8
|
21 |
+
Classifier: Programming Language :: Python :: 3.9
|
22 |
+
Classifier: Programming Language :: Python :: 3.10
|
23 |
+
Classifier: Programming Language :: Python :: 3.11
|
24 |
+
Classifier: Programming Language :: Python :: 3.12
|
25 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
26 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
27 |
+
Classifier: Topic :: Utilities
|
28 |
+
Classifier: Typing :: Typed
|
29 |
+
Requires-Python: >=3.7.0
|
30 |
+
Description-Content-Type: text/markdown
|
31 |
+
License-File: LICENSE
|
32 |
+
Provides-Extra: unicode_backport
|
33 |
+
|
34 |
+
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
35 |
+
|
36 |
+
<p align="center">
|
37 |
+
<sup>The Real First Universal Charset Detector</sup><br>
|
38 |
+
<a href="https://pypi.org/project/charset-normalizer">
|
39 |
+
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
40 |
+
</a>
|
41 |
+
<a href="https://pepy.tech/project/charset-normalizer/">
|
42 |
+
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
43 |
+
</a>
|
44 |
+
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
45 |
+
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
46 |
+
</a>
|
47 |
+
</p>
|
48 |
+
<p align="center">
|
49 |
+
<sup><i>Featured Packages</i></sup><br>
|
50 |
+
<a href="https://github.com/jawah/niquests">
|
51 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan">
|
52 |
+
</a>
|
53 |
+
<a href="https://github.com/jawah/wassima">
|
54 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
|
55 |
+
</a>
|
56 |
+
</p>
|
57 |
+
<p align="center">
|
58 |
+
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
59 |
+
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
60 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
61 |
+
</a>
|
62 |
+
</p>
|
63 |
+
|
64 |
+
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
65 |
+
> I'm trying to resolve the issue by taking a new approach.
|
66 |
+
> All IANA character set names for which the Python core library provides codecs are supported.
|
67 |
+
|
68 |
+
<p align="center">
|
69 |
+
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
70 |
+
</p>
|
71 |
+
|
72 |
+
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
73 |
+
|
74 |
+
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
75 |
+
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
76 |
+
| `Fast` | ❌ | ✅ | ✅ |
|
77 |
+
| `Universal**` | ❌ | ✅ | ❌ |
|
78 |
+
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
79 |
+
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
80 |
+
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
81 |
+
| `Native Python` | ✅ | ✅ | ❌ |
|
82 |
+
| `Detect spoken language` | ❌ | ✅ | N/A |
|
83 |
+
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
84 |
+
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
85 |
+
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
86 |
+
|
87 |
+
<p align="center">
|
88 |
+
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
89 |
+
</p>
|
90 |
+
|
91 |
+
*\*\* : They are clearly using specific code for a specific encoding, even if covering most of the used ones*<br>
|
92 |
+
Did you get here because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
|
93 |
+
|
94 |
+
## ⚡ Performance
|
95 |
+
|
96 |
+
This package offers better performance than its counterpart Chardet. Here are some numbers.
|
97 |
+
|
98 |
+
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
99 |
+
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
100 |
+
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
|
101 |
+
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
102 |
+
|
103 |
+
| Package | 99th percentile | 95th percentile | 50th percentile |
|
104 |
+
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
105 |
+
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
|
106 |
+
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
107 |
+
|
108 |
+
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
|
109 |
+
|
110 |
+
> Stats are generated from 400+ files using default parameters. For more details on the files used, see the GHA workflows.
|
111 |
+
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
112 |
+
> The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
|
113 |
+
> Keep in mind that the stats are generous and that Chardet's accuracy vs. ours is measured using Chardet's initial capability
|
114 |
+
> (eg. Supported Encoding) Challenge-them if you want.
|
115 |
+
|
116 |
+
## ✨ Installation
|
117 |
+
|
118 |
+
Using pip:
|
119 |
+
|
120 |
+
```sh
|
121 |
+
pip install charset-normalizer -U
|
122 |
+
```
|
123 |
+
|
124 |
+
## 🚀 Basic Usage
|
125 |
+
|
126 |
+
### CLI
|
127 |
+
This package comes with a CLI.
|
128 |
+
|
129 |
+
```
|
130 |
+
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
131 |
+
file [file ...]
|
132 |
+
|
133 |
+
The Real First Universal Charset Detector. Discover originating encoding used
|
134 |
+
on text file. Normalize text to unicode.
|
135 |
+
|
136 |
+
positional arguments:
|
137 |
+
files File(s) to be analysed
|
138 |
+
|
139 |
+
optional arguments:
|
140 |
+
-h, --help show this help message and exit
|
141 |
+
-v, --verbose Display complementary information about file if any.
|
142 |
+
Stdout will contain logs about the detection process.
|
143 |
+
-a, --with-alternative
|
144 |
+
Output complementary possibilities if any. Top-level
|
145 |
+
JSON WILL be a list.
|
146 |
+
-n, --normalize Permit to normalize input file. If not set, program
|
147 |
+
does not write anything.
|
148 |
+
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
149 |
+
JSON output.
|
150 |
+
-r, --replace Replace file when trying to normalize it instead of
|
151 |
+
creating a new one.
|
152 |
+
-f, --force Replace file without asking if you are sure, use this
|
153 |
+
flag with caution.
|
154 |
+
-t THRESHOLD, --threshold THRESHOLD
|
155 |
+
Define a custom maximum amount of chaos allowed in
|
156 |
+
decoded content. 0. <= chaos <= 1.
|
157 |
+
--version Show version information and exit.
|
158 |
+
```
|
159 |
+
|
160 |
+
```bash
|
161 |
+
normalizer ./data/sample.1.fr.srt
|
162 |
+
```
|
163 |
+
|
164 |
+
or
|
165 |
+
|
166 |
+
```bash
|
167 |
+
python -m charset_normalizer ./data/sample.1.fr.srt
|
168 |
+
```
|
169 |
+
|
170 |
+
🎉 Since version 1.4.0, the CLI produces an easily usable stdout result in JSON format.
|
171 |
+
|
172 |
+
```json
|
173 |
+
{
|
174 |
+
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
175 |
+
"encoding": "cp1252",
|
176 |
+
"encoding_aliases": [
|
177 |
+
"1252",
|
178 |
+
"windows_1252"
|
179 |
+
],
|
180 |
+
"alternative_encodings": [
|
181 |
+
"cp1254",
|
182 |
+
"cp1256",
|
183 |
+
"cp1258",
|
184 |
+
"iso8859_14",
|
185 |
+
"iso8859_15",
|
186 |
+
"iso8859_16",
|
187 |
+
"iso8859_3",
|
188 |
+
"iso8859_9",
|
189 |
+
"latin_1",
|
190 |
+
"mbcs"
|
191 |
+
],
|
192 |
+
"language": "French",
|
193 |
+
"alphabets": [
|
194 |
+
"Basic Latin",
|
195 |
+
"Latin-1 Supplement"
|
196 |
+
],
|
197 |
+
"has_sig_or_bom": false,
|
198 |
+
"chaos": 0.149,
|
199 |
+
"coherence": 97.152,
|
200 |
+
"unicode_path": null,
|
201 |
+
"is_preferred": true
|
202 |
+
}
|
203 |
+
```
|
204 |
+
|
205 |
+
### Python
|
206 |
+
*Just print out normalized text*
|
207 |
+
```python
|
208 |
+
from charset_normalizer import from_path
|
209 |
+
|
210 |
+
results = from_path('./my_subtitle.srt')
|
211 |
+
|
212 |
+
print(str(results.best()))
|
213 |
+
```
|
214 |
+
|
215 |
+
*Upgrade your code without effort*
|
216 |
+
```python
|
217 |
+
from charset_normalizer import detect
|
218 |
+
```
|
219 |
+
|
220 |
+
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
221 |
+
|
222 |
+
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
223 |
+
|
224 |
+
## 😇 Why
|
225 |
+
|
226 |
+
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
227 |
+
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
228 |
+
|
229 |
+
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
230 |
+
produce **two identical rendered strings.**
|
231 |
+
What I want is to get readable text, the best I can.
|
232 |
+
|
233 |
+
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
234 |
+
|
235 |
+
Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
|
236 |
+
|
237 |
+
## 🍰 How
|
238 |
+
|
239 |
+
- Discard all charset encoding tables that could not fit the binary content.
|
240 |
+
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
241 |
+
- Extract matches with the lowest mess detected.
|
242 |
+
- Additionally, we measure coherence / probe for a language.
|
243 |
+
|
244 |
+
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
245 |
+
|
246 |
+
*Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
247 |
+
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
|
248 |
+
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
249 |
+
improve or rewrite it.
|
250 |
+
|
251 |
+
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
252 |
+
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
253 |
+
|
254 |
+
## ⚡ Known limitations
|
255 |
+
|
256 |
+
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
257 |
+
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
|
258 |
+
|
259 |
+
## ⚠️ About Python EOLs
|
260 |
+
|
261 |
+
**If you are running:**
|
262 |
+
|
263 |
+
- Python >=2.7,<3.5: Unsupported
|
264 |
+
- Python 3.5: charset-normalizer < 2.1
|
265 |
+
- Python 3.6: charset-normalizer < 3.1
|
266 |
+
- Python 3.7: charset-normalizer < 4.0
|
267 |
+
|
268 |
+
Upgrade your Python interpreter as soon as possible.
|
269 |
+
|
270 |
+
## 👤 Contributing
|
271 |
+
|
272 |
+
Contributions, issues and feature requests are very much welcome.<br />
|
273 |
+
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
274 |
+
|
275 |
+
## 📝 License
|
276 |
+
|
277 |
+
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
278 |
+
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
279 |
+
|
280 |
+
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
281 |
+
|
282 |
+
## 💼 For Enterprise
|
283 |
+
|
284 |
+
Professional support for charset-normalizer is available as part of the [Tidelift
|
285 |
+
Subscription][1]. Tidelift gives software development teams a single source for
|
286 |
+
purchasing and maintaining their software, with professional grade assurances
|
287 |
+
from the experts who know it best, while seamlessly integrating with existing
|
288 |
+
tools.
|
289 |
+
|
290 |
+
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
291 |
+
|
292 |
+
# Changelog
|
293 |
+
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
294 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
295 |
+
|
296 |
+
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
297 |
+
|
298 |
+
### Fixed
|
299 |
+
- Unintentional memory usage regression when using large payloads that match several encodings (#376)
|
300 |
+
- Regression on some detection case showcased in the documentation (#371)
|
301 |
+
|
302 |
+
### Added
|
303 |
+
- Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
|
304 |
+
|
305 |
+
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
306 |
+
|
307 |
+
### Changed
|
308 |
+
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
309 |
+
- Improved the general detection reliability based on reports from the community
|
310 |
+
|
311 |
+
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
312 |
+
|
313 |
+
### Added
|
314 |
+
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
315 |
+
- Support for 9 forgotten encodings that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
316 |
+
|
317 |
+
### Removed
|
318 |
+
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
319 |
+
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
320 |
+
|
321 |
+
### Changed
|
322 |
+
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
323 |
+
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
324 |
+
|
325 |
+
### Fixed
|
326 |
+
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
327 |
+
|
328 |
+
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
329 |
+
|
330 |
+
### Changed
|
331 |
+
- Typehint for function `from_path` no longer enforces `PathLike` as its first argument
|
332 |
+
- Minor improvement over the global detection reliability
|
333 |
+
|
334 |
+
### Added
|
335 |
+
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
336 |
+
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
337 |
+
- Explicit support for Python 3.12
|
338 |
+
|
339 |
+
### Fixed
|
340 |
+
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
341 |
+
|
342 |
+
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
343 |
+
|
344 |
+
### Added
|
345 |
+
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
346 |
+
|
347 |
+
### Removed
|
348 |
+
- Support for Python 3.6 (PR #260)
|
349 |
+
|
350 |
+
### Changed
|
351 |
+
- Optional speedup provided by mypy/c 1.0.1
|
352 |
+
|
353 |
+
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
354 |
+
|
355 |
+
### Fixed
|
356 |
+
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
357 |
+
|
358 |
+
### Changed
|
359 |
+
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
360 |
+
|
361 |
+
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
362 |
+
|
363 |
+
### Added
|
364 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
365 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
366 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
367 |
+
- `normalizer --version` now specifies whether the current version provides extra speedup (meaning mypyc compilation whl)
|
368 |
+
|
369 |
+
### Changed
|
370 |
+
- Build with static metadata using 'build' frontend
|
371 |
+
- Make the language detection stricter
|
372 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
373 |
+
|
374 |
+
### Fixed
|
375 |
+
- CLI with opt --normalize fails when using full path for files
|
376 |
+
- TooManyAccentuatedPlugin induces false positives on the mess detection when too few alpha characters have been fed to it
|
377 |
+
- Sphinx warnings when generating the documentation
|
378 |
+
|
379 |
+
### Removed
|
380 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
381 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
382 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
383 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
384 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
385 |
+
- Breaking: Top-level function `normalize`
|
386 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
387 |
+
- Support for the backport `unicodedata2`
|
388 |
+
|
389 |
+
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
390 |
+
|
391 |
+
### Added
|
392 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
393 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
394 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
395 |
+
|
396 |
+
### Changed
|
397 |
+
- Build with static metadata using 'build' frontend
|
398 |
+
- Make the language detection stricter
|
399 |
+
|
400 |
+
### Fixed
|
401 |
+
- CLI with opt --normalize fails when using full path for files
|
402 |
+
- TooManyAccentuatedPlugin induces false positives on the mess detection when too few alpha characters have been fed to it
|
403 |
+
|
404 |
+
### Removed
|
405 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
406 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
407 |
+
|
408 |
+
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
409 |
+
|
410 |
+
### Added
|
411 |
+
- `normalizer --version` now specifies whether the current version provides extra speedup (meaning mypyc compilation whl)
|
412 |
+
|
413 |
+
### Removed
|
414 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
415 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
416 |
+
|
417 |
+
### Fixed
|
418 |
+
- Sphinx warnings when generating the documentation
|
419 |
+
|
420 |
+
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
421 |
+
|
422 |
+
### Changed
|
423 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
424 |
+
|
425 |
+
### Removed
|
426 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
427 |
+
- Breaking: Top-level function `normalize`
|
428 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
429 |
+
- Support for the backport `unicodedata2`
|
430 |
+
|
431 |
+
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
432 |
+
|
433 |
+
### Deprecated
|
434 |
+
- Function `normalize` scheduled for removal in 3.0
|
435 |
+
|
436 |
+
### Changed
|
437 |
+
- Removed useless call to decode in fn is_unprintable (#206)
|
438 |
+
|
439 |
+
### Fixed
|
440 |
+
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
441 |
+
|
442 |
+
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
443 |
+
|
444 |
+
### Added
|
445 |
+
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
446 |
+
|
447 |
+
### Changed
|
448 |
+
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
449 |
+
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
450 |
+
|
451 |
+
### Fixed
|
452 |
+
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
453 |
+
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
454 |
+
|
455 |
+
### Removed
|
456 |
+
- Support for Python 3.5 (PR #192)
|
457 |
+
|
458 |
+
### Deprecated
|
459 |
+
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
460 |
+
|
461 |
+
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
462 |
+
|
463 |
+
### Fixed
|
464 |
+
- ASCII miss-detection on rare cases (PR #170)
|
465 |
+
|
466 |
+
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
467 |
+
|
468 |
+
### Added
|
469 |
+
- Explicit support for Python 3.11 (PR #164)
|
470 |
+
|
471 |
+
### Changed
|
472 |
+
- The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
473 |
+
|
474 |
+
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
475 |
+
|
476 |
+
### Fixed
|
477 |
+
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
478 |
+
|
479 |
+
### Changed
|
480 |
+
- Skipping the language-detection (CD) on ASCII (PR #155)
|
481 |
+
|
482 |
+
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
483 |
+
|
484 |
+
### Changed
|
485 |
+
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
486 |
+
|
487 |
+
### Fixed
|
488 |
+
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
489 |
+
|
490 |
+
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
491 |
+
### Changed
|
492 |
+
- Improvement over Vietnamese detection (PR #126)
|
493 |
+
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
494 |
+
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
495 |
+
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
496 |
+
- Code style as refactored by Sourcery-AI (PR #131)
|
497 |
+
- Minor adjustment on the MD around european words (PR #133)
|
498 |
+
- Remove and replace SRTs from assets / tests (PR #139)
|
499 |
+
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
500 |
+
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
501 |
+
|
502 |
+
### Fixed
|
503 |
+
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
504 |
+
- Avoid using too insignificant chunk (PR #137)
|
505 |
+
|
506 |
+
### Added
|
507 |
+
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
508 |
+
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
509 |
+
|
510 |
+
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
511 |
+
### Added
|
512 |
+
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
513 |
+
|
514 |
+
### Changed
|
515 |
+
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
516 |
+
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
517 |
+
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
518 |
+
- Various detection improvement (MD+CD) (PR #117)
|
519 |
+
|
520 |
+
### Removed
|
521 |
+
- Remove redundant logging entry about detected language(s) (PR #115)
|
522 |
+
|
523 |
+
### Fixed
|
524 |
+
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
525 |
+
|
526 |
+
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
527 |
+
### Fixed
|
528 |
+
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
529 |
+
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
530 |
+
|
531 |
+
### Changed
|
532 |
+
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
533 |
+
|
534 |
+
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
535 |
+
### Changed
|
536 |
+
- The project now complies with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
537 |
+
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
538 |
+
- The Unicode detection is slightly improved (PR #93)
|
539 |
+
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
540 |
+
|
541 |
+
### Removed
|
542 |
+
- The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
|
543 |
+
|
544 |
+
### Fixed
|
545 |
+
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
546 |
+
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
547 |
+
- The MANIFEST.in was not exhaustive (PR #78)
|
548 |
+
|
549 |
+
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
550 |
+
### Fixed
|
551 |
+
- The CLI no longer raises an unexpected exception when no encoding has been found (PR #70)
|
552 |
+
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
553 |
+
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
554 |
+
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
555 |
+
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
556 |
+
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
557 |
+
|
558 |
+
### Changed
|
559 |
+
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
560 |
+
- Allow fallback on specified encoding if any (PR #71)
|
561 |
+
|
562 |
+
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
563 |
+
### Changed
|
564 |
+
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
565 |
+
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
566 |
+
|
567 |
+
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
568 |
+
### Fixed
|
569 |
+
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
570 |
+
|
571 |
+
### Changed
|
572 |
+
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
573 |
+
|
574 |
+
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
575 |
+
### Fixed
|
576 |
+
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
577 |
+
- Using explain=False permanently disables the verbose output in the current runtime (PR #47)
|
578 |
+
- One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
|
579 |
+
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
580 |
+
|
581 |
+
### Changed
|
582 |
+
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
583 |
+
|
584 |
+
### Added
|
585 |
+
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
586 |
+
|
587 |
+
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
588 |
+
### Changed
|
589 |
+
- 4 to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
590 |
+
- Emphasis has been placed on UTF-8 detection, which should now be nearly instantaneous.
|
591 |
+
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
592 |
+
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
593 |
+
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)
|
594 |
+
- utf_7 detection has been reinstated.
|
595 |
+
|
596 |
+
### Removed
|
597 |
+
- This package no longer requires anything when used with Python 3.5 (Dropped cached_property)
|
598 |
+
- Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
599 |
+
- The exception hook on UnicodeDecodeError has been removed.
|
600 |
+
|
601 |
+
### Deprecated
|
602 |
+
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
603 |
+
|
604 |
+
### Fixed
|
605 |
+
- The CLI output used the relative path of the file(s). Should be absolute.
|
606 |
+
|
607 |
+
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
608 |
+
### Fixed
|
609 |
+
- Logger configuration/usage no longer conflict with others (PR #44)
|
610 |
+
|
611 |
+
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
612 |
+
### Removed
|
613 |
+
- Using standard logging instead of using the package loguru.
|
614 |
+
- Dropping nose test framework in favor of the maintained pytest.
|
615 |
+
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
616 |
+
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
617 |
+
- Stop support for UTF-7 that does not contain a SIG.
|
618 |
+
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
619 |
+
|
620 |
+
### Fixed
|
621 |
+
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
622 |
+
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
623 |
+
|
624 |
+
### Changed
|
625 |
+
- Improving the package final size by compressing frequencies.json.
|
626 |
+
- Huge improvement on large payloads.
|
627 |
+
|
628 |
+
### Added
|
629 |
+
- CLI now produces JSON consumable output.
|
630 |
+
- Return ASCII if given sequences fit. Given reasonable confidence.
|
631 |
+
|
632 |
+
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
633 |
+
|
634 |
+
### Fixed
|
635 |
+
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
636 |
+
|
637 |
+
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
638 |
+
|
639 |
+
### Fixed
|
640 |
+
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
641 |
+
|
642 |
+
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
643 |
+
|
644 |
+
### Fixed
|
645 |
+
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
646 |
+
|
647 |
+
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
648 |
+
|
649 |
+
### Changed
|
650 |
+
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
651 |
+
|
652 |
+
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
653 |
+
|
654 |
+
### Fixed
|
655 |
+
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
656 |
+
|
657 |
+
### Changed
|
658 |
+
- Dependencies refactoring, constraints revised.
|
659 |
+
|
660 |
+
### Added
|
661 |
+
- Add python 3.9 and 3.10 to the supported interpreters
|
662 |
+
|
663 |
+
MIT License
|
664 |
+
|
665 |
+
Copyright (c) 2019 TAHRI Ahmed R.
|
666 |
+
|
667 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
668 |
+
of this software and associated documentation files (the "Software"), to deal
|
669 |
+
in the Software without restriction, including without limitation the rights
|
670 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
671 |
+
copies of the Software, and to permit persons to whom the Software is
|
672 |
+
furnished to do so, subject to the following conditions:
|
673 |
+
|
674 |
+
The above copyright notice and this permission notice shall be included in all
|
675 |
+
copies or substantial portions of the Software.
|
676 |
+
|
677 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
678 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
679 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
680 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
681 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
682 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
683 |
+
SOFTWARE.
|
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/RECORD
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
../../../bin/normalizer,sha256=YnnjWFnBo-5ncwqWx_Z70rELAOBcQEMlV7bxTgbUYVY,296
|
2 |
+
charset_normalizer-3.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
3 |
+
charset_normalizer-3.3.2.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
|
4 |
+
charset_normalizer-3.3.2.dist-info/METADATA,sha256=cfLhl5A6SI-F0oclm8w8ux9wshL1nipdeCdVnYb4AaA,33550
|
5 |
+
charset_normalizer-3.3.2.dist-info/RECORD,,
|
6 |
+
charset_normalizer-3.3.2.dist-info/WHEEL,sha256=eaDTbMedWofVq8IZjew9qeAkoA5Sw2MOU2ppdIRr1Jg,110
|
7 |
+
charset_normalizer-3.3.2.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
|
8 |
+
charset_normalizer-3.3.2.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
9 |
+
charset_normalizer/__init__.py,sha256=UzI3xC8PhmcLRMzSgPb6minTmRq0kWznnCBJ8ZCc2XI,1577
|
10 |
+
charset_normalizer/__main__.py,sha256=JxY8bleaENOFlLRb9HfoeZCzAMnn2A1oGR5Xm2eyqg0,73
|
11 |
+
charset_normalizer/__pycache__/__init__.cpython-311.pyc,,
|
12 |
+
charset_normalizer/__pycache__/__main__.cpython-311.pyc,,
|
13 |
+
charset_normalizer/__pycache__/api.cpython-311.pyc,,
|
14 |
+
charset_normalizer/__pycache__/cd.cpython-311.pyc,,
|
15 |
+
charset_normalizer/__pycache__/constant.cpython-311.pyc,,
|
16 |
+
charset_normalizer/__pycache__/legacy.cpython-311.pyc,,
|
17 |
+
charset_normalizer/__pycache__/md.cpython-311.pyc,,
|
18 |
+
charset_normalizer/__pycache__/models.cpython-311.pyc,,
|
19 |
+
charset_normalizer/__pycache__/utils.cpython-311.pyc,,
|
20 |
+
charset_normalizer/__pycache__/version.cpython-311.pyc,,
|
21 |
+
charset_normalizer/api.py,sha256=WOlWjy6wT8SeMYFpaGbXZFN1TMXa-s8vZYfkL4G29iQ,21097
|
22 |
+
charset_normalizer/cd.py,sha256=xwZliZcTQFA3jU0c00PRiu9MNxXTFxQkFLWmMW24ZzI,12560
|
23 |
+
charset_normalizer/cli/__init__.py,sha256=D5ERp8P62llm2FuoMzydZ7d9rs8cvvLXqE-1_6oViPc,100
|
24 |
+
charset_normalizer/cli/__main__.py,sha256=2F-xURZJzo063Ye-2RLJ2wcmURpbKeAzKwpiws65dAs,9744
|
25 |
+
charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc,,
|
26 |
+
charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc,,
|
27 |
+
charset_normalizer/constant.py,sha256=p0IsOVcEbPWYPOdWhnhRbjK1YVBy6fs05C5vKC-zoxU,40481
|
28 |
+
charset_normalizer/legacy.py,sha256=T-QuVMsMeDiQEk8WSszMrzVJg_14AMeSkmHdRYhdl1k,2071
|
29 |
+
charset_normalizer/md.cpython-311-darwin.so,sha256=zbs-p3GrSygP9-4v4GVAUcyRpreXASFbQqgK9rvFoKw,50117
|
30 |
+
charset_normalizer/md.py,sha256=NkSuVLK13_a8c7BxZ4cGIQ5vOtGIWOdh22WZEvjp-7U,19624
|
31 |
+
charset_normalizer/md__mypyc.cpython-311-darwin.so,sha256=5u-KvFhpxi_WDpF0bB0tfYS2z7PzQ08aO8DAOMiMAXI,232636
|
32 |
+
charset_normalizer/models.py,sha256=I5i0s4aKCCgLPY2tUY3pwkgFA-BUbbNxQ7hVkVTt62s,11624
|
33 |
+
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34 |
+
charset_normalizer/utils.py,sha256=teiosMqzKjXyAHXnGdjSBOgnBZwx-SkBbCLrx0UXy8M,11894
|
35 |
+
charset_normalizer/version.py,sha256=iHKUfHD3kDRSyrh_BN2ojh43TA5-UZQjvbVIEFfpHDs,79
|
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Wheel-Version: 1.0
|
2 |
+
Generator: bdist_wheel (0.41.2)
|
3 |
+
Root-Is-Purelib: false
|
4 |
+
Tag: cp311-cp311-macosx_11_0_arm64
|
5 |
+
|
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/entry_points.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[console_scripts]
|
2 |
+
normalizer = charset_normalizer.cli:cli_detect
|
lib/python3.11/site-packages/charset_normalizer-3.3.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
charset_normalizer
|
lib/python3.11/site-packages/charset_normalizer/__init__.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Charset-Normalizer
|
4 |
+
~~~~~~~~~~~~~~
|
5 |
+
The Real First Universal Charset Detector.
|
6 |
+
A library that helps you read text from an unknown charset encoding.
|
7 |
+
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
8 |
+
All IANA character set names for which the Python core library provides codecs are supported.
|
9 |
+
|
10 |
+
Basic usage:
|
11 |
+
>>> from charset_normalizer import from_bytes
|
12 |
+
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
13 |
+
>>> best_guess = results.best()
|
14 |
+
>>> str(best_guess)
|
15 |
+
'Bсеки човек има право на образование. Oбразованието!'
|
16 |
+
|
17 |
+
Others methods and usages are available - see the full documentation
|
18 |
+
at <https://github.com/Ousret/charset_normalizer>.
|
19 |
+
:copyright: (c) 2021 by Ahmed TAHRI
|
20 |
+
:license: MIT, see LICENSE for more details.
|
21 |
+
"""
|
22 |
+
import logging
|
23 |
+
|
24 |
+
from .api import from_bytes, from_fp, from_path, is_binary
|
25 |
+
from .legacy import detect
|
26 |
+
from .models import CharsetMatch, CharsetMatches
|
27 |
+
from .utils import set_logging_handler
|
28 |
+
from .version import VERSION, __version__
|
29 |
+
|
30 |
+
__all__ = (
|
31 |
+
"from_fp",
|
32 |
+
"from_path",
|
33 |
+
"from_bytes",
|
34 |
+
"is_binary",
|
35 |
+
"detect",
|
36 |
+
"CharsetMatch",
|
37 |
+
"CharsetMatches",
|
38 |
+
"__version__",
|
39 |
+
"VERSION",
|
40 |
+
"set_logging_handler",
|
41 |
+
)
|
42 |
+
|
43 |
+
# Attach a NullHandler to the top level logger by default
|
44 |
+
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
45 |
+
|
46 |
+
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
lib/python3.11/site-packages/charset_normalizer/__main__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .cli import cli_detect

# Package entry point: this file is the package's ``__main__`` module, so the
# guard below only fires when the package is executed as a script (not when
# the module is merely imported). It delegates to the CLI detector.
if __name__ == "__main__":
    cli_detect()
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.88 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/__main__.cpython-311.pyc
ADDED
Binary file (369 Bytes). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/api.cpython-311.pyc
ADDED
Binary file (20.5 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/cd.cpython-311.pyc
ADDED
Binary file (16.2 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/constant.cpython-311.pyc
ADDED
Binary file (43.7 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/legacy.cpython-311.pyc
ADDED
Binary file (2.8 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/md.cpython-311.pyc
ADDED
Binary file (27.4 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/models.cpython-311.pyc
ADDED
Binary file (18.1 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (16.4 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/__pycache__/version.cpython-311.pyc
ADDED
Binary file (375 Bytes). View file
|
|
lib/python3.11/site-packages/charset_normalizer/api.py
ADDED
@@ -0,0 +1,626 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from os import PathLike
|
3 |
+
from typing import BinaryIO, List, Optional, Set, Union
|
4 |
+
|
5 |
+
from .cd import (
|
6 |
+
coherence_ratio,
|
7 |
+
encoding_languages,
|
8 |
+
mb_encoding_languages,
|
9 |
+
merge_coherence_ratios,
|
10 |
+
)
|
11 |
+
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
12 |
+
from .md import mess_ratio
|
13 |
+
from .models import CharsetMatch, CharsetMatches
|
14 |
+
from .utils import (
|
15 |
+
any_specified_encoding,
|
16 |
+
cut_sequence_chunks,
|
17 |
+
iana_name,
|
18 |
+
identify_sig_or_bom,
|
19 |
+
is_cp_similar,
|
20 |
+
is_multi_byte_encoding,
|
21 |
+
should_strip_sig_or_bom,
|
22 |
+
)
|
23 |
+
|
24 |
+
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")

# Package-wide logger used by every detection routine in this module.
logger = logging.getLogger("charset_normalizer")

# Stream handler that is NOT attached by default; from_bytes() attaches it to
# the logger only while a call runs with explain=True.
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
|
31 |
+
|
32 |
+
|
33 |
+
def from_bytes(
    sequences: Union[bytes, bytearray],
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritizes a particular code page
    but never takes it for granted. Can improve the performance.

    You may want to focus your attention on some code pages or/and not others, use cp_isolation and cp_exclusion for
    that purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not set up any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    The handler is removed and the previous logger level restored when the call ends, including when it ends
    with an exception. Custom logging format and handler can be set manually.

    :param sequences: Raw payload to analyse.
    :param steps: Number of chunks to probe; forced to 1 when the payload is not larger than chunk_size * steps.
    :param chunk_size: Size of each probed chunk; forced to the payload length in the same situation.
    :param threshold: Mean mess ratio at or above which a code page is excluded.
    :param cp_isolation: When given and non-empty, only these code pages are tested.
    :param cp_exclusion: When given and non-empty, these code pages are never tested.
    :param preemptive_behaviour: Look for an encoding declared inside the payload and test it first.
    :param explain: Temporarily attach a StreamHandler at TRACE level to the package logger.
    :param language_threshold: Threshold forwarded to the coherence (language) detection.
    :param enable_fallback: Keep ascii / utf_8 / declared encoding as a fallback match when nothing else passes.
    :return: The plausible matches; a single utf_8 match for an empty payload.
    :raises TypeError: If sequences is neither bytes nor bytearray.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {0}".format(
                type(sequences)
            )
        )

    if not explain:
        return _detect_from_bytes(
            sequences,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )

    previous_logger_level: int = logger.level
    logger.addHandler(explain_handler)
    logger.setLevel(TRACE)

    try:
        return _detect_from_bytes(
            sequences,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
    finally:
        # Always undo the temporary logging setup, whatever the exit path
        # (early return, normal return or exception), and restore the exact
        # level that was configured before the call.
        logger.removeHandler(explain_handler)
        logger.setLevel(previous_logger_level)


def _detect_from_bytes(
    sequences: Union[bytes, bytearray],
    steps: int,
    chunk_size: int,
    threshold: float,
    cp_isolation: Optional[List[str]],
    cp_exclusion: Optional[List[str]],
    preemptive_behaviour: bool,
    explain: bool,
    language_threshold: float,
    enable_fallback: bool,
) -> CharsetMatches:
    """
    Run the detection workflow of from_bytes on an already type-checked payload.
    Does not touch the logger configuration; the caller owns the 'explain' setup and teardown.
    """
    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # Payload too short for the requested sampling: probe it as a single chunk.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    prioritized_encodings: List[str] = []

    specified_encoding: Optional[str] = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested: Set[str] = set()
    tested_but_hard_failure: List[str] = []
    tested_but_soft_failure: List[str] = []

    fallback_ascii: Optional[CharsetMatch] = None
    fallback_u8: Optional[CharsetMatch] = None
    fallback_specified: Optional[CharsetMatch] = None

    results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        # Prioritized encodings also appear in IANA_SUPPORTED; test each once.
        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: Optional[str] = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        # Hard check: the payload (or its head, for large single-byte
        # candidates) must decode strictly with this code page.
        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                str(
                    sequences[: int(50e4)]
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) : int(50e4)],
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    sequences
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) :],
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        similar_soft_failure_test: bool = False

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.log(
                TRACE,
                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        # Start offsets of the chunks to probe (after the BOM/SIG when present).
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: List[str] = []
        md_ratios = []

        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except (
            UnicodeDecodeError
        ) as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                # NOTE(review): this starts at 50e3 while the head probe above
                # covered up to 50e4, so both ranges overlap; the whole payload
                # is still validated.
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                enable_fallback
                and encoding_iana in ["ascii", "utf_8", specified_encoding]
                and not lazy_str_hard_failure
            ):
                fallback_entry = CharsetMatch(
                    sequences, encoding_iana, threshold, False, [], decoded_payload
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        if not is_multi_byte_decoder:
            target_languages: List[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # We shall skip the CD when its about ASCII
        # Most of the time its not relevant to run "language-detection" on it.
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        results.append(
            CharsetMatch(
                sequences,
                encoding_iana,
                mean_mess_ratio,
                bom_or_sig_available,
                cd_ratios_merged,
                decoded_payload,
            )
        )

        # Short-circuit: a clean match on a prioritized encoding ends the search.
        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            logger.debug(
                "Encoding detection: %s is most likely the one.", encoding_iana
            )
            return CharsetMatches([results[encoding_iana]])

        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        elif fallback_u8 is not None:
            # The previous multi-clause test (comparing against the ascii
            # fallback and its fingerprint) was true exactly when a utf_8
            # fallback existed, so only that check is kept.
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    return results
|
500 |
+
|
501 |
+
|
502 |
+
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Read everything from an already opened binary file pointer and run from_bytes on it.
    Every other argument is forwarded unchanged. The file pointer is left open.
    """
    payload = fp.read()

    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
|
530 |
+
|
531 |
+
|
532 |
+
def from_path(
    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open the given path in binary mode and hand the file pointer to from_fp.
    Every other argument is forwarded unchanged. Can raise IOError.
    """
    with open(path, "rb") as binary_file:
        return from_fp(
            binary_file,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
|
561 |
+
|
562 |
+
|
563 |
+
def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw payload or file pointer) looks binary, i.e. not text.
    The input is dispatched to from_path, from_bytes or from_fp depending on its type, with the same
    keyword arguments; fallback matches are disabled by default to be stricter around content that is
    ASCII-compatible but unlikely to be a string. Returns True when no charset match was found.
    """
    detection_options = {
        "steps": steps,
        "chunk_size": chunk_size,
        "threshold": threshold,
        "cp_isolation": cp_isolation,
        "cp_exclusion": cp_exclusion,
        "preemptive_behaviour": preemptive_behaviour,
        "explain": explain,
        "language_threshold": language_threshold,
        "enable_fallback": enable_fallback,
    }

    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        # A filesystem location: let from_path open and read it.
        guesses = from_path(fp_or_path_or_payload, **detection_options)
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        # An in-memory payload.
        guesses = from_bytes(fp_or_path_or_payload, **detection_options)
    else:
        # Anything else is treated as an opened binary file pointer.
        guesses = from_fp(fp_or_path_or_payload, **detection_options)

    return not guesses
|
lib/python3.11/site-packages/charset_normalizer/cd.py
ADDED
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import importlib
|
2 |
+
from codecs import IncrementalDecoder
|
3 |
+
from collections import Counter
|
4 |
+
from functools import lru_cache
|
5 |
+
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
6 |
+
|
7 |
+
from .constant import (
|
8 |
+
FREQUENCIES,
|
9 |
+
KO_NAMES,
|
10 |
+
LANGUAGE_SUPPORTED_COUNT,
|
11 |
+
TOO_SMALL_SEQUENCE,
|
12 |
+
ZH_NAMES,
|
13 |
+
)
|
14 |
+
from .md import is_suspiciously_successive_range
|
15 |
+
from .models import CoherenceMatches
|
16 |
+
from .utils import (
|
17 |
+
is_accentuated,
|
18 |
+
is_latin,
|
19 |
+
is_multi_byte_encoding,
|
20 |
+
is_unicode_range_secondary,
|
21 |
+
unicode_range,
|
22 |
+
)
|
23 |
+
|
24 |
+
|
25 |
+
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List, sorted by name, the unicode ranges that a single byte code page mostly maps to.
    Bytes 0x40 to 0xFE are decoded one by one; a range is kept when it accounts for at least
    15% of the decoded characters that belong to a non-secondary range.
    Raises IOError when called with a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    hits_per_range: TypeCounter[str] = Counter()

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        # Undecodable bytes yield an empty string because errors are ignored.
        if not decoded:
            continue

        range_name: Optional[str] = unicode_range(decoded)

        if range_name is None:
            continue

        if is_unicode_range_secondary(range_name) is False:
            hits_per_range[range_name] += 1

    # Only ranges that were counted contribute to the total.
    total_hits: int = sum(hits_per_range.values())

    return sorted(
        range_name
        for range_name, hits in hits_per_range.items()
        if hits / total_hits >= 0.15
    )
|
62 |
+
|
63 |
+
|
64 |
+
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages whose frequency table holds at least one character of the given unicode range.
    Languages come out in the iteration order of FREQUENCIES.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
|
77 |
+
|
78 |
+
|
79 |
+
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte code page with the language(s) it is heavily linked to.
    The first unicode range of the code page whose name does not contain "Latin" decides;
    when there is none, the code page is reported as "Latin Based".
    """
    first_non_latin_range: Optional[str] = next(
        (
            candidate
            for candidate in encoding_unicode_range(iana_name)
            if "Latin" not in candidate
        ),
        None,
    )

    if first_non_latin_range is None:
        return ["Latin Based"]

    return unicode_range_languages(first_non_latin_range)
|
97 |
+
|
98 |
+
|
99 |
+
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a multi-byte code page with the language it is heavily linked to.
    Returns a one-element list (Japanese, Chinese or Korean), or an empty list
    when the code page is not tied to one of them.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        languages = ["Japanese"]
    elif iana_name.startswith("gb") or iana_name in ZH_NAMES:
        languages = ["Chinese"]
    elif iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        languages = ["Korean"]
    else:
        languages = []

    return languages
|
118 |
+
|
119 |
+
|
120 |
+
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Summarise the alphabet of a supported language.

    :param language: Key of FREQUENCIES; an unknown key raises KeyError.
    :return: A pair ``(have_accents, pure_latin)``. ``have_accents`` is True when
        is_accentuated() holds for at least one character of the language;
        ``pure_latin`` is True unless is_latin() returns False for at least one
        character.
    """
    language_characters = FREQUENCIES[language]

    target_have_accents: bool = any(
        is_accentuated(character) for character in language_characters
    )
    target_pure_latin: bool = not any(
        is_latin(character) is False for character in language_characters
    )

    return target_have_accents, target_pure_latin
|
135 |
+
|
136 |
+
|
137 |
+
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages whose alphabet is sufficiently covered by the given characters.

    For each language of FREQUENCIES, the share of its characters that also
    appear in ``characters`` is computed; languages with a share of at least
    0.2 are kept and returned from the best covered to the least covered.

    A language is skipped without being scored when:
    - ``ignore_non_latin`` is set and the language is not pure Latin, or
    - the given characters contain an accentuated one while the language has none.

    :param characters: Characters observed in the analysed text.
    :param ignore_non_latin: Only consider languages reported as pure Latin by
        get_target_features().
    :return: Language names sorted by decreasing coverage ratio.
    """
    # Membership is tested once per character of every language table: build
    # the set a single time so each test is O(1) instead of a list scan.
    source_characters = set(characters)

    source_have_accents = any(is_accentuated(character) for character in characters)

    languages: List[Tuple[str, float]] = []

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_match_count: int = sum(
            1 for c in language_characters if c in source_characters
        )

        ratio: float = character_match_count / len(language_characters)

        if ratio >= 0.2:
            languages.append((language, ratio))

    # list.sort() is stable, so languages with equal ratios keep FREQUENCIES order.
    languages.sort(key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
|
170 |
+
|
171 |
+
|
172 |
+
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered list of characters (from most frequent to rarest) matches a particular language.

    Each character's rank in ``ordered_characters`` is projected onto the scale
    of the language table (FREQUENCIES[language]) and compared with the
    character's rank in that table. A character is "approved" when the ranks are
    close enough, or when enough of the characters ranked before/after it are
    the same on both sides.

    Beware that this function is not strict on the match, in order to ease the
    detection (meaning a close match counts as 1.).

    :param language: Key of FREQUENCIES.
    :param ordered_characters: Characters sorted by decreasing number of occurrences.
    :return: Ratio of approved characters, between 0. (absolutely no
        correspondence) and 1. (near perfect fit). An empty
        ``ordered_characters`` yields 0. instead of a division by zero.
    :raises ValueError: If ``language`` is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: report "no correspondence" rather than dividing by zero.
    if ordered_characters_count == 0:
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank lookup built once; setdefault keeps the FIRST position of a
    # character, which is what list.index() would return.
    rank_by_character: Dict[str, int] = {}
    for rank, known_character in enumerate(language_characters):
        rank_by_character.setdefault(known_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Loop invariant: scale factor from source ranks to language-table ranks.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language = rank_by_character.get(character)

        # Characters unknown to the language can never be approved.
        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a rank gap above 4 positions is an outright rejection.
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: a gap below a third of the table size is accepted as is.
        if large_alphabet is True and rank_distance < target_language_characters_count / 3:
            character_approved_count += 1
            continue

        # Otherwise compare the neighbourhoods: which characters are ranked
        # before/after this one in the language table and in the source.
        characters_before_source: List[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # The two guards below also protect the divisions that follow.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
|
250 |
+
|
251 |
+
|
252 |
+
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.

    Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
    one containing the Latin letters and the other the Hebrew ones.

    Only alphabetic characters (str.isalpha()) with a known unicode range are
    kept, and they are stored lowercased. A character joins the first already
    discovered layer whose range is not flagged by
    is_suspiciously_successive_range() against the character's own range;
    otherwise it opens (or extends) the layer named after its own range.

    :param decoded_sequence: Text to split.
    :return: One string per layer, in order of layer discovery.
    """
    # Characters are accumulated in lists and joined once at the end: growing
    # a str stored in a dict with "+=" re-copies the whole layer every time.
    layers: Dict[str, List[str]] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        layers.setdefault(layer_target_range, []).append(character.lower())

    return ["".join(layer_characters) for layer_characters in layers.values()]
|
289 |
+
|
290 |
+
|
291 |
+
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several results previously produced by the function coherence_ratio.

    Ratios are grouped per language across all given results and averaged; each
    average is rounded to 4 decimals.

    :param results: Lists of ``(language, ratio)`` pairs.
    :return: One ``(language, mean ratio)`` pair per distinct language, sorted
        by decreasing ratio (same shape as a coherence_ratio result).
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for matches in results:
        for language, ratio in matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = []

    for language, ratios in ratios_by_language.items():
        averaged.append((language, round(sum(ratios) / len(ratios), 4)))

    # Stable sort: languages with equal averages keep their discovery order.
    averaged.sort(key=lambda entry: entry[1], reverse=True)

    return averaged
|
317 |
+
|
318 |
+
|
319 |
+
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    Collapse a language and its em-dash alternative (e.g. "English" and "English—")
    into a single entry.

    Names are compared after removing every em-dash. When at least two entries
    share the same stripped name, a new list is returned with one entry per
    stripped name carrying the best (highest) ratio. When no two entries
    collide, the input list is returned untouched, so an em-dash name that
    appears alone is left as is.

    :param results: ``(language, ratio)`` pairs.
    :return: The collapsed list, or ``results`` itself when there is nothing to collapse.
    """
    ratios_by_name: Dict[str, List[float]] = {}

    for language, ratio in results:
        ratios_by_name.setdefault(language.replace("—", ""), []).append(ratio)

    has_alternative = any(len(ratios) > 1 for ratios in ratios_by_name.values())

    if not has_alternative:
        return results

    return [(name, max(ratios)) for name, ratios in ratios_by_name.items()]
|
344 |
+
|
345 |
+
|
346 |
+
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters
    extracted per alphabet/range by alpha_unicode_split(). Layers holding
    TOO_SMALL_SEQUENCE characters or fewer are ignored.

    :param decoded_sequence: Text to analyse.
    :param threshold: Minimal ratio a language must reach to be reported.
    :param lg_inclusion: Comma separated language names to test instead of the
        ones inferred by alphabet_languages(). The special entry "Latin Based"
        is not a language: it is removed from the list and restricts the
        inference to Latin languages.
    :return: ``(language, ratio)`` pairs, ratio rounded to 4 decimals, sorted by
        decreasing ratio, after filter_alt_coherence_matches().
    """
    requested_languages = [] if lg_inclusion is None else lg_inclusion.split(",")

    ignore_non_latin: bool = "Latin Based" in requested_languages
    if ignore_non_latin:
        requested_languages.remove("Latin Based")

    results: List[Tuple[str, float]] = []
    # Counts matches with a ratio of 0.8 or more, across all layers.
    sufficient_match_count: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        ranked_characters = Counter(layer).most_common()

        character_count: int = sum(occurrences for _, occurrences in ranked_characters)
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [
            character for character, _ in ranked_characters
        ]

        # An explicit (non-empty) inclusion list takes precedence over inference.
        candidates = requested_languages or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )

        for language in candidates:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue

            if ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Enough strong matches: stop testing languages for this layer.
            if sufficient_match_count >= 3:
                break

    deduplicated = filter_alt_coherence_matches(results)

    return sorted(deduplicated, key=lambda match: match[1], reverse=True)
|
lib/python3.11/site-packages/charset_normalizer/cli/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .__main__ import cli_detect, query_yes_no
|
2 |
+
|
3 |
+
__all__ = (
|
4 |
+
"cli_detect",
|
5 |
+
"query_yes_no",
|
6 |
+
)
|
lib/python3.11/site-packages/charset_normalizer/cli/__main__.py
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import sys
|
3 |
+
from json import dumps
|
4 |
+
from os.path import abspath, basename, dirname, join, realpath
|
5 |
+
from platform import python_version
|
6 |
+
from typing import List, Optional
|
7 |
+
from unicodedata import unidata_version
|
8 |
+
|
9 |
+
import charset_normalizer.md as md_module
|
10 |
+
from charset_normalizer import from_fp
|
11 |
+
from charset_normalizer.models import CliDetectionResult
|
12 |
+
from charset_normalizer.version import __version__
|
13 |
+
|
14 |
+
|
15 |
+
def query_yes_no(question: str, default: Optional[str] = "yes") -> bool:
    """Ask a yes/no question via input() and return the answer.

    The question is re-asked until a recognised answer is given. Answers are
    matched case-insensitively and surrounding whitespace is ignored, so
    "Y", " yes " or "n\\r" are all accepted.

    :param question: Text presented to the user.
    :param default: Presumed answer if the user just hits <Enter>. It must be
        "yes" (the default), "no" or None (meaning an answer is required of
        the user).
    :return: True for "yes", False for "no".
    :raises ValueError: If ``default`` is none of the accepted values.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        # Strip so that stray spaces or a trailing carriage return do not turn
        # a valid answer into an unknown one.
        choice = input().strip().lower()
        if default is not None and choice == "":
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
46 |
+
|
47 |
+
|
48 |
+
def cli_detect(argv: Optional[List[str]] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser.

    Runs the detection on every given file, then prints either a JSON report
    (a single object for one result, a list otherwise) or, with --minimal, the
    detected encoding name(s) of each file. With --normalize, the decoded
    content of each file is additionally written back as UTF-8, next to the
    original (encoding name inserted before the last dot-separated part of the
    file name) or, with --replace, over the original.

    :param argv: Arguments to parse, handed to ArgumentParser.parse_args().
    :return: 0 if everything is fine, anything else equal trouble:
        1 for an inconsistent or out-of-range option, 2 when the normalized
        file could not be written.
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # Option consistency: --force only makes sense with --replace, which only
    # makes sense with --normalize.
    if args.replace is True and args.normalize is False:
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    # Report entries, in output order: for each file its main result, followed
    # by its alternatives when requested.
    x_ = []

    for my_file in args.files:
        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    "Maybe try increasing maximum amount of chaos."
                    if args.threshold < 1.0
                    else "",
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            # Keep a handle on THIS file's main entry: the normalized path set
            # further down must land on it, not on the first entry of the
            # report (which belongs to the first file).
            file_result = CliDetectionResult(
                abspath(my_file.name),
                best_guess.encoding,
                best_guess.encoding_aliases,
                [
                    cp
                    for cp in best_guess.could_be_from_charset
                    if cp != best_guess.encoding
                ],
                best_guess.language,
                best_guess.alphabets,
                best_guess.bom,
                best_guess.percent_chaos,
                best_guess.percent_coherence,
                None,
                True,
            )
            x_.append(file_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp
                                    for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            )
                        )

            if args.normalize is True:
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: List[str] = file_name.split(".")

                if args.replace is False:
                    # New file next to the original: the encoding name is
                    # inserted before the last dot-separated part of the name.
                    o_.insert(-1, best_guess.encoding)
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue

                # The content written below comes from best_guess, not from the
                # input handle: release it first so that, in --replace mode,
                # the path is not open for reading while being rewritten.
                if my_file.closed is False:
                    my_file.close()

                try:
                    file_result.unicode_path = join(dir_path, ".".join(o_))

                    with open(file_result.unicode_path, "w", encoding="utf-8") as fp:
                        fp.write(str(best_guess))
                except IOError as e:
                    print(str(e), file=sys.stderr)
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        # One line per input file, listing every encoding reported for it.
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
|
293 |
+
|
294 |
+
|
295 |
+
if __name__ == "__main__":
    # cli_detect() returns the exit status (non-zero on failure): hand it to
    # the interpreter instead of discarding it, so callers can detect errors.
    sys.exit(cli_detect())
|
lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (341 Bytes). View file
|
|
lib/python3.11/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc
ADDED
Binary file (11.7 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/constant.py
ADDED
@@ -0,0 +1,1995 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
3 |
+
from encodings.aliases import aliases
|
4 |
+
from re import IGNORECASE, compile as re_compile
|
5 |
+
from typing import Dict, List, Set, Union
|
6 |
+
|
7 |
+
# Contain for each eligible encoding a list of/item bytes SIG/BOM
|
8 |
+
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
|
9 |
+
"utf_8": BOM_UTF8,
|
10 |
+
"utf_7": [
|
11 |
+
b"\x2b\x2f\x76\x38",
|
12 |
+
b"\x2b\x2f\x76\x39",
|
13 |
+
b"\x2b\x2f\x76\x2b",
|
14 |
+
b"\x2b\x2f\x76\x2f",
|
15 |
+
b"\x2b\x2f\x76\x38\x2d",
|
16 |
+
],
|
17 |
+
"gb18030": b"\x84\x31\x95\x33",
|
18 |
+
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
19 |
+
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
20 |
+
}
|
21 |
+
|
22 |
+
TOO_SMALL_SEQUENCE: int = 32
|
23 |
+
TOO_BIG_SEQUENCE: int = int(10e6)
|
24 |
+
|
25 |
+
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
|
26 |
+
|
27 |
+
# Up-to-date Unicode ucd/15.0.0
|
28 |
+
UNICODE_RANGES_COMBINED: Dict[str, range] = {
|
29 |
+
"Control character": range(32),
|
30 |
+
"Basic Latin": range(32, 128),
|
31 |
+
"Latin-1 Supplement": range(128, 256),
|
32 |
+
"Latin Extended-A": range(256, 384),
|
33 |
+
"Latin Extended-B": range(384, 592),
|
34 |
+
"IPA Extensions": range(592, 688),
|
35 |
+
"Spacing Modifier Letters": range(688, 768),
|
36 |
+
"Combining Diacritical Marks": range(768, 880),
|
37 |
+
"Greek and Coptic": range(880, 1024),
|
38 |
+
"Cyrillic": range(1024, 1280),
|
39 |
+
"Cyrillic Supplement": range(1280, 1328),
|
40 |
+
"Armenian": range(1328, 1424),
|
41 |
+
"Hebrew": range(1424, 1536),
|
42 |
+
"Arabic": range(1536, 1792),
|
43 |
+
"Syriac": range(1792, 1872),
|
44 |
+
"Arabic Supplement": range(1872, 1920),
|
45 |
+
"Thaana": range(1920, 1984),
|
46 |
+
"NKo": range(1984, 2048),
|
47 |
+
"Samaritan": range(2048, 2112),
|
48 |
+
"Mandaic": range(2112, 2144),
|
49 |
+
"Syriac Supplement": range(2144, 2160),
|
50 |
+
"Arabic Extended-B": range(2160, 2208),
|
51 |
+
"Arabic Extended-A": range(2208, 2304),
|
52 |
+
"Devanagari": range(2304, 2432),
|
53 |
+
"Bengali": range(2432, 2560),
|
54 |
+
"Gurmukhi": range(2560, 2688),
|
55 |
+
"Gujarati": range(2688, 2816),
|
56 |
+
"Oriya": range(2816, 2944),
|
57 |
+
"Tamil": range(2944, 3072),
|
58 |
+
"Telugu": range(3072, 3200),
|
59 |
+
"Kannada": range(3200, 3328),
|
60 |
+
"Malayalam": range(3328, 3456),
|
61 |
+
"Sinhala": range(3456, 3584),
|
62 |
+
"Thai": range(3584, 3712),
|
63 |
+
"Lao": range(3712, 3840),
|
64 |
+
"Tibetan": range(3840, 4096),
|
65 |
+
"Myanmar": range(4096, 4256),
|
66 |
+
"Georgian": range(4256, 4352),
|
67 |
+
"Hangul Jamo": range(4352, 4608),
|
68 |
+
"Ethiopic": range(4608, 4992),
|
69 |
+
"Ethiopic Supplement": range(4992, 5024),
|
70 |
+
"Cherokee": range(5024, 5120),
|
71 |
+
"Unified Canadian Aboriginal Syllabics": range(5120, 5760),
|
72 |
+
"Ogham": range(5760, 5792),
|
73 |
+
"Runic": range(5792, 5888),
|
74 |
+
"Tagalog": range(5888, 5920),
|
75 |
+
"Hanunoo": range(5920, 5952),
|
76 |
+
"Buhid": range(5952, 5984),
|
77 |
+
"Tagbanwa": range(5984, 6016),
|
78 |
+
"Khmer": range(6016, 6144),
|
79 |
+
"Mongolian": range(6144, 6320),
|
80 |
+
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
|
81 |
+
"Limbu": range(6400, 6480),
|
82 |
+
"Tai Le": range(6480, 6528),
|
83 |
+
"New Tai Lue": range(6528, 6624),
|
84 |
+
"Khmer Symbols": range(6624, 6656),
|
85 |
+
"Buginese": range(6656, 6688),
|
86 |
+
"Tai Tham": range(6688, 6832),
|
87 |
+
"Combining Diacritical Marks Extended": range(6832, 6912),
|
88 |
+
"Balinese": range(6912, 7040),
|
89 |
+
"Sundanese": range(7040, 7104),
|
90 |
+
"Batak": range(7104, 7168),
|
91 |
+
"Lepcha": range(7168, 7248),
|
92 |
+
"Ol Chiki": range(7248, 7296),
|
93 |
+
"Cyrillic Extended-C": range(7296, 7312),
|
94 |
+
"Georgian Extended": range(7312, 7360),
|
95 |
+
"Sundanese Supplement": range(7360, 7376),
|
96 |
+
"Vedic Extensions": range(7376, 7424),
|
97 |
+
"Phonetic Extensions": range(7424, 7552),
|
98 |
+
"Phonetic Extensions Supplement": range(7552, 7616),
|
99 |
+
"Combining Diacritical Marks Supplement": range(7616, 7680),
|
100 |
+
"Latin Extended Additional": range(7680, 7936),
|
101 |
+
"Greek Extended": range(7936, 8192),
|
102 |
+
"General Punctuation": range(8192, 8304),
|
103 |
+
"Superscripts and Subscripts": range(8304, 8352),
|
104 |
+
"Currency Symbols": range(8352, 8400),
|
105 |
+
"Combining Diacritical Marks for Symbols": range(8400, 8448),
|
106 |
+
"Letterlike Symbols": range(8448, 8528),
|
107 |
+
"Number Forms": range(8528, 8592),
|
108 |
+
"Arrows": range(8592, 8704),
|
109 |
+
"Mathematical Operators": range(8704, 8960),
|
110 |
+
"Miscellaneous Technical": range(8960, 9216),
|
111 |
+
"Control Pictures": range(9216, 9280),
|
112 |
+
"Optical Character Recognition": range(9280, 9312),
|
113 |
+
"Enclosed Alphanumerics": range(9312, 9472),
|
114 |
+
"Box Drawing": range(9472, 9600),
|
115 |
+
"Block Elements": range(9600, 9632),
|
116 |
+
"Geometric Shapes": range(9632, 9728),
|
117 |
+
"Miscellaneous Symbols": range(9728, 9984),
|
118 |
+
"Dingbats": range(9984, 10176),
|
119 |
+
"Miscellaneous Mathematical Symbols-A": range(10176, 10224),
|
120 |
+
"Supplemental Arrows-A": range(10224, 10240),
|
121 |
+
"Braille Patterns": range(10240, 10496),
|
122 |
+
"Supplemental Arrows-B": range(10496, 10624),
|
123 |
+
"Miscellaneous Mathematical Symbols-B": range(10624, 10752),
|
124 |
+
"Supplemental Mathematical Operators": range(10752, 11008),
|
125 |
+
"Miscellaneous Symbols and Arrows": range(11008, 11264),
|
126 |
+
"Glagolitic": range(11264, 11360),
|
127 |
+
"Latin Extended-C": range(11360, 11392),
|
128 |
+
"Coptic": range(11392, 11520),
|
129 |
+
"Georgian Supplement": range(11520, 11568),
|
130 |
+
"Tifinagh": range(11568, 11648),
|
131 |
+
"Ethiopic Extended": range(11648, 11744),
|
132 |
+
"Cyrillic Extended-A": range(11744, 11776),
|
133 |
+
"Supplemental Punctuation": range(11776, 11904),
|
134 |
+
"CJK Radicals Supplement": range(11904, 12032),
|
135 |
+
"Kangxi Radicals": range(12032, 12256),
|
136 |
+
"Ideographic Description Characters": range(12272, 12288),
|
137 |
+
"CJK Symbols and Punctuation": range(12288, 12352),
|
138 |
+
"Hiragana": range(12352, 12448),
|
139 |
+
"Katakana": range(12448, 12544),
|
140 |
+
"Bopomofo": range(12544, 12592),
|
141 |
+
"Hangul Compatibility Jamo": range(12592, 12688),
|
142 |
+
"Kanbun": range(12688, 12704),
|
143 |
+
"Bopomofo Extended": range(12704, 12736),
|
144 |
+
"CJK Strokes": range(12736, 12784),
|
145 |
+
"Katakana Phonetic Extensions": range(12784, 12800),
|
146 |
+
"Enclosed CJK Letters and Months": range(12800, 13056),
|
147 |
+
"CJK Compatibility": range(13056, 13312),
|
148 |
+
"CJK Unified Ideographs Extension A": range(13312, 19904),
|
149 |
+
"Yijing Hexagram Symbols": range(19904, 19968),
|
150 |
+
"CJK Unified Ideographs": range(19968, 40960),
|
151 |
+
"Yi Syllables": range(40960, 42128),
|
152 |
+
"Yi Radicals": range(42128, 42192),
|
153 |
+
"Lisu": range(42192, 42240),
|
154 |
+
"Vai": range(42240, 42560),
|
155 |
+
"Cyrillic Extended-B": range(42560, 42656),
|
156 |
+
"Bamum": range(42656, 42752),
|
157 |
+
"Modifier Tone Letters": range(42752, 42784),
|
158 |
+
"Latin Extended-D": range(42784, 43008),
|
159 |
+
"Syloti Nagri": range(43008, 43056),
|
160 |
+
"Common Indic Number Forms": range(43056, 43072),
|
161 |
+
"Phags-pa": range(43072, 43136),
|
162 |
+
"Saurashtra": range(43136, 43232),
|
163 |
+
"Devanagari Extended": range(43232, 43264),
|
164 |
+
"Kayah Li": range(43264, 43312),
|
165 |
+
"Rejang": range(43312, 43360),
|
166 |
+
"Hangul Jamo Extended-A": range(43360, 43392),
|
167 |
+
"Javanese": range(43392, 43488),
|
168 |
+
"Myanmar Extended-B": range(43488, 43520),
|
169 |
+
"Cham": range(43520, 43616),
|
170 |
+
"Myanmar Extended-A": range(43616, 43648),
|
171 |
+
"Tai Viet": range(43648, 43744),
|
172 |
+
"Meetei Mayek Extensions": range(43744, 43776),
|
173 |
+
"Ethiopic Extended-A": range(43776, 43824),
|
174 |
+
"Latin Extended-E": range(43824, 43888),
|
175 |
+
"Cherokee Supplement": range(43888, 43968),
|
176 |
+
"Meetei Mayek": range(43968, 44032),
|
177 |
+
"Hangul Syllables": range(44032, 55216),
|
178 |
+
"Hangul Jamo Extended-B": range(55216, 55296),
|
179 |
+
"High Surrogates": range(55296, 56192),
|
180 |
+
"High Private Use Surrogates": range(56192, 56320),
|
181 |
+
"Low Surrogates": range(56320, 57344),
|
182 |
+
"Private Use Area": range(57344, 63744),
|
183 |
+
"CJK Compatibility Ideographs": range(63744, 64256),
|
184 |
+
"Alphabetic Presentation Forms": range(64256, 64336),
|
185 |
+
"Arabic Presentation Forms-A": range(64336, 65024),
|
186 |
+
"Variation Selectors": range(65024, 65040),
|
187 |
+
"Vertical Forms": range(65040, 65056),
|
188 |
+
"Combining Half Marks": range(65056, 65072),
|
189 |
+
"CJK Compatibility Forms": range(65072, 65104),
|
190 |
+
"Small Form Variants": range(65104, 65136),
|
191 |
+
"Arabic Presentation Forms-B": range(65136, 65280),
|
192 |
+
"Halfwidth and Fullwidth Forms": range(65280, 65520),
|
193 |
+
"Specials": range(65520, 65536),
|
194 |
+
"Linear B Syllabary": range(65536, 65664),
|
195 |
+
"Linear B Ideograms": range(65664, 65792),
|
196 |
+
"Aegean Numbers": range(65792, 65856),
|
197 |
+
"Ancient Greek Numbers": range(65856, 65936),
|
198 |
+
"Ancient Symbols": range(65936, 66000),
|
199 |
+
"Phaistos Disc": range(66000, 66048),
|
200 |
+
"Lycian": range(66176, 66208),
|
201 |
+
"Carian": range(66208, 66272),
|
202 |
+
"Coptic Epact Numbers": range(66272, 66304),
|
203 |
+
"Old Italic": range(66304, 66352),
|
204 |
+
"Gothic": range(66352, 66384),
|
205 |
+
"Old Permic": range(66384, 66432),
|
206 |
+
"Ugaritic": range(66432, 66464),
|
207 |
+
"Old Persian": range(66464, 66528),
|
208 |
+
"Deseret": range(66560, 66640),
|
209 |
+
"Shavian": range(66640, 66688),
|
210 |
+
"Osmanya": range(66688, 66736),
|
211 |
+
"Osage": range(66736, 66816),
|
212 |
+
"Elbasan": range(66816, 66864),
|
213 |
+
"Caucasian Albanian": range(66864, 66928),
|
214 |
+
"Vithkuqi": range(66928, 67008),
|
215 |
+
"Linear A": range(67072, 67456),
|
216 |
+
"Latin Extended-F": range(67456, 67520),
|
217 |
+
"Cypriot Syllabary": range(67584, 67648),
|
218 |
+
"Imperial Aramaic": range(67648, 67680),
|
219 |
+
"Palmyrene": range(67680, 67712),
|
220 |
+
"Nabataean": range(67712, 67760),
|
221 |
+
"Hatran": range(67808, 67840),
|
222 |
+
"Phoenician": range(67840, 67872),
|
223 |
+
"Lydian": range(67872, 67904),
|
224 |
+
"Meroitic Hieroglyphs": range(67968, 68000),
|
225 |
+
"Meroitic Cursive": range(68000, 68096),
|
226 |
+
"Kharoshthi": range(68096, 68192),
|
227 |
+
"Old South Arabian": range(68192, 68224),
|
228 |
+
"Old North Arabian": range(68224, 68256),
|
229 |
+
"Manichaean": range(68288, 68352),
|
230 |
+
"Avestan": range(68352, 68416),
|
231 |
+
"Inscriptional Parthian": range(68416, 68448),
|
232 |
+
"Inscriptional Pahlavi": range(68448, 68480),
|
233 |
+
"Psalter Pahlavi": range(68480, 68528),
|
234 |
+
"Old Turkic": range(68608, 68688),
|
235 |
+
"Old Hungarian": range(68736, 68864),
|
236 |
+
"Hanifi Rohingya": range(68864, 68928),
|
237 |
+
"Rumi Numeral Symbols": range(69216, 69248),
|
238 |
+
"Yezidi": range(69248, 69312),
|
239 |
+
"Arabic Extended-C": range(69312, 69376),
|
240 |
+
"Old Sogdian": range(69376, 69424),
|
241 |
+
"Sogdian": range(69424, 69488),
|
242 |
+
"Old Uyghur": range(69488, 69552),
|
243 |
+
"Chorasmian": range(69552, 69600),
|
244 |
+
"Elymaic": range(69600, 69632),
|
245 |
+
"Brahmi": range(69632, 69760),
|
246 |
+
"Kaithi": range(69760, 69840),
|
247 |
+
"Sora Sompeng": range(69840, 69888),
|
248 |
+
"Chakma": range(69888, 69968),
|
249 |
+
"Mahajani": range(69968, 70016),
|
250 |
+
"Sharada": range(70016, 70112),
|
251 |
+
"Sinhala Archaic Numbers": range(70112, 70144),
|
252 |
+
"Khojki": range(70144, 70224),
|
253 |
+
"Multani": range(70272, 70320),
|
254 |
+
"Khudawadi": range(70320, 70400),
|
255 |
+
"Grantha": range(70400, 70528),
|
256 |
+
"Newa": range(70656, 70784),
|
257 |
+
"Tirhuta": range(70784, 70880),
|
258 |
+
"Siddham": range(71040, 71168),
|
259 |
+
"Modi": range(71168, 71264),
|
260 |
+
"Mongolian Supplement": range(71264, 71296),
|
261 |
+
"Takri": range(71296, 71376),
|
262 |
+
"Ahom": range(71424, 71504),
|
263 |
+
"Dogra": range(71680, 71760),
|
264 |
+
"Warang Citi": range(71840, 71936),
|
265 |
+
"Dives Akuru": range(71936, 72032),
|
266 |
+
"Nandinagari": range(72096, 72192),
|
267 |
+
"Zanabazar Square": range(72192, 72272),
|
268 |
+
"Soyombo": range(72272, 72368),
|
269 |
+
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
|
270 |
+
"Pau Cin Hau": range(72384, 72448),
|
271 |
+
"Devanagari Extended-A": range(72448, 72544),
|
272 |
+
"Bhaiksuki": range(72704, 72816),
|
273 |
+
"Marchen": range(72816, 72896),
|
274 |
+
"Masaram Gondi": range(72960, 73056),
|
275 |
+
"Gunjala Gondi": range(73056, 73136),
|
276 |
+
"Makasar": range(73440, 73472),
|
277 |
+
"Kawi": range(73472, 73568),
|
278 |
+
"Lisu Supplement": range(73648, 73664),
|
279 |
+
"Tamil Supplement": range(73664, 73728),
|
280 |
+
"Cuneiform": range(73728, 74752),
|
281 |
+
"Cuneiform Numbers and Punctuation": range(74752, 74880),
|
282 |
+
"Early Dynastic Cuneiform": range(74880, 75088),
|
283 |
+
"Cypro-Minoan": range(77712, 77824),
|
284 |
+
"Egyptian Hieroglyphs": range(77824, 78896),
|
285 |
+
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
|
286 |
+
"Anatolian Hieroglyphs": range(82944, 83584),
|
287 |
+
"Bamum Supplement": range(92160, 92736),
|
288 |
+
"Mro": range(92736, 92784),
|
289 |
+
"Tangsa": range(92784, 92880),
|
290 |
+
"Bassa Vah": range(92880, 92928),
|
291 |
+
"Pahawh Hmong": range(92928, 93072),
|
292 |
+
"Medefaidrin": range(93760, 93856),
|
293 |
+
"Miao": range(93952, 94112),
|
294 |
+
"Ideographic Symbols and Punctuation": range(94176, 94208),
|
295 |
+
"Tangut": range(94208, 100352),
|
296 |
+
"Tangut Components": range(100352, 101120),
|
297 |
+
"Khitan Small Script": range(101120, 101632),
|
298 |
+
"Tangut Supplement": range(101632, 101760),
|
299 |
+
"Kana Extended-B": range(110576, 110592),
|
300 |
+
"Kana Supplement": range(110592, 110848),
|
301 |
+
"Kana Extended-A": range(110848, 110896),
|
302 |
+
"Small Kana Extension": range(110896, 110960),
|
303 |
+
"Nushu": range(110960, 111360),
|
304 |
+
"Duployan": range(113664, 113824),
|
305 |
+
"Shorthand Format Controls": range(113824, 113840),
|
306 |
+
"Znamenny Musical Notation": range(118528, 118736),
|
307 |
+
"Byzantine Musical Symbols": range(118784, 119040),
|
308 |
+
"Musical Symbols": range(119040, 119296),
|
309 |
+
"Ancient Greek Musical Notation": range(119296, 119376),
|
310 |
+
"Kaktovik Numerals": range(119488, 119520),
|
311 |
+
"Mayan Numerals": range(119520, 119552),
|
312 |
+
"Tai Xuan Jing Symbols": range(119552, 119648),
|
313 |
+
"Counting Rod Numerals": range(119648, 119680),
|
314 |
+
"Mathematical Alphanumeric Symbols": range(119808, 120832),
|
315 |
+
"Sutton SignWriting": range(120832, 121520),
|
316 |
+
"Latin Extended-G": range(122624, 122880),
|
317 |
+
"Glagolitic Supplement": range(122880, 122928),
|
318 |
+
"Cyrillic Extended-D": range(122928, 123024),
|
319 |
+
"Nyiakeng Puachue Hmong": range(123136, 123216),
|
320 |
+
"Toto": range(123536, 123584),
|
321 |
+
"Wancho": range(123584, 123648),
|
322 |
+
"Nag Mundari": range(124112, 124160),
|
323 |
+
"Ethiopic Extended-B": range(124896, 124928),
|
324 |
+
"Mende Kikakui": range(124928, 125152),
|
325 |
+
"Adlam": range(125184, 125280),
|
326 |
+
"Indic Siyaq Numbers": range(126064, 126144),
|
327 |
+
"Ottoman Siyaq Numbers": range(126208, 126288),
|
328 |
+
"Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
|
329 |
+
"Mahjong Tiles": range(126976, 127024),
|
330 |
+
"Domino Tiles": range(127024, 127136),
|
331 |
+
"Playing Cards": range(127136, 127232),
|
332 |
+
"Enclosed Alphanumeric Supplement": range(127232, 127488),
|
333 |
+
"Enclosed Ideographic Supplement": range(127488, 127744),
|
334 |
+
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
|
335 |
+
"Emoticons range(Emoji)": range(128512, 128592),
|
336 |
+
"Ornamental Dingbats": range(128592, 128640),
|
337 |
+
"Transport and Map Symbols": range(128640, 128768),
|
338 |
+
"Alchemical Symbols": range(128768, 128896),
|
339 |
+
"Geometric Shapes Extended": range(128896, 129024),
|
340 |
+
"Supplemental Arrows-C": range(129024, 129280),
|
341 |
+
"Supplemental Symbols and Pictographs": range(129280, 129536),
|
342 |
+
"Chess Symbols": range(129536, 129648),
|
343 |
+
"Symbols and Pictographs Extended-A": range(129648, 129792),
|
344 |
+
"Symbols for Legacy Computing": range(129792, 130048),
|
345 |
+
"CJK Unified Ideographs Extension B": range(131072, 173792),
|
346 |
+
"CJK Unified Ideographs Extension C": range(173824, 177984),
|
347 |
+
"CJK Unified Ideographs Extension D": range(177984, 178208),
|
348 |
+
"CJK Unified Ideographs Extension E": range(178208, 183984),
|
349 |
+
"CJK Unified Ideographs Extension F": range(183984, 191472),
|
350 |
+
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
|
351 |
+
"CJK Unified Ideographs Extension G": range(196608, 201552),
|
352 |
+
"CJK Unified Ideographs Extension H": range(201552, 205744),
|
353 |
+
"Tags": range(917504, 917632),
|
354 |
+
"Variation Selectors Supplement": range(917760, 918000),
|
355 |
+
"Supplementary Private Use Area-A": range(983040, 1048576),
|
356 |
+
"Supplementary Private Use Area-B": range(1048576, 1114112),
|
357 |
+
}
|
358 |
+
|
359 |
+
|
360 |
+
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
361 |
+
"Supplement",
|
362 |
+
"Extended",
|
363 |
+
"Extensions",
|
364 |
+
"Modifier",
|
365 |
+
"Marks",
|
366 |
+
"Punctuation",
|
367 |
+
"Symbols",
|
368 |
+
"Forms",
|
369 |
+
"Operators",
|
370 |
+
"Miscellaneous",
|
371 |
+
"Drawing",
|
372 |
+
"Block",
|
373 |
+
"Shapes",
|
374 |
+
"Supplemental",
|
375 |
+
"Tags",
|
376 |
+
]
|
377 |
+
|
378 |
+
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
379 |
+
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
380 |
+
IGNORECASE,
|
381 |
+
)
|
382 |
+
|
383 |
+
IANA_NO_ALIASES = [
|
384 |
+
"cp720",
|
385 |
+
"cp737",
|
386 |
+
"cp856",
|
387 |
+
"cp874",
|
388 |
+
"cp875",
|
389 |
+
"cp1006",
|
390 |
+
"koi8_r",
|
391 |
+
"koi8_t",
|
392 |
+
"koi8_u",
|
393 |
+
]
|
394 |
+
|
395 |
+
IANA_SUPPORTED: List[str] = sorted(
|
396 |
+
filter(
|
397 |
+
lambda x: x.endswith("_codec") is False
|
398 |
+
and x not in {"rot_13", "tactis", "mbcs"},
|
399 |
+
list(set(aliases.values())) + IANA_NO_ALIASES,
|
400 |
+
)
|
401 |
+
)
|
402 |
+
|
403 |
+
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
404 |
+
|
405 |
+
# pre-computed code page that are similar using the function cp_similarity.
|
406 |
+
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
407 |
+
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
408 |
+
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
409 |
+
"cp1125": ["cp866"],
|
410 |
+
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
|
411 |
+
"cp1250": ["iso8859_2"],
|
412 |
+
"cp1251": ["kz1048", "ptcp154"],
|
413 |
+
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
|
414 |
+
"cp1253": ["iso8859_7"],
|
415 |
+
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
|
416 |
+
"cp1257": ["iso8859_13"],
|
417 |
+
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
|
418 |
+
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
|
419 |
+
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
|
420 |
+
"cp850": ["cp437", "cp857", "cp858", "cp865"],
|
421 |
+
"cp857": ["cp850", "cp858", "cp865"],
|
422 |
+
"cp858": ["cp437", "cp850", "cp857", "cp865"],
|
423 |
+
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
|
424 |
+
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
|
425 |
+
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
|
426 |
+
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
|
427 |
+
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
|
428 |
+
"cp866": ["cp1125"],
|
429 |
+
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
|
430 |
+
"iso8859_11": ["tis_620"],
|
431 |
+
"iso8859_13": ["cp1257"],
|
432 |
+
"iso8859_14": [
|
433 |
+
"iso8859_10",
|
434 |
+
"iso8859_15",
|
435 |
+
"iso8859_16",
|
436 |
+
"iso8859_3",
|
437 |
+
"iso8859_9",
|
438 |
+
"latin_1",
|
439 |
+
],
|
440 |
+
"iso8859_15": [
|
441 |
+
"cp1252",
|
442 |
+
"cp1254",
|
443 |
+
"iso8859_10",
|
444 |
+
"iso8859_14",
|
445 |
+
"iso8859_16",
|
446 |
+
"iso8859_3",
|
447 |
+
"iso8859_9",
|
448 |
+
"latin_1",
|
449 |
+
],
|
450 |
+
"iso8859_16": [
|
451 |
+
"iso8859_14",
|
452 |
+
"iso8859_15",
|
453 |
+
"iso8859_2",
|
454 |
+
"iso8859_3",
|
455 |
+
"iso8859_9",
|
456 |
+
"latin_1",
|
457 |
+
],
|
458 |
+
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
|
459 |
+
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
|
460 |
+
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
|
461 |
+
"iso8859_7": ["cp1253"],
|
462 |
+
"iso8859_9": [
|
463 |
+
"cp1252",
|
464 |
+
"cp1254",
|
465 |
+
"cp1258",
|
466 |
+
"iso8859_10",
|
467 |
+
"iso8859_14",
|
468 |
+
"iso8859_15",
|
469 |
+
"iso8859_16",
|
470 |
+
"iso8859_3",
|
471 |
+
"iso8859_4",
|
472 |
+
"latin_1",
|
473 |
+
],
|
474 |
+
"kz1048": ["cp1251", "ptcp154"],
|
475 |
+
"latin_1": [
|
476 |
+
"cp1252",
|
477 |
+
"cp1254",
|
478 |
+
"cp1258",
|
479 |
+
"iso8859_10",
|
480 |
+
"iso8859_14",
|
481 |
+
"iso8859_15",
|
482 |
+
"iso8859_16",
|
483 |
+
"iso8859_3",
|
484 |
+
"iso8859_4",
|
485 |
+
"iso8859_9",
|
486 |
+
],
|
487 |
+
"mac_iceland": ["mac_roman", "mac_turkish"],
|
488 |
+
"mac_roman": ["mac_iceland", "mac_turkish"],
|
489 |
+
"mac_turkish": ["mac_iceland", "mac_roman"],
|
490 |
+
"ptcp154": ["cp1251", "kz1048"],
|
491 |
+
"tis_620": ["iso8859_11"],
|
492 |
+
}
|
493 |
+
|
494 |
+
|
495 |
+
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
496 |
+
"iso2022_kr": "ISO-2022-KR",
|
497 |
+
"iso2022_jp": "ISO-2022-JP",
|
498 |
+
"euc_kr": "EUC-KR",
|
499 |
+
"tis_620": "TIS-620",
|
500 |
+
"utf_32": "UTF-32",
|
501 |
+
"euc_jp": "EUC-JP",
|
502 |
+
"koi8_r": "KOI8-R",
|
503 |
+
"iso8859_1": "ISO-8859-1",
|
504 |
+
"iso8859_2": "ISO-8859-2",
|
505 |
+
"iso8859_5": "ISO-8859-5",
|
506 |
+
"iso8859_6": "ISO-8859-6",
|
507 |
+
"iso8859_7": "ISO-8859-7",
|
508 |
+
"iso8859_8": "ISO-8859-8",
|
509 |
+
"utf_16": "UTF-16",
|
510 |
+
"cp855": "IBM855",
|
511 |
+
"mac_cyrillic": "MacCyrillic",
|
512 |
+
"gb2312": "GB2312",
|
513 |
+
"gb18030": "GB18030",
|
514 |
+
"cp932": "CP932",
|
515 |
+
"cp866": "IBM866",
|
516 |
+
"utf_8": "utf-8",
|
517 |
+
"utf_8_sig": "UTF-8-SIG",
|
518 |
+
"shift_jis": "SHIFT_JIS",
|
519 |
+
"big5": "Big5",
|
520 |
+
"cp1250": "windows-1250",
|
521 |
+
"cp1251": "windows-1251",
|
522 |
+
"cp1252": "Windows-1252",
|
523 |
+
"cp1253": "windows-1253",
|
524 |
+
"cp1255": "windows-1255",
|
525 |
+
"cp1256": "windows-1256",
|
526 |
+
"cp1254": "Windows-1254",
|
527 |
+
"cp949": "CP949",
|
528 |
+
}
|
529 |
+
|
530 |
+
|
531 |
+
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
532 |
+
"<",
|
533 |
+
">",
|
534 |
+
"=",
|
535 |
+
":",
|
536 |
+
"/",
|
537 |
+
"&",
|
538 |
+
";",
|
539 |
+
"{",
|
540 |
+
"}",
|
541 |
+
"[",
|
542 |
+
"]",
|
543 |
+
",",
|
544 |
+
"|",
|
545 |
+
'"',
|
546 |
+
"-",
|
547 |
+
}
|
548 |
+
|
549 |
+
|
550 |
+
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
551 |
+
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
552 |
+
|
553 |
+
# Logging LEVEL below DEBUG
|
554 |
+
TRACE: int = 5
|
555 |
+
|
556 |
+
|
557 |
+
# Language label that contain the em dash "—"
|
558 |
+
# character are to be considered alternative seq to origin
|
559 |
+
FREQUENCIES: Dict[str, List[str]] = {
|
560 |
+
"English": [
|
561 |
+
"e",
|
562 |
+
"a",
|
563 |
+
"t",
|
564 |
+
"i",
|
565 |
+
"o",
|
566 |
+
"n",
|
567 |
+
"s",
|
568 |
+
"r",
|
569 |
+
"h",
|
570 |
+
"l",
|
571 |
+
"d",
|
572 |
+
"c",
|
573 |
+
"u",
|
574 |
+
"m",
|
575 |
+
"f",
|
576 |
+
"p",
|
577 |
+
"g",
|
578 |
+
"w",
|
579 |
+
"y",
|
580 |
+
"b",
|
581 |
+
"v",
|
582 |
+
"k",
|
583 |
+
"x",
|
584 |
+
"j",
|
585 |
+
"z",
|
586 |
+
"q",
|
587 |
+
],
|
588 |
+
"English—": [
|
589 |
+
"e",
|
590 |
+
"a",
|
591 |
+
"t",
|
592 |
+
"i",
|
593 |
+
"o",
|
594 |
+
"n",
|
595 |
+
"s",
|
596 |
+
"r",
|
597 |
+
"h",
|
598 |
+
"l",
|
599 |
+
"d",
|
600 |
+
"c",
|
601 |
+
"m",
|
602 |
+
"u",
|
603 |
+
"f",
|
604 |
+
"p",
|
605 |
+
"g",
|
606 |
+
"w",
|
607 |
+
"b",
|
608 |
+
"y",
|
609 |
+
"v",
|
610 |
+
"k",
|
611 |
+
"j",
|
612 |
+
"x",
|
613 |
+
"z",
|
614 |
+
"q",
|
615 |
+
],
|
616 |
+
"German": [
|
617 |
+
"e",
|
618 |
+
"n",
|
619 |
+
"i",
|
620 |
+
"r",
|
621 |
+
"s",
|
622 |
+
"t",
|
623 |
+
"a",
|
624 |
+
"d",
|
625 |
+
"h",
|
626 |
+
"u",
|
627 |
+
"l",
|
628 |
+
"g",
|
629 |
+
"o",
|
630 |
+
"c",
|
631 |
+
"m",
|
632 |
+
"b",
|
633 |
+
"f",
|
634 |
+
"k",
|
635 |
+
"w",
|
636 |
+
"z",
|
637 |
+
"p",
|
638 |
+
"v",
|
639 |
+
"ü",
|
640 |
+
"ä",
|
641 |
+
"ö",
|
642 |
+
"j",
|
643 |
+
],
|
644 |
+
"French": [
|
645 |
+
"e",
|
646 |
+
"a",
|
647 |
+
"s",
|
648 |
+
"n",
|
649 |
+
"i",
|
650 |
+
"t",
|
651 |
+
"r",
|
652 |
+
"l",
|
653 |
+
"u",
|
654 |
+
"o",
|
655 |
+
"d",
|
656 |
+
"c",
|
657 |
+
"p",
|
658 |
+
"m",
|
659 |
+
"é",
|
660 |
+
"v",
|
661 |
+
"g",
|
662 |
+
"f",
|
663 |
+
"b",
|
664 |
+
"h",
|
665 |
+
"q",
|
666 |
+
"à",
|
667 |
+
"x",
|
668 |
+
"è",
|
669 |
+
"y",
|
670 |
+
"j",
|
671 |
+
],
|
672 |
+
"Dutch": [
|
673 |
+
"e",
|
674 |
+
"n",
|
675 |
+
"a",
|
676 |
+
"i",
|
677 |
+
"r",
|
678 |
+
"t",
|
679 |
+
"o",
|
680 |
+
"d",
|
681 |
+
"s",
|
682 |
+
"l",
|
683 |
+
"g",
|
684 |
+
"h",
|
685 |
+
"v",
|
686 |
+
"m",
|
687 |
+
"u",
|
688 |
+
"k",
|
689 |
+
"c",
|
690 |
+
"p",
|
691 |
+
"b",
|
692 |
+
"w",
|
693 |
+
"j",
|
694 |
+
"z",
|
695 |
+
"f",
|
696 |
+
"y",
|
697 |
+
"x",
|
698 |
+
"ë",
|
699 |
+
],
|
700 |
+
"Italian": [
|
701 |
+
"e",
|
702 |
+
"i",
|
703 |
+
"a",
|
704 |
+
"o",
|
705 |
+
"n",
|
706 |
+
"l",
|
707 |
+
"t",
|
708 |
+
"r",
|
709 |
+
"s",
|
710 |
+
"c",
|
711 |
+
"d",
|
712 |
+
"u",
|
713 |
+
"p",
|
714 |
+
"m",
|
715 |
+
"g",
|
716 |
+
"v",
|
717 |
+
"f",
|
718 |
+
"b",
|
719 |
+
"z",
|
720 |
+
"h",
|
721 |
+
"q",
|
722 |
+
"è",
|
723 |
+
"à",
|
724 |
+
"k",
|
725 |
+
"y",
|
726 |
+
"ò",
|
727 |
+
],
|
728 |
+
"Polish": [
|
729 |
+
"a",
|
730 |
+
"i",
|
731 |
+
"o",
|
732 |
+
"e",
|
733 |
+
"n",
|
734 |
+
"r",
|
735 |
+
"z",
|
736 |
+
"w",
|
737 |
+
"s",
|
738 |
+
"c",
|
739 |
+
"t",
|
740 |
+
"k",
|
741 |
+
"y",
|
742 |
+
"d",
|
743 |
+
"p",
|
744 |
+
"m",
|
745 |
+
"u",
|
746 |
+
"l",
|
747 |
+
"j",
|
748 |
+
"ł",
|
749 |
+
"g",
|
750 |
+
"b",
|
751 |
+
"h",
|
752 |
+
"ą",
|
753 |
+
"ę",
|
754 |
+
"ó",
|
755 |
+
],
|
756 |
+
"Spanish": [
|
757 |
+
"e",
|
758 |
+
"a",
|
759 |
+
"o",
|
760 |
+
"n",
|
761 |
+
"s",
|
762 |
+
"r",
|
763 |
+
"i",
|
764 |
+
"l",
|
765 |
+
"d",
|
766 |
+
"t",
|
767 |
+
"c",
|
768 |
+
"u",
|
769 |
+
"m",
|
770 |
+
"p",
|
771 |
+
"b",
|
772 |
+
"g",
|
773 |
+
"v",
|
774 |
+
"f",
|
775 |
+
"y",
|
776 |
+
"ó",
|
777 |
+
"h",
|
778 |
+
"q",
|
779 |
+
"í",
|
780 |
+
"j",
|
781 |
+
"z",
|
782 |
+
"á",
|
783 |
+
],
|
784 |
+
"Russian": [
|
785 |
+
"о",
|
786 |
+
"а",
|
787 |
+
"е",
|
788 |
+
"и",
|
789 |
+
"н",
|
790 |
+
"с",
|
791 |
+
"т",
|
792 |
+
"р",
|
793 |
+
"в",
|
794 |
+
"л",
|
795 |
+
"к",
|
796 |
+
"м",
|
797 |
+
"д",
|
798 |
+
"п",
|
799 |
+
"у",
|
800 |
+
"г",
|
801 |
+
"я",
|
802 |
+
"ы",
|
803 |
+
"з",
|
804 |
+
"б",
|
805 |
+
"й",
|
806 |
+
"ь",
|
807 |
+
"ч",
|
808 |
+
"х",
|
809 |
+
"ж",
|
810 |
+
"ц",
|
811 |
+
],
|
812 |
+
# Jap-Kanji
|
813 |
+
"Japanese": [
|
814 |
+
"人",
|
815 |
+
"一",
|
816 |
+
"大",
|
817 |
+
"亅",
|
818 |
+
"丁",
|
819 |
+
"丨",
|
820 |
+
"竹",
|
821 |
+
"笑",
|
822 |
+
"口",
|
823 |
+
"日",
|
824 |
+
"今",
|
825 |
+
"二",
|
826 |
+
"彳",
|
827 |
+
"行",
|
828 |
+
"十",
|
829 |
+
"土",
|
830 |
+
"丶",
|
831 |
+
"寸",
|
832 |
+
"寺",
|
833 |
+
"時",
|
834 |
+
"乙",
|
835 |
+
"丿",
|
836 |
+
"乂",
|
837 |
+
"气",
|
838 |
+
"気",
|
839 |
+
"冂",
|
840 |
+
"巾",
|
841 |
+
"亠",
|
842 |
+
"市",
|
843 |
+
"目",
|
844 |
+
"儿",
|
845 |
+
"見",
|
846 |
+
"八",
|
847 |
+
"小",
|
848 |
+
"凵",
|
849 |
+
"県",
|
850 |
+
"月",
|
851 |
+
"彐",
|
852 |
+
"門",
|
853 |
+
"間",
|
854 |
+
"木",
|
855 |
+
"東",
|
856 |
+
"山",
|
857 |
+
"出",
|
858 |
+
"本",
|
859 |
+
"中",
|
860 |
+
"刀",
|
861 |
+
"分",
|
862 |
+
"耳",
|
863 |
+
"又",
|
864 |
+
"取",
|
865 |
+
"最",
|
866 |
+
"言",
|
867 |
+
"田",
|
868 |
+
"心",
|
869 |
+
"思",
|
870 |
+
"刂",
|
871 |
+
"前",
|
872 |
+
"京",
|
873 |
+
"尹",
|
874 |
+
"事",
|
875 |
+
"生",
|
876 |
+
"厶",
|
877 |
+
"云",
|
878 |
+
"会",
|
879 |
+
"未",
|
880 |
+
"来",
|
881 |
+
"白",
|
882 |
+
"冫",
|
883 |
+
"楽",
|
884 |
+
"灬",
|
885 |
+
"馬",
|
886 |
+
"尸",
|
887 |
+
"尺",
|
888 |
+
"駅",
|
889 |
+
"明",
|
890 |
+
"耂",
|
891 |
+
"者",
|
892 |
+
"了",
|
893 |
+
"阝",
|
894 |
+
"都",
|
895 |
+
"高",
|
896 |
+
"卜",
|
897 |
+
"占",
|
898 |
+
"厂",
|
899 |
+
"广",
|
900 |
+
"店",
|
901 |
+
"子",
|
902 |
+
"申",
|
903 |
+
"奄",
|
904 |
+
"亻",
|
905 |
+
"俺",
|
906 |
+
"上",
|
907 |
+
"方",
|
908 |
+
"冖",
|
909 |
+
"学",
|
910 |
+
"衣",
|
911 |
+
"艮",
|
912 |
+
"食",
|
913 |
+
"自",
|
914 |
+
],
|
915 |
+
# Jap-Katakana
|
916 |
+
"Japanese—": [
|
917 |
+
"ー",
|
918 |
+
"ン",
|
919 |
+
"ス",
|
920 |
+
"・",
|
921 |
+
"ル",
|
922 |
+
"ト",
|
923 |
+
"リ",
|
924 |
+
"イ",
|
925 |
+
"ア",
|
926 |
+
"ラ",
|
927 |
+
"ッ",
|
928 |
+
"ク",
|
929 |
+
"ド",
|
930 |
+
"シ",
|
931 |
+
"レ",
|
932 |
+
"ジ",
|
933 |
+
"タ",
|
934 |
+
"フ",
|
935 |
+
"ロ",
|
936 |
+
"カ",
|
937 |
+
"テ",
|
938 |
+
"マ",
|
939 |
+
"ィ",
|
940 |
+
"グ",
|
941 |
+
"バ",
|
942 |
+
"ム",
|
943 |
+
"プ",
|
944 |
+
"オ",
|
945 |
+
"コ",
|
946 |
+
"デ",
|
947 |
+
"ニ",
|
948 |
+
"ウ",
|
949 |
+
"メ",
|
950 |
+
"サ",
|
951 |
+
"ビ",
|
952 |
+
"ナ",
|
953 |
+
"ブ",
|
954 |
+
"ャ",
|
955 |
+
"エ",
|
956 |
+
"ュ",
|
957 |
+
"チ",
|
958 |
+
"キ",
|
959 |
+
"ズ",
|
960 |
+
"ダ",
|
961 |
+
"パ",
|
962 |
+
"ミ",
|
963 |
+
"ェ",
|
964 |
+
"ョ",
|
965 |
+
"ハ",
|
966 |
+
"セ",
|
967 |
+
"ベ",
|
968 |
+
"ガ",
|
969 |
+
"モ",
|
970 |
+
"ツ",
|
971 |
+
"ネ",
|
972 |
+
"ボ",
|
973 |
+
"ソ",
|
974 |
+
"ノ",
|
975 |
+
"ァ",
|
976 |
+
"ヴ",
|
977 |
+
"ワ",
|
978 |
+
"ポ",
|
979 |
+
"ペ",
|
980 |
+
"ピ",
|
981 |
+
"ケ",
|
982 |
+
"ゴ",
|
983 |
+
"ギ",
|
984 |
+
"ザ",
|
985 |
+
"ホ",
|
986 |
+
"ゲ",
|
987 |
+
"ォ",
|
988 |
+
"ヤ",
|
989 |
+
"ヒ",
|
990 |
+
"ユ",
|
991 |
+
"ヨ",
|
992 |
+
"ヘ",
|
993 |
+
"ゼ",
|
994 |
+
"ヌ",
|
995 |
+
"ゥ",
|
996 |
+
"ゾ",
|
997 |
+
"ヶ",
|
998 |
+
"ヂ",
|
999 |
+
"ヲ",
|
1000 |
+
"ヅ",
|
1001 |
+
"ヵ",
|
1002 |
+
"ヱ",
|
1003 |
+
"ヰ",
|
1004 |
+
"ヮ",
|
1005 |
+
"ヽ",
|
1006 |
+
"゠",
|
1007 |
+
"ヾ",
|
1008 |
+
"ヷ",
|
1009 |
+
"ヿ",
|
1010 |
+
"ヸ",
|
1011 |
+
"ヹ",
|
1012 |
+
"ヺ",
|
1013 |
+
],
|
1014 |
+
# Jap-Hiragana
|
1015 |
+
"Japanese——": [
|
1016 |
+
"の",
|
1017 |
+
"に",
|
1018 |
+
"る",
|
1019 |
+
"た",
|
1020 |
+
"と",
|
1021 |
+
"は",
|
1022 |
+
"し",
|
1023 |
+
"い",
|
1024 |
+
"を",
|
1025 |
+
"で",
|
1026 |
+
"て",
|
1027 |
+
"が",
|
1028 |
+
"な",
|
1029 |
+
"れ",
|
1030 |
+
"か",
|
1031 |
+
"ら",
|
1032 |
+
"さ",
|
1033 |
+
"っ",
|
1034 |
+
"り",
|
1035 |
+
"す",
|
1036 |
+
"あ",
|
1037 |
+
"も",
|
1038 |
+
"こ",
|
1039 |
+
"ま",
|
1040 |
+
"う",
|
1041 |
+
"く",
|
1042 |
+
"よ",
|
1043 |
+
"き",
|
1044 |
+
"ん",
|
1045 |
+
"め",
|
1046 |
+
"お",
|
1047 |
+
"け",
|
1048 |
+
"そ",
|
1049 |
+
"つ",
|
1050 |
+
"だ",
|
1051 |
+
"や",
|
1052 |
+
"え",
|
1053 |
+
"ど",
|
1054 |
+
"わ",
|
1055 |
+
"ち",
|
1056 |
+
"み",
|
1057 |
+
"せ",
|
1058 |
+
"じ",
|
1059 |
+
"ば",
|
1060 |
+
"へ",
|
1061 |
+
"び",
|
1062 |
+
"ず",
|
1063 |
+
"ろ",
|
1064 |
+
"ほ",
|
1065 |
+
"げ",
|
1066 |
+
"む",
|
1067 |
+
"べ",
|
1068 |
+
"ひ",
|
1069 |
+
"ょ",
|
1070 |
+
"ゆ",
|
1071 |
+
"ぶ",
|
1072 |
+
"ご",
|
1073 |
+
"ゃ",
|
1074 |
+
"ね",
|
1075 |
+
"ふ",
|
1076 |
+
"ぐ",
|
1077 |
+
"ぎ",
|
1078 |
+
"ぼ",
|
1079 |
+
"ゅ",
|
1080 |
+
"づ",
|
1081 |
+
"ざ",
|
1082 |
+
"ぞ",
|
1083 |
+
"ぬ",
|
1084 |
+
"ぜ",
|
1085 |
+
"ぱ",
|
1086 |
+
"ぽ",
|
1087 |
+
"ぷ",
|
1088 |
+
"ぴ",
|
1089 |
+
"ぃ",
|
1090 |
+
"ぁ",
|
1091 |
+
"ぇ",
|
1092 |
+
"ぺ",
|
1093 |
+
"ゞ",
|
1094 |
+
"ぢ",
|
1095 |
+
"ぉ",
|
1096 |
+
"ぅ",
|
1097 |
+
"ゐ",
|
1098 |
+
"ゝ",
|
1099 |
+
"ゑ",
|
1100 |
+
"゛",
|
1101 |
+
"゜",
|
1102 |
+
"ゎ",
|
1103 |
+
"ゔ",
|
1104 |
+
"゚",
|
1105 |
+
"ゟ",
|
1106 |
+
"゙",
|
1107 |
+
"ゕ",
|
1108 |
+
"ゖ",
|
1109 |
+
],
|
1110 |
+
"Portuguese": [
|
1111 |
+
"a",
|
1112 |
+
"e",
|
1113 |
+
"o",
|
1114 |
+
"s",
|
1115 |
+
"i",
|
1116 |
+
"r",
|
1117 |
+
"d",
|
1118 |
+
"n",
|
1119 |
+
"t",
|
1120 |
+
"m",
|
1121 |
+
"u",
|
1122 |
+
"c",
|
1123 |
+
"l",
|
1124 |
+
"p",
|
1125 |
+
"g",
|
1126 |
+
"v",
|
1127 |
+
"b",
|
1128 |
+
"f",
|
1129 |
+
"h",
|
1130 |
+
"ã",
|
1131 |
+
"q",
|
1132 |
+
"é",
|
1133 |
+
"ç",
|
1134 |
+
"á",
|
1135 |
+
"z",
|
1136 |
+
"í",
|
1137 |
+
],
|
1138 |
+
"Swedish": [
|
1139 |
+
"e",
|
1140 |
+
"a",
|
1141 |
+
"n",
|
1142 |
+
"r",
|
1143 |
+
"t",
|
1144 |
+
"s",
|
1145 |
+
"i",
|
1146 |
+
"l",
|
1147 |
+
"d",
|
1148 |
+
"o",
|
1149 |
+
"m",
|
1150 |
+
"k",
|
1151 |
+
"g",
|
1152 |
+
"v",
|
1153 |
+
"h",
|
1154 |
+
"f",
|
1155 |
+
"u",
|
1156 |
+
"p",
|
1157 |
+
"ä",
|
1158 |
+
"c",
|
1159 |
+
"b",
|
1160 |
+
"ö",
|
1161 |
+
"å",
|
1162 |
+
"y",
|
1163 |
+
"j",
|
1164 |
+
"x",
|
1165 |
+
],
|
1166 |
+
"Chinese": [
|
1167 |
+
"的",
|
1168 |
+
"一",
|
1169 |
+
"是",
|
1170 |
+
"不",
|
1171 |
+
"了",
|
1172 |
+
"在",
|
1173 |
+
"人",
|
1174 |
+
"有",
|
1175 |
+
"我",
|
1176 |
+
"他",
|
1177 |
+
"这",
|
1178 |
+
"个",
|
1179 |
+
"们",
|
1180 |
+
"中",
|
1181 |
+
"来",
|
1182 |
+
"上",
|
1183 |
+
"大",
|
1184 |
+
"为",
|
1185 |
+
"和",
|
1186 |
+
"国",
|
1187 |
+
"地",
|
1188 |
+
"到",
|
1189 |
+
"以",
|
1190 |
+
"说",
|
1191 |
+
"时",
|
1192 |
+
"要",
|
1193 |
+
"就",
|
1194 |
+
"出",
|
1195 |
+
"会",
|
1196 |
+
"可",
|
1197 |
+
"也",
|
1198 |
+
"你",
|
1199 |
+
"对",
|
1200 |
+
"生",
|
1201 |
+
"能",
|
1202 |
+
"而",
|
1203 |
+
"子",
|
1204 |
+
"那",
|
1205 |
+
"得",
|
1206 |
+
"于",
|
1207 |
+
"着",
|
1208 |
+
"下",
|
1209 |
+
"自",
|
1210 |
+
"之",
|
1211 |
+
"年",
|
1212 |
+
"过",
|
1213 |
+
"发",
|
1214 |
+
"后",
|
1215 |
+
"作",
|
1216 |
+
"里",
|
1217 |
+
"用",
|
1218 |
+
"道",
|
1219 |
+
"行",
|
1220 |
+
"所",
|
1221 |
+
"然",
|
1222 |
+
"家",
|
1223 |
+
"种",
|
1224 |
+
"事",
|
1225 |
+
"成",
|
1226 |
+
"方",
|
1227 |
+
"多",
|
1228 |
+
"经",
|
1229 |
+
"么",
|
1230 |
+
"去",
|
1231 |
+
"法",
|
1232 |
+
"学",
|
1233 |
+
"如",
|
1234 |
+
"都",
|
1235 |
+
"同",
|
1236 |
+
"现",
|
1237 |
+
"当",
|
1238 |
+
"没",
|
1239 |
+
"动",
|
1240 |
+
"面",
|
1241 |
+
"起",
|
1242 |
+
"看",
|
1243 |
+
"定",
|
1244 |
+
"天",
|
1245 |
+
"分",
|
1246 |
+
"还",
|
1247 |
+
"进",
|
1248 |
+
"好",
|
1249 |
+
"小",
|
1250 |
+
"部",
|
1251 |
+
"其",
|
1252 |
+
"些",
|
1253 |
+
"主",
|
1254 |
+
"样",
|
1255 |
+
"理",
|
1256 |
+
"心",
|
1257 |
+
"她",
|
1258 |
+
"本",
|
1259 |
+
"前",
|
1260 |
+
"开",
|
1261 |
+
"但",
|
1262 |
+
"因",
|
1263 |
+
"只",
|
1264 |
+
"从",
|
1265 |
+
"想",
|
1266 |
+
"实",
|
1267 |
+
],
|
1268 |
+
"Ukrainian": [
|
1269 |
+
"о",
|
1270 |
+
"а",
|
1271 |
+
"н",
|
1272 |
+
"і",
|
1273 |
+
"и",
|
1274 |
+
"р",
|
1275 |
+
"в",
|
1276 |
+
"т",
|
1277 |
+
"е",
|
1278 |
+
"с",
|
1279 |
+
"к",
|
1280 |
+
"л",
|
1281 |
+
"у",
|
1282 |
+
"д",
|
1283 |
+
"м",
|
1284 |
+
"п",
|
1285 |
+
"з",
|
1286 |
+
"я",
|
1287 |
+
"ь",
|
1288 |
+
"б",
|
1289 |
+
"г",
|
1290 |
+
"й",
|
1291 |
+
"ч",
|
1292 |
+
"х",
|
1293 |
+
"ц",
|
1294 |
+
"ї",
|
1295 |
+
],
|
1296 |
+
"Norwegian": [
|
1297 |
+
"e",
|
1298 |
+
"r",
|
1299 |
+
"n",
|
1300 |
+
"t",
|
1301 |
+
"a",
|
1302 |
+
"s",
|
1303 |
+
"i",
|
1304 |
+
"o",
|
1305 |
+
"l",
|
1306 |
+
"d",
|
1307 |
+
"g",
|
1308 |
+
"k",
|
1309 |
+
"m",
|
1310 |
+
"v",
|
1311 |
+
"f",
|
1312 |
+
"p",
|
1313 |
+
"u",
|
1314 |
+
"b",
|
1315 |
+
"h",
|
1316 |
+
"å",
|
1317 |
+
"y",
|
1318 |
+
"j",
|
1319 |
+
"ø",
|
1320 |
+
"c",
|
1321 |
+
"æ",
|
1322 |
+
"w",
|
1323 |
+
],
|
1324 |
+
"Finnish": [
|
1325 |
+
"a",
|
1326 |
+
"i",
|
1327 |
+
"n",
|
1328 |
+
"t",
|
1329 |
+
"e",
|
1330 |
+
"s",
|
1331 |
+
"l",
|
1332 |
+
"o",
|
1333 |
+
"u",
|
1334 |
+
"k",
|
1335 |
+
"ä",
|
1336 |
+
"m",
|
1337 |
+
"r",
|
1338 |
+
"v",
|
1339 |
+
"j",
|
1340 |
+
"h",
|
1341 |
+
"p",
|
1342 |
+
"y",
|
1343 |
+
"d",
|
1344 |
+
"ö",
|
1345 |
+
"g",
|
1346 |
+
"c",
|
1347 |
+
"b",
|
1348 |
+
"f",
|
1349 |
+
"w",
|
1350 |
+
"z",
|
1351 |
+
],
|
1352 |
+
"Vietnamese": [
|
1353 |
+
"n",
|
1354 |
+
"h",
|
1355 |
+
"t",
|
1356 |
+
"i",
|
1357 |
+
"c",
|
1358 |
+
"g",
|
1359 |
+
"a",
|
1360 |
+
"o",
|
1361 |
+
"u",
|
1362 |
+
"m",
|
1363 |
+
"l",
|
1364 |
+
"r",
|
1365 |
+
"à",
|
1366 |
+
"đ",
|
1367 |
+
"s",
|
1368 |
+
"e",
|
1369 |
+
"v",
|
1370 |
+
"p",
|
1371 |
+
"b",
|
1372 |
+
"y",
|
1373 |
+
"ư",
|
1374 |
+
"d",
|
1375 |
+
"á",
|
1376 |
+
"k",
|
1377 |
+
"ộ",
|
1378 |
+
"ế",
|
1379 |
+
],
|
1380 |
+
"Czech": [
|
1381 |
+
"o",
|
1382 |
+
"e",
|
1383 |
+
"a",
|
1384 |
+
"n",
|
1385 |
+
"t",
|
1386 |
+
"s",
|
1387 |
+
"i",
|
1388 |
+
"l",
|
1389 |
+
"v",
|
1390 |
+
"r",
|
1391 |
+
"k",
|
1392 |
+
"d",
|
1393 |
+
"u",
|
1394 |
+
"m",
|
1395 |
+
"p",
|
1396 |
+
"í",
|
1397 |
+
"c",
|
1398 |
+
"h",
|
1399 |
+
"z",
|
1400 |
+
"á",
|
1401 |
+
"y",
|
1402 |
+
"j",
|
1403 |
+
"b",
|
1404 |
+
"ě",
|
1405 |
+
"é",
|
1406 |
+
"ř",
|
1407 |
+
],
|
1408 |
+
"Hungarian": [
|
1409 |
+
"e",
|
1410 |
+
"a",
|
1411 |
+
"t",
|
1412 |
+
"l",
|
1413 |
+
"s",
|
1414 |
+
"n",
|
1415 |
+
"k",
|
1416 |
+
"r",
|
1417 |
+
"i",
|
1418 |
+
"o",
|
1419 |
+
"z",
|
1420 |
+
"á",
|
1421 |
+
"é",
|
1422 |
+
"g",
|
1423 |
+
"m",
|
1424 |
+
"b",
|
1425 |
+
"y",
|
1426 |
+
"v",
|
1427 |
+
"d",
|
1428 |
+
"h",
|
1429 |
+
"u",
|
1430 |
+
"p",
|
1431 |
+
"j",
|
1432 |
+
"ö",
|
1433 |
+
"f",
|
1434 |
+
"c",
|
1435 |
+
],
|
1436 |
+
"Korean": [
|
1437 |
+
"이",
|
1438 |
+
"다",
|
1439 |
+
"에",
|
1440 |
+
"의",
|
1441 |
+
"는",
|
1442 |
+
"로",
|
1443 |
+
"하",
|
1444 |
+
"을",
|
1445 |
+
"가",
|
1446 |
+
"고",
|
1447 |
+
"지",
|
1448 |
+
"서",
|
1449 |
+
"한",
|
1450 |
+
"은",
|
1451 |
+
"기",
|
1452 |
+
"으",
|
1453 |
+
"년",
|
1454 |
+
"대",
|
1455 |
+
"사",
|
1456 |
+
"시",
|
1457 |
+
"를",
|
1458 |
+
"리",
|
1459 |
+
"도",
|
1460 |
+
"인",
|
1461 |
+
"스",
|
1462 |
+
"일",
|
1463 |
+
],
|
1464 |
+
"Indonesian": [
|
1465 |
+
"a",
|
1466 |
+
"n",
|
1467 |
+
"e",
|
1468 |
+
"i",
|
1469 |
+
"r",
|
1470 |
+
"t",
|
1471 |
+
"u",
|
1472 |
+
"s",
|
1473 |
+
"d",
|
1474 |
+
"k",
|
1475 |
+
"m",
|
1476 |
+
"l",
|
1477 |
+
"g",
|
1478 |
+
"p",
|
1479 |
+
"b",
|
1480 |
+
"o",
|
1481 |
+
"h",
|
1482 |
+
"y",
|
1483 |
+
"j",
|
1484 |
+
"c",
|
1485 |
+
"w",
|
1486 |
+
"f",
|
1487 |
+
"v",
|
1488 |
+
"z",
|
1489 |
+
"x",
|
1490 |
+
"q",
|
1491 |
+
],
|
1492 |
+
"Turkish": [
|
1493 |
+
"a",
|
1494 |
+
"e",
|
1495 |
+
"i",
|
1496 |
+
"n",
|
1497 |
+
"r",
|
1498 |
+
"l",
|
1499 |
+
"ı",
|
1500 |
+
"k",
|
1501 |
+
"d",
|
1502 |
+
"t",
|
1503 |
+
"s",
|
1504 |
+
"m",
|
1505 |
+
"y",
|
1506 |
+
"u",
|
1507 |
+
"o",
|
1508 |
+
"b",
|
1509 |
+
"ü",
|
1510 |
+
"ş",
|
1511 |
+
"v",
|
1512 |
+
"g",
|
1513 |
+
"z",
|
1514 |
+
"h",
|
1515 |
+
"c",
|
1516 |
+
"p",
|
1517 |
+
"ç",
|
1518 |
+
"ğ",
|
1519 |
+
],
|
1520 |
+
"Romanian": [
|
1521 |
+
"e",
|
1522 |
+
"i",
|
1523 |
+
"a",
|
1524 |
+
"r",
|
1525 |
+
"n",
|
1526 |
+
"t",
|
1527 |
+
"u",
|
1528 |
+
"l",
|
1529 |
+
"o",
|
1530 |
+
"c",
|
1531 |
+
"s",
|
1532 |
+
"d",
|
1533 |
+
"p",
|
1534 |
+
"m",
|
1535 |
+
"ă",
|
1536 |
+
"f",
|
1537 |
+
"v",
|
1538 |
+
"î",
|
1539 |
+
"g",
|
1540 |
+
"b",
|
1541 |
+
"ș",
|
1542 |
+
"ț",
|
1543 |
+
"z",
|
1544 |
+
"h",
|
1545 |
+
"â",
|
1546 |
+
"j",
|
1547 |
+
],
|
1548 |
+
"Farsi": [
|
1549 |
+
"ا",
|
1550 |
+
"ی",
|
1551 |
+
"ر",
|
1552 |
+
"د",
|
1553 |
+
"ن",
|
1554 |
+
"ه",
|
1555 |
+
"و",
|
1556 |
+
"م",
|
1557 |
+
"ت",
|
1558 |
+
"ب",
|
1559 |
+
"س",
|
1560 |
+
"ل",
|
1561 |
+
"ک",
|
1562 |
+
"ش",
|
1563 |
+
"ز",
|
1564 |
+
"ف",
|
1565 |
+
"گ",
|
1566 |
+
"ع",
|
1567 |
+
"خ",
|
1568 |
+
"ق",
|
1569 |
+
"ج",
|
1570 |
+
"آ",
|
1571 |
+
"پ",
|
1572 |
+
"ح",
|
1573 |
+
"ط",
|
1574 |
+
"ص",
|
1575 |
+
],
|
1576 |
+
"Arabic": [
|
1577 |
+
"ا",
|
1578 |
+
"ل",
|
1579 |
+
"ي",
|
1580 |
+
"م",
|
1581 |
+
"و",
|
1582 |
+
"ن",
|
1583 |
+
"ر",
|
1584 |
+
"ت",
|
1585 |
+
"ب",
|
1586 |
+
"ة",
|
1587 |
+
"ع",
|
1588 |
+
"د",
|
1589 |
+
"س",
|
1590 |
+
"ف",
|
1591 |
+
"ه",
|
1592 |
+
"ك",
|
1593 |
+
"ق",
|
1594 |
+
"أ",
|
1595 |
+
"ح",
|
1596 |
+
"ج",
|
1597 |
+
"ش",
|
1598 |
+
"ط",
|
1599 |
+
"ص",
|
1600 |
+
"ى",
|
1601 |
+
"خ",
|
1602 |
+
"إ",
|
1603 |
+
],
|
1604 |
+
"Danish": [
|
1605 |
+
"e",
|
1606 |
+
"r",
|
1607 |
+
"n",
|
1608 |
+
"t",
|
1609 |
+
"a",
|
1610 |
+
"i",
|
1611 |
+
"s",
|
1612 |
+
"d",
|
1613 |
+
"l",
|
1614 |
+
"o",
|
1615 |
+
"g",
|
1616 |
+
"m",
|
1617 |
+
"k",
|
1618 |
+
"f",
|
1619 |
+
"v",
|
1620 |
+
"u",
|
1621 |
+
"b",
|
1622 |
+
"h",
|
1623 |
+
"p",
|
1624 |
+
"å",
|
1625 |
+
"y",
|
1626 |
+
"ø",
|
1627 |
+
"æ",
|
1628 |
+
"c",
|
1629 |
+
"j",
|
1630 |
+
"w",
|
1631 |
+
],
|
1632 |
+
"Serbian": [
|
1633 |
+
"а",
|
1634 |
+
"и",
|
1635 |
+
"о",
|
1636 |
+
"е",
|
1637 |
+
"н",
|
1638 |
+
"р",
|
1639 |
+
"с",
|
1640 |
+
"у",
|
1641 |
+
"т",
|
1642 |
+
"к",
|
1643 |
+
"ј",
|
1644 |
+
"в",
|
1645 |
+
"д",
|
1646 |
+
"м",
|
1647 |
+
"п",
|
1648 |
+
"л",
|
1649 |
+
"г",
|
1650 |
+
"з",
|
1651 |
+
"б",
|
1652 |
+
"a",
|
1653 |
+
"i",
|
1654 |
+
"e",
|
1655 |
+
"o",
|
1656 |
+
"n",
|
1657 |
+
"ц",
|
1658 |
+
"ш",
|
1659 |
+
],
|
1660 |
+
"Lithuanian": [
|
1661 |
+
"i",
|
1662 |
+
"a",
|
1663 |
+
"s",
|
1664 |
+
"o",
|
1665 |
+
"r",
|
1666 |
+
"e",
|
1667 |
+
"t",
|
1668 |
+
"n",
|
1669 |
+
"u",
|
1670 |
+
"k",
|
1671 |
+
"m",
|
1672 |
+
"l",
|
1673 |
+
"p",
|
1674 |
+
"v",
|
1675 |
+
"d",
|
1676 |
+
"j",
|
1677 |
+
"g",
|
1678 |
+
"ė",
|
1679 |
+
"b",
|
1680 |
+
"y",
|
1681 |
+
"ų",
|
1682 |
+
"š",
|
1683 |
+
"ž",
|
1684 |
+
"c",
|
1685 |
+
"ą",
|
1686 |
+
"į",
|
1687 |
+
],
|
1688 |
+
"Slovene": [
|
1689 |
+
"e",
|
1690 |
+
"a",
|
1691 |
+
"i",
|
1692 |
+
"o",
|
1693 |
+
"n",
|
1694 |
+
"r",
|
1695 |
+
"s",
|
1696 |
+
"l",
|
1697 |
+
"t",
|
1698 |
+
"j",
|
1699 |
+
"v",
|
1700 |
+
"k",
|
1701 |
+
"d",
|
1702 |
+
"p",
|
1703 |
+
"m",
|
1704 |
+
"u",
|
1705 |
+
"z",
|
1706 |
+
"b",
|
1707 |
+
"g",
|
1708 |
+
"h",
|
1709 |
+
"č",
|
1710 |
+
"c",
|
1711 |
+
"š",
|
1712 |
+
"ž",
|
1713 |
+
"f",
|
1714 |
+
"y",
|
1715 |
+
],
|
1716 |
+
"Slovak": [
|
1717 |
+
"o",
|
1718 |
+
"a",
|
1719 |
+
"e",
|
1720 |
+
"n",
|
1721 |
+
"i",
|
1722 |
+
"r",
|
1723 |
+
"v",
|
1724 |
+
"t",
|
1725 |
+
"s",
|
1726 |
+
"l",
|
1727 |
+
"k",
|
1728 |
+
"d",
|
1729 |
+
"m",
|
1730 |
+
"p",
|
1731 |
+
"u",
|
1732 |
+
"c",
|
1733 |
+
"h",
|
1734 |
+
"j",
|
1735 |
+
"b",
|
1736 |
+
"z",
|
1737 |
+
"á",
|
1738 |
+
"y",
|
1739 |
+
"ý",
|
1740 |
+
"í",
|
1741 |
+
"č",
|
1742 |
+
"é",
|
1743 |
+
],
|
1744 |
+
"Hebrew": [
|
1745 |
+
"י",
|
1746 |
+
"ו",
|
1747 |
+
"ה",
|
1748 |
+
"ל",
|
1749 |
+
"ר",
|
1750 |
+
"ב",
|
1751 |
+
"ת",
|
1752 |
+
"מ",
|
1753 |
+
"א",
|
1754 |
+
"ש",
|
1755 |
+
"נ",
|
1756 |
+
"ע",
|
1757 |
+
"ם",
|
1758 |
+
"ד",
|
1759 |
+
"ק",
|
1760 |
+
"ח",
|
1761 |
+
"פ",
|
1762 |
+
"ס",
|
1763 |
+
"כ",
|
1764 |
+
"ג",
|
1765 |
+
"ט",
|
1766 |
+
"צ",
|
1767 |
+
"ן",
|
1768 |
+
"ז",
|
1769 |
+
"ך",
|
1770 |
+
],
|
1771 |
+
"Bulgarian": [
|
1772 |
+
"а",
|
1773 |
+
"и",
|
1774 |
+
"о",
|
1775 |
+
"е",
|
1776 |
+
"н",
|
1777 |
+
"т",
|
1778 |
+
"р",
|
1779 |
+
"с",
|
1780 |
+
"в",
|
1781 |
+
"л",
|
1782 |
+
"к",
|
1783 |
+
"д",
|
1784 |
+
"п",
|
1785 |
+
"м",
|
1786 |
+
"з",
|
1787 |
+
"г",
|
1788 |
+
"я",
|
1789 |
+
"ъ",
|
1790 |
+
"у",
|
1791 |
+
"б",
|
1792 |
+
"ч",
|
1793 |
+
"ц",
|
1794 |
+
"й",
|
1795 |
+
"ж",
|
1796 |
+
"щ",
|
1797 |
+
"х",
|
1798 |
+
],
|
1799 |
+
"Croatian": [
|
1800 |
+
"a",
|
1801 |
+
"i",
|
1802 |
+
"o",
|
1803 |
+
"e",
|
1804 |
+
"n",
|
1805 |
+
"r",
|
1806 |
+
"j",
|
1807 |
+
"s",
|
1808 |
+
"t",
|
1809 |
+
"u",
|
1810 |
+
"k",
|
1811 |
+
"l",
|
1812 |
+
"v",
|
1813 |
+
"d",
|
1814 |
+
"m",
|
1815 |
+
"p",
|
1816 |
+
"g",
|
1817 |
+
"z",
|
1818 |
+
"b",
|
1819 |
+
"c",
|
1820 |
+
"č",
|
1821 |
+
"h",
|
1822 |
+
"š",
|
1823 |
+
"ž",
|
1824 |
+
"ć",
|
1825 |
+
"f",
|
1826 |
+
],
|
1827 |
+
"Hindi": [
|
1828 |
+
"क",
|
1829 |
+
"र",
|
1830 |
+
"स",
|
1831 |
+
"न",
|
1832 |
+
"त",
|
1833 |
+
"म",
|
1834 |
+
"ह",
|
1835 |
+
"प",
|
1836 |
+
"य",
|
1837 |
+
"ल",
|
1838 |
+
"व",
|
1839 |
+
"ज",
|
1840 |
+
"द",
|
1841 |
+
"ग",
|
1842 |
+
"ब",
|
1843 |
+
"श",
|
1844 |
+
"ट",
|
1845 |
+
"अ",
|
1846 |
+
"ए",
|
1847 |
+
"थ",
|
1848 |
+
"भ",
|
1849 |
+
"ड",
|
1850 |
+
"च",
|
1851 |
+
"ध",
|
1852 |
+
"ष",
|
1853 |
+
"इ",
|
1854 |
+
],
|
1855 |
+
"Estonian": [
|
1856 |
+
"a",
|
1857 |
+
"i",
|
1858 |
+
"e",
|
1859 |
+
"s",
|
1860 |
+
"t",
|
1861 |
+
"l",
|
1862 |
+
"u",
|
1863 |
+
"n",
|
1864 |
+
"o",
|
1865 |
+
"k",
|
1866 |
+
"r",
|
1867 |
+
"d",
|
1868 |
+
"m",
|
1869 |
+
"v",
|
1870 |
+
"g",
|
1871 |
+
"p",
|
1872 |
+
"j",
|
1873 |
+
"h",
|
1874 |
+
"ä",
|
1875 |
+
"b",
|
1876 |
+
"õ",
|
1877 |
+
"ü",
|
1878 |
+
"f",
|
1879 |
+
"c",
|
1880 |
+
"ö",
|
1881 |
+
"y",
|
1882 |
+
],
|
1883 |
+
"Thai": [
|
1884 |
+
"า",
|
1885 |
+
"น",
|
1886 |
+
"ร",
|
1887 |
+
"อ",
|
1888 |
+
"ก",
|
1889 |
+
"เ",
|
1890 |
+
"ง",
|
1891 |
+
"ม",
|
1892 |
+
"ย",
|
1893 |
+
"ล",
|
1894 |
+
"ว",
|
1895 |
+
"ด",
|
1896 |
+
"ท",
|
1897 |
+
"ส",
|
1898 |
+
"ต",
|
1899 |
+
"ะ",
|
1900 |
+
"ป",
|
1901 |
+
"บ",
|
1902 |
+
"ค",
|
1903 |
+
"ห",
|
1904 |
+
"แ",
|
1905 |
+
"จ",
|
1906 |
+
"พ",
|
1907 |
+
"ช",
|
1908 |
+
"ข",
|
1909 |
+
"ใ",
|
1910 |
+
],
|
1911 |
+
"Greek": [
|
1912 |
+
"α",
|
1913 |
+
"τ",
|
1914 |
+
"ο",
|
1915 |
+
"ι",
|
1916 |
+
"ε",
|
1917 |
+
"ν",
|
1918 |
+
"ρ",
|
1919 |
+
"σ",
|
1920 |
+
"κ",
|
1921 |
+
"η",
|
1922 |
+
"π",
|
1923 |
+
"ς",
|
1924 |
+
"υ",
|
1925 |
+
"μ",
|
1926 |
+
"λ",
|
1927 |
+
"ί",
|
1928 |
+
"ό",
|
1929 |
+
"ά",
|
1930 |
+
"γ",
|
1931 |
+
"έ",
|
1932 |
+
"δ",
|
1933 |
+
"ή",
|
1934 |
+
"ω",
|
1935 |
+
"χ",
|
1936 |
+
"θ",
|
1937 |
+
"ύ",
|
1938 |
+
],
|
1939 |
+
"Tamil": [
|
1940 |
+
"க",
|
1941 |
+
"த",
|
1942 |
+
"ப",
|
1943 |
+
"ட",
|
1944 |
+
"ர",
|
1945 |
+
"ம",
|
1946 |
+
"ல",
|
1947 |
+
"ன",
|
1948 |
+
"வ",
|
1949 |
+
"ற",
|
1950 |
+
"ய",
|
1951 |
+
"ள",
|
1952 |
+
"ச",
|
1953 |
+
"ந",
|
1954 |
+
"இ",
|
1955 |
+
"ண",
|
1956 |
+
"அ",
|
1957 |
+
"ஆ",
|
1958 |
+
"ழ",
|
1959 |
+
"ங",
|
1960 |
+
"எ",
|
1961 |
+
"உ",
|
1962 |
+
"ஒ",
|
1963 |
+
"ஸ",
|
1964 |
+
],
|
1965 |
+
"Kazakh": [
|
1966 |
+
"а",
|
1967 |
+
"ы",
|
1968 |
+
"е",
|
1969 |
+
"н",
|
1970 |
+
"т",
|
1971 |
+
"р",
|
1972 |
+
"л",
|
1973 |
+
"і",
|
1974 |
+
"д",
|
1975 |
+
"с",
|
1976 |
+
"м",
|
1977 |
+
"қ",
|
1978 |
+
"к",
|
1979 |
+
"о",
|
1980 |
+
"б",
|
1981 |
+
"и",
|
1982 |
+
"у",
|
1983 |
+
"ғ",
|
1984 |
+
"ж",
|
1985 |
+
"ң",
|
1986 |
+
"з",
|
1987 |
+
"ш",
|
1988 |
+
"й",
|
1989 |
+
"п",
|
1990 |
+
"г",
|
1991 |
+
"ө",
|
1992 |
+
],
|
1993 |
+
}
|
1994 |
+
|
1995 |
+
# Number of entries (one per language key) in the FREQUENCIES table defined above.
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
lib/python3.11/site-packages/charset_normalizer/legacy.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Dict, Optional, Union
|
2 |
+
from warnings import warn
|
3 |
+
|
4 |
+
from .api import from_bytes
|
5 |
+
from .constant import CHARDET_CORRESPONDENCE
|
6 |
+
|
7 |
+
|
8 |
+
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy entry point.

    Guess the encoding of a byte string and report the outcome as a dict with the
    keys ``encoding``, ``language`` and ``confidence``. Meant to be mostly
    backward-compatible with chardet: encoding names follow chardet's own spelling
    whenever it knows the encoding. The function is deprecated and exists to ease
    project migration (see the documentation); it is not planned for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents? Only when this is
                                 exactly ``False`` is the detected name translated
                                 through ``CHARDET_CORRESPONDENCE``.
    :param kwargs: Ignored; a warning naming the ignored arguments is emitted.
    :return: ``encoding`` and ``confidence`` are ``None`` and ``language`` is ``""``
             when no match is found; otherwise ``confidence`` is ``1.0 - chaos`` of
             the best match.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if isinstance(byte_str, bytearray):
        # Work on an immutable copy of the payload.
        byte_str = bytes(byte_str)
    elif not isinstance(byte_str, bytes):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    best_guess = from_bytes(byte_str).best()

    if best_guess is None:
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_guess.encoding
        language = "" if best_guess.language == "Unknown" else best_guess.language
        confidence = 1.0 - best_guess.chaos

        # Note: CharsetNormalizer never reports 'UTF-8-SIG' by itself because the
        # signature is stripped during detection/normalization, whereas chardet does
        # return 'utf-8-sig' (a valid codec name). Restore the suffix when a BOM was seen.
        if encoding == "utf_8" and best_guess.bom:
            encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
|
lib/python3.11/site-packages/charset_normalizer/md.cpython-311-darwin.so
ADDED
Binary file (50.1 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/md.py
ADDED
@@ -0,0 +1,615 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import lru_cache
|
2 |
+
from logging import getLogger
|
3 |
+
from typing import List, Optional
|
4 |
+
|
5 |
+
from .constant import (
|
6 |
+
COMMON_SAFE_ASCII_CHARACTERS,
|
7 |
+
TRACE,
|
8 |
+
UNICODE_SECONDARY_RANGE_KEYWORD,
|
9 |
+
)
|
10 |
+
from .utils import (
|
11 |
+
is_accentuated,
|
12 |
+
is_arabic,
|
13 |
+
is_arabic_isolated_form,
|
14 |
+
is_case_variable,
|
15 |
+
is_cjk,
|
16 |
+
is_emoticon,
|
17 |
+
is_hangul,
|
18 |
+
is_hiragana,
|
19 |
+
is_katakana,
|
20 |
+
is_latin,
|
21 |
+
is_punctuation,
|
22 |
+
is_separator,
|
23 |
+
is_symbol,
|
24 |
+
is_thai,
|
25 |
+
is_unprintable,
|
26 |
+
remove_accent,
|
27 |
+
unicode_range,
|
28 |
+
)
|
29 |
+
|
30 |
+
|
31 |
+
class MessDetectorPlugin:
    """
    Abstract base for every mess detection plugin.
    Concrete detectors MUST subclass it and override each member below.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be passed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Main routine, executed for a single character.
        Implement here the logic by which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio computed from what feed() has seen so far.
        Must NOT be lower than 0.; there is no upper restriction.
        """
        raise NotImplementedError  # pragma: nocover
|
63 |
+
|
64 |
+
|
65 |
+
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Measure how much of the printable text is punctuation or symbols.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous character passed to feed(); used to skip immediate repetitions.
        self._last_printable_char: Optional[str] = None
        # Never read inside this class; kept so the instance attributes are unchanged.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are fed to this plugin.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, and score it when it is punctuation or a symbol.

        A character identical to the one fed just before, or one listed in
        COMMON_SAFE_ASCII_CHARACTERS, is counted but never scored.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Non-digit, non-emoticon symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state set up by __init__.
        """
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Also forget the last character; otherwise the first character fed after a
        # reset could be wrongly treated as a repetition and go unscored.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        (punctuation + symbol score) / characters fed, or 0.0 when that is below 0.3
        or when nothing was fed.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
110 |
+
|
111 |
+
|
112 |
+
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Measure the share of accentuated letters among alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters are fed to this plugin.
        """
        return character.isalpha()

    def feed(self, character: str) -> None:
        """
        Count the character, and count it as accentuated when is_accentuated() says so.
        """
        self._character_count += 1

        if not is_accentuated(character):
            return

        self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Zero both counters.
        """
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        """
        Accentuated share of the characters fed; 0.0 when fewer than 8 characters
        were fed or when the share is below 0.35.
        """
        if self._character_count < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / self._character_count

        if accentuated_share < 0.35:
            return 0.0
        return accentuated_share
|
137 |
+
|
138 |
+
|
139 |
+
class UnprintablePlugin(MessDetectorPlugin):
    """
    Measure how many of the characters seen are unprintable.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is fed to this plugin.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the character, and count it as unprintable when is_unprintable() says so.
        """
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state set up by __init__.
        """
        self._unprintable_count = 0
        # The total must be cleared too; otherwise the ratio computed after a reset
        # would be diluted by characters seen before it.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Unprintable count weighted by 8, divided by the characters fed
        (0.0 when nothing was fed).
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
|
161 |
+
|
162 |
+
|
163 |
+
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Score consecutive accentuated Latin letters.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters that is_latin() accepts are fed to this plugin.
        """
        if not character.isalpha():
            return False
        return is_latin(character)

    def feed(self, character: str) -> None:
        """
        Count the character and compare it with the previously fed one.

        When both are accentuated, one point is added if both are upper case and
        another one if both reduce to the same letter once accents are removed.
        """
        self._character_count += 1

        previous = self._last_latin_character
        both_accentuated = (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        )

        if both_accentuated:
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse when it is the same letter repeated with a different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state set up by __init__.
        """
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        """
        Twice the accumulated score divided by the characters fed
        (0.0 when nothing was fed).
        """
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count
|
198 |
+
|
199 |
+
|
200 |
+
class SuspiciousRange(MessDetectorPlugin):
    """
    Score adjacent characters whose unicode ranges are judged suspicious together.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are fed to this plugin.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character and compare its unicode range with the previous one.

        Whitespace, punctuation and characters from COMMON_SAFE_ASCII_CHARACTERS
        break the sequence: the remembered character is dropped and nothing is compared.
        """
        self._character_count += 1

        breaks_sequence = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_sequence:
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen

        # Nothing to compare with when this character starts a new sequence.
        if previous is not None:
            range_of_previous: Optional[str] = unicode_range(previous)
            range_of_current: Optional[str] = unicode_range(character)

            if is_suspiciously_successive_range(range_of_previous, range_of_current):
                self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state set up by __init__.
        """
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        """
        Twice the suspicious pair count divided by the characters fed;
        0.0 while 24 characters or fewer have been fed.
        """
        if self._character_count <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / self._character_count
|
247 |
+
|
248 |
+
|
249 |
+
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate characters into words and measure how many characters belong to
    words judged bad (too accentuated, oddly terminated, very long, or containing symbols).
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Word currently being accumulated, and how many of its letters are accentuated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is fed to this plugin.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Extend the current word, or close and evaluate it on a word boundary.

        Alphabetic characters are appended to the buffer. Whitespace, punctuation or
        a separator closes a non-empty buffer and evaluates it. Any other symbol
        (digits and a few markup-like characters excepted) marks the current word as
        bad and is appended to the buffer.
        """
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Start watching the word when it holds a letter that is non-Latin or
            # accentuated and is not CJK, Hangul, Katakana, Hiragana or Thai.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        # From here on the buffer is known to be non-empty.
        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and not all(_.isupper() for _ in self._buffer)
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                upper_case_count: int = sum(1 for c in self._buffer if c.isupper())

                # A long word with some, but at most 30%, upper case characters is
                # assumed to be camel-cased and is not penalised.
                probable_camel_cased: bool = (
                    upper_case_count > 0 and upper_case_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state set up by __init__.
        """
        self._buffer = ""
        # Cleared together with the buffer; a stale count would inflate the accent
        # share of the first word evaluated after a reset.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Characters in bad words divided by characters in all closed words; 0.0 while
        at most 10 words were closed and none of them was counted as foreign/long.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
|
352 |
+
|
353 |
+
|
354 |
+
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly when the
    content does not fit, which is easy to spot: look for an overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is fed to this plugin.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the two wrong-stop characters separately from other CJK characters.
        """
        if character in ("丅", "丄"):
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Zero both counters.
        """
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        """
        Wrong stops divided by CJK characters; 0.0 while fewer than 16 CJK
        characters were seen.
        """
        enough_cjk = self._cjk_character_count >= 16
        if not enough_cjk:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count
|
383 |
+
|
384 |
+
|
385 |
+
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detector counting successive upper/lower case flips inside chunks of case-variable
    letters; the count of a chunk is retained only when the chunk contained a non-ASCII
    character, was at most 64 characters long and was not terminated by a digit.
    """

    def __init__(self) -> None:
        # True when the previous pair of letters flipped case and awaits a second flip.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Flips counted in the current chunk / flips retained from finished chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_cased_letter = character.isalpha() and is_case_variable(character)

        # A non-letter closes the running chunk, provided the chunk is not empty.
        if not is_cased_letter and self._character_count_since_last_sep > 0:
            chunk_is_short = self._character_count_since_last_sep <= 64
            if (
                chunk_is_short
                and not character.isdigit()
                and not self._current_ascii_only
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a fresh chunk.
            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._last_alpha_seen = None
            self._current_ascii_only = True
            self._buf = False
            return

        if self._current_ascii_only and not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if case_flipped and self._buf:
                # Second flip in a row: account for both of them.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                # Remember a first flip, or forget it when the case did not change.
                self._buf = case_flipped

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._buf = False
        self._last_alpha_seen = None
        self._current_ascii_only = True
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0

    @property
    def ratio(self) -> float:
        if self._character_count != 0:
            return self._successive_upper_lower_count_final / self._character_count

        return 0.0
|
459 |
+
|
460 |
+
|
461 |
+
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Detector measuring how many of the Arabic characters fed to it are isolated forms.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._isolated_form_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        # Only Arabic characters are of interest.
        return is_arabic(character)

    def feed(self, character: str) -> None:
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

        self._character_count += 1

    @property
    def ratio(self) -> float:
        # No verdict before eight characters have been seen.
        if self._character_count >= 8:
            return self._isolated_form_count / self._character_count

        return 0.0
|
487 |
+
|
488 |
+
|
489 |
+
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether two Unicode range names found next to each other look suspicious.

    An unknown range (None) is always suspicious; the remaining rules whitelist
    combinations that legitimately appear side by side.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    both = (unicode_range_a, unicode_range_b)

    def mentioned(keyword: str) -> bool:
        # True when at least one of the two range names contains the keyword.
        return any(keyword in range_name for range_name in both)

    if all("Latin" in range_name for range_name in both):
        return False

    if mentioned("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if mentioned("Latin") and mentioned("Combining"):
        return False

    # Any shared primary keyword between the two names clears the pair.
    words_of_b = unicode_range_b.split(" ")
    for word in unicode_range_a.split(" "):
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD and word in words_of_b:
            return False

    kana_ranges = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana_ranges
    b_is_kana = unicode_range_b in kana_ranges
    has_cjk = mentioned("CJK")
    has_basic_latin = "Basic Latin" in both

    # Japanese Exception
    if (a_is_kana or b_is_kana) and has_cjk:
        return False
    if a_is_kana and b_is_kana:
        return False

    if mentioned("Hangul") and (has_cjk or has_basic_latin):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if has_cjk or (a_is_kana and b_is_kana):
        if mentioned("Punctuation") or mentioned("Forms") or has_basic_latin:
            return False

    return True
|
560 |
+
|
561 |
+
|
562 |
+
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of a decoded sequence by summing the ratio of every detector.

    The sum is refreshed at regular checkpoints and the scan stops as soon as it reaches
    maximum_threshold. The result is rounded to three decimals.
    """

    detectors: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # One extra position for the trailing "\n" appended below.
    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    # Distance between two checkpoints grows with the sequence length.
    if length < 512:
        checkpoint_step: int = 32
    elif length <= 1024:
        checkpoint_step = 64
    else:
        checkpoint_step = 128

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_step == 0
        if not (at_checkpoint or index == last_index):
            continue

        mean_mess_ratio = sum(detector.ratio for detector in detectors)

        if mean_mess_ratio >= maximum_threshold:
            break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16:]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
|
lib/python3.11/site-packages/charset_normalizer/md__mypyc.cpython-311-darwin.so
ADDED
Binary file (233 kB). View file
|
|
lib/python3.11/site-packages/charset_normalizer/models.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from encodings.aliases import aliases
|
2 |
+
from hashlib import sha256
|
3 |
+
from json import dumps
|
4 |
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
5 |
+
|
6 |
+
from .constant import TOO_BIG_SEQUENCE
|
7 |
+
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
8 |
+
|
9 |
+
|
10 |
+
class CharsetMatch:
    """
    One candidate encoding for a payload, together with its mess (chaos) ratio, its
    language coherence results and the other encodings that decode to the same text.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Computed lazily by the alphabets property.
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache of the last output() call.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        # Decoded text; filled on first str() call when not provided.
        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share the encoding and the fingerprint.

        Raises TypeError when compared with anything that is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Raises ValueError when compared with anything that is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio between decoded character count and raw byte count.

        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a leaf of this one.

        Raises ValueError when other is not a CharsetMatch or equals this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Ratio attached to the first language entry, or 0.0 when there is none.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated Unicode range names of the decoded characters (cached).
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the encoder
        (error handler "replace"); they are not dropped.
        The result is cached until another target encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
|
223 |
+
|
224 |
+
|
225 |
+
class CharsetMatches:
    """
    Sorted container of CharsetMatch items, most probable first.
    Behaves like a list for iteration, indexing, len() and truthiness, but does not
    implement the whole list interface.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        self._results: List[CharsetMatch] = sorted(results or [])

    def __iter__(self) -> Iterator[CharsetMatch]:
        for match in self._results:
            yield match

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        """
        if isinstance(item, int):
            return self._results[item]

        if isinstance(item, str):
            wanted = iana_name(item, False)
            found = next(
                (
                    candidate
                    for candidate in self._results
                    if wanted in candidate.could_be_from_charset
                ),
                None,
            )
            if found is not None:
                return found

        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )

        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        factoring_allowed = len(item.raw) <= TOO_BIG_SEQUENCE
        if factoring_allowed:
            for existing in self._results:
                same_text = existing.fingerprint == item.fingerprint
                if same_text and existing.chaos == item.chaos:
                    existing.add_submatch(item)
                    return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match. Strict equivalent to matches[0].
        """
        return self._results[0] if self._results else None

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
|
290 |
+
|
291 |
+
|
292 |
+
# A (language name, coherence ratio) pair: CharsetMatch reads index 0 as the language
# and index 1 as the coherence value.
CoherenceMatch = Tuple[str, float]
|
293 |
+
# List of (language, ratio) pairs; CharsetMatch treats the first entry as the most probable one.
CoherenceMatches = List[CoherenceMatch]
|
294 |
+
|
295 |
+
|
296 |
+
class CliDetectionResult:
    """
    Plain record describing the detection outcome for one path, serialisable to JSON.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        # Location of the analysed input.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path

        # Encoding verdict.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.has_sig_or_bom: bool = has_sig_or_bom

        # Content description and scores.
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        # The key order below is the order in which fields appear in the JSON output.
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """
        Serialise the record as ASCII-only JSON indented by four spaces.
        """
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
|
lib/python3.11/site-packages/charset_normalizer/py.typed
ADDED
File without changes
|
lib/python3.11/site-packages/charset_normalizer/utils.py
ADDED
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import importlib
|
2 |
+
import logging
|
3 |
+
import unicodedata
|
4 |
+
from codecs import IncrementalDecoder
|
5 |
+
from encodings.aliases import aliases
|
6 |
+
from functools import lru_cache
|
7 |
+
from re import findall
|
8 |
+
from typing import Generator, List, Optional, Set, Tuple, Union
|
9 |
+
|
10 |
+
from _multibytecodec import MultibyteIncrementalDecoder
|
11 |
+
|
12 |
+
from .constant import (
|
13 |
+
ENCODING_MARKS,
|
14 |
+
IANA_SUPPORTED_SIMILAR,
|
15 |
+
RE_POSSIBLE_ENCODING_INDICATION,
|
16 |
+
UNICODE_RANGES_COMBINED,
|
17 |
+
UNICODE_SECONDARY_RANGE_KEYWORD,
|
18 |
+
UTF8_MAXIMAL_ALLOCATION,
|
19 |
+
)
|
20 |
+
|
21 |
+
|
22 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """
    Tell whether the Unicode name of the character mentions one of the known accents.

    Characters without a Unicode name are reported as not accentuated.
    """
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False

    accent_markers = (
        "WITH GRAVE",
        "WITH ACUTE",
        "WITH CEDILLA",
        "WITH DIAERESIS",
        "WITH CIRCUMFLEX",
        "WITH TILDE",
        "WITH MACRON",
        "WITH RING ABOVE",
    )
    return any(marker in description for marker in accent_markers)
|
38 |
+
|
39 |
+
|
40 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the base character of a canonically decomposable character.

    The character is returned unchanged when it has no decomposition, or when its
    decomposition starts with a formatting tag such as "<compat>" (the first token is
    then not a hexadecimal code point and used to raise ValueError).
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    # Compatibility decompositions are prefixed with a "<tag>"; they do not describe
    # "base letter + accent", so leave such characters as they are.
    if codes[0].startswith("<"):
        return character

    return chr(int(codes[0], 16))
|
49 |
+
|
50 |
+
|
51 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Look up the name of the first known Unicode range containing the character.

    Returns None when no range of UNICODE_RANGES_COMBINED contains its code point.
    """
    code_point: int = ord(character)

    return next(
        (
            range_name
            for range_name, ord_range in UNICODE_RANGES_COMBINED.items()
            if code_point in ord_range
        ),
        None,
    )
|
63 |
+
|
64 |
+
|
65 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "LATIN" (False when unnamed).
    """
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "LATIN" in description
|
72 |
+
|
73 |
+
|
74 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Tell whether the character is punctuation, judged by its Unicode category ("P")
    or, failing that, by a Unicode range whose name contains "Punctuation".
    """
    if "P" in unicodedata.category(character):
        return True

    character_range: Optional[str] = unicode_range(character)

    return character_range is not None and "Punctuation" in character_range
|
87 |
+
|
88 |
+
|
89 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Tell whether the character counts as a symbol: Unicode category containing "S" or
    "N", or membership of a "Forms" range while not being of category "Lo".
    """
    category: str = unicodedata.category(character)

    if "S" in category or "N" in category:
        return True

    character_range: Optional[str] = unicode_range(character)

    return (
        character_range is not None
        and "Forms" in character_range
        and category != "Lo"
    )
|
102 |
+
|
103 |
+
|
104 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """
    Tell whether the character lies in a Unicode range named after "Emoticons" or
    "Pictographs" (False when its range is unknown).
    """
    character_range: Optional[str] = unicode_range(character)

    if character_range is not None:
        return "Emoticons" in character_range or "Pictographs" in character_range

    return False
|
112 |
+
|
113 |
+
|
114 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Tell whether the character separates words: whitespace, one of | + < >, any
    Unicode "Z" category, or one of the categories Po, Pd and Pc.
    """
    explicit_separators = ("|", "+", "<", ">")
    if character.isspace() or character in explicit_separators:
        return True

    category: str = unicodedata.category(character)
    if "Z" in category:
        return True

    return category in ("Po", "Pd", "Pc")
|
122 |
+
|
123 |
+
|
124 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """
    Tell whether exactly one of islower() / isupper() holds for the character.
    """
    lower: bool = character.islower()
    upper: bool = character.isupper()
    return lower ^ upper
|
127 |
+
|
128 |
+
|
129 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "CJK" (False when unnamed).
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "CJK" in unicode_name
|
137 |
+
|
138 |
+
|
139 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HIRAGANA" (False when unnamed).
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "HIRAGANA" in unicode_name
|
147 |
+
|
148 |
+
|
149 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "KATAKANA" (False when unnamed).
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "KATAKANA" in unicode_name
|
157 |
+
|
158 |
+
|
159 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HANGUL" (False when unnamed).
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "HANGUL" in unicode_name
|
167 |
+
|
168 |
+
|
169 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "THAI" (False when unnamed).
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "THAI" in unicode_name
|
177 |
+
|
178 |
+
|
179 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "ARABIC" (False when unnamed).
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "ARABIC" in unicode_name
|
187 |
+
|
188 |
+
|
189 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains both "ARABIC" and
    "ISOLATED FORM" (False when unnamed).
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False

    if "ARABIC" not in unicode_name:
        return False
    return "ISOLATED FORM" in unicode_name
|
197 |
+
|
198 |
+
|
199 |
+
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """
    Tell whether the range name contains one of the UNICODE_SECONDARY_RANGE_KEYWORD entries.
    """
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True
    return False
|
202 |
+
|
203 |
+
|
204 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether the character is neither whitespace nor printable, with two exceptions
    that are never reported: the ASCII substitute character and U+FEFF.
    """
    # Whitespace includes \n \t \r \v
    if character.isspace() or character.isprintable():
        return False

    tolerated = (
        "\x1A",  # Why? Its the ASCII substitute character.
        "\ufeff",  # bug discovered in Python,
        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    )
    return character not in tolerated
|
213 |
+
|
214 |
+
|
215 |
+
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Look for an encoding declaration in the first *search_zone* bytes.

    The head of *sequence* is decoded as ASCII (undecodable bytes are dropped)
    and scanned with RE_POSSIBLE_ENCODING_INDICATION. The first match that is
    a known alias or codec name in the ``aliases`` table is returned as its
    codec name; None is returned when nothing matches.

    Raises TypeError when *sequence* is not a bytes instance.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    head = sequence[: min(len(sequence), search_zone)]
    declared_names = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        head.decode("ascii", errors="ignore"),
    )

    for declared_name in declared_names:
        # Normalise to the spelling used by the aliases table.
        candidate = declared_name.lower().replace("-", "_")

        for encoding_alias, encoding_iana in aliases.items():
            if candidate in (encoding_alias, encoding_iana):
                return encoding_iana

    return None
|
245 |
+
|
246 |
+
|
247 |
+
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether the encoding identified by its IANA name is a multi-byte one.

    The UTF family is answered from a fixed list; any other name is resolved
    by importing ``encodings.<name>`` and checking whether its
    IncrementalDecoder derives from MultibyteIncrementalDecoder.
    """
    if name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }:
        return True

    codec_module = importlib.import_module("encodings.{}".format(name))
    return issubclass(codec_module.IncrementalDecoder, MultibyteIncrementalDecoder)
|
266 |
+
|
267 |
+
|
268 |
+
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Find which SIG/BOM listed in ENCODING_MARKS, if any, *sequence* starts with.

    Returns the matching (encoding name, mark bytes) pair, or (None, b"")
    when no known mark prefixes the sequence.
    """
    for iana_encoding, marks in ENCODING_MARKS.items():
        # An entry is either a single mark or a list of alternative marks.
        candidates = [marks] if isinstance(marks, bytes) else marks

        for mark in candidates:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
|
284 |
+
|
285 |
+
|
286 |
+
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Return False for "utf_16" and "utf_32", True for every other encoding name.
    """
    keeps_mark = iana_encoding in {"utf_16", "utf_32"}
    return not keeps_mark
|
288 |
+
|
289 |
+
|
290 |
+
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Resolve a code page name to the codec name stored in the ``aliases`` table.

    The name is lower-cased and its dashes become underscores before lookup.
    When no entry matches, ValueError is raised if *strict* is true;
    otherwise the normalised name is returned as is.
    """
    normalized = cp_name.lower().replace("-", "_")

    for encoding_alias, encoding_iana in aliases.items():
        if normalized == encoding_alias or normalized == encoding_iana:
            return encoding_iana

    if not strict:
        return normalized

    raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))
|
304 |
+
|
305 |
+
|
306 |
+
def range_scan(decoded_sequence: str) -> List[str]:
    """
    Collect the distinct Unicode range names of the characters in
    *decoded_sequence*, ignoring characters for which unicode_range
    returns None. The order of the returned list is not specified.
    """
    found_ranges = {
        range_name
        for range_name in map(unicode_range, decoded_sequence)
        if range_name is not None
    }

    return list(found_ranges)
|
318 |
+
|
319 |
+
|
320 |
+
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Measure how alike two single-byte code pages are.

    Every possible byte value (0x00-0xFF) is decoded with both encodings and
    the fraction of values that decode to the same text is returned, as a
    float between 0.0 and 1.0. If either encoding is a multi-byte one,
    0.0 is returned without comparing anything.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    # errors="ignore" makes undefined byte values decode to "" instead of raising.
    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    # A byte has 256 possible values. Iterating over range(255) and dividing
    # by 254 would skip 0xFF and score two identical code pages at 255/254,
    # so both the loop and the divisor use the full count.
    byte_value_count: int = 256
    character_match_count: int = 0

    for i in range(byte_value_count):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / byte_value_count
|
342 |
+
|
343 |
+
|
344 |
+
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar, by looking
    *iana_name_b* up in the IANA_SUPPORTED_SIMILAR entry of *iana_name_a*.
    The IANA_SUPPORTED_SIMILAR dict was generated using the function
    cp_similarity.
    """
    # A missing entry for iana_name_a means "similar to nothing".
    return iana_name_b in IANA_SUPPORTED_SIMILAR.get(iana_name_a, ())
|
353 |
+
|
354 |
+
|
355 |
+
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Set *level* on the logger called *name* and attach a new StreamHandler
    that formats records with *format_string*.

    Each call adds one more handler to the logger.
    """
    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)

    stream_handler = logging.StreamHandler()
    record_formatter = logging.Formatter(format_string)
    stream_handler.setFormatter(record_formatter)

    target_logger.addHandler(stream_handler)
|
366 |
+
|
367 |
+
|
368 |
+
def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Yield decoded text chunks of *chunk_size* taken at each start in *offsets*.

    When *decoded_payload* is available and the decoder is not multi-byte,
    chunks are sliced straight from the decoded text and iteration stops at
    the first empty slice. Otherwise each chunk is sliced from the raw
    *sequences* and decoded with *encoding_iana* ("strict" for single-byte
    decoders, "ignore" for multi-byte ones), with *sig_payload* prepended
    when a SIG/BOM is present and must not be stripped.

    For multi-byte decoders a chunk whose head cannot be found in
    *decoded_payload* is assumed to start in the middle of a character; the
    start is then moved back by up to three bytes until the head matches.

    Decoding errors raised in "strict" mode propagate to the caller.
    """
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            # Skip windows reaching more than 8 bytes past the end of the data.
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    # Never step back past the first byte: a negative start
                    # would slice from the END of the buffer and could yield
                    # an unrelated or empty chunk.
                    for j in range(i, max(i - 4, -1), -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
|
lib/python3.11/site-packages/charset_normalizer/version.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
Expose the package version, both as a string and as its components.
"""

__version__ = "3.3.2"
# Dot-separated components of the version, kept as strings.
VERSION = __version__.split(".")
|
lib/python3.11/site-packages/distutils-precedence.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2638ce9e2500e572a5e0de7faed6661eb569d1b696fcba07b0dd223da5f5d224
|
3 |
+
size 151
|
lib/python3.11/site-packages/filelock/__init__.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
A platform independent file lock that supports the with-statement.
|
3 |
+
|
4 |
+
.. autodata:: filelock.__version__
|
5 |
+
:no-value:
|
6 |
+
|
7 |
+
"""
|
8 |
+
from __future__ import annotations
|
9 |
+
|
10 |
+
import sys
|
11 |
+
import warnings
|
12 |
+
from typing import TYPE_CHECKING
|
13 |
+
|
14 |
+
from ._api import AcquireReturnProxy, BaseFileLock
|
15 |
+
from ._error import Timeout
|
16 |
+
from ._soft import SoftFileLock
|
17 |
+
from ._unix import UnixFileLock, has_fcntl
|
18 |
+
from ._windows import WindowsFileLock
|
19 |
+
from .version import version
|
20 |
+
|
21 |
+
#: version of the project as a string
__version__: str = version


# Select the lock implementation for the running platform: the Windows lock
# on win32, the fcntl-based lock where fcntl is available, and the soft lock
# (with a warning) as the last resort.
if sys.platform == "win32":  # pragma: win32 cover
    _FileLock: type[BaseFileLock] = WindowsFileLock
elif has_fcntl:  # pragma: win32 no cover
    _FileLock: type[BaseFileLock] = UnixFileLock
else:  # pragma: win32 no cover
    _FileLock = SoftFileLock
    if warnings is not None:
        warnings.warn("only soft file lock is available", stacklevel=2)

if TYPE_CHECKING:
    # Type checkers see one fixed class rather than the runtime-selected one.
    FileLock = SoftFileLock
else:
    #: Alias for the lock, which should be used for the current platform.
    FileLock = _FileLock


__all__ = [
    "__version__",
    "FileLock",
    "SoftFileLock",
    "Timeout",
    "UnixFileLock",
    "WindowsFileLock",
    "BaseFileLock",
    "AcquireReturnProxy",
]
|
lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.45 kB). View file
|
|
lib/python3.11/site-packages/filelock/__pycache__/_api.cpython-311.pyc
ADDED
Binary file (14.6 kB). View file
|
|
lib/python3.11/site-packages/filelock/__pycache__/_error.cpython-311.pyc
ADDED
Binary file (1.98 kB). View file
|
|
lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc
ADDED
Binary file (2.73 kB). View file
|
|