Spaces:

cpt-subtext
/

learngradio

Sleeping

Jannis Wienert

try out with a model

d5dce88 about 1 year ago

1.17 kB

	import codecs
	import locale
	import re
	import sys
	from typing import List, Tuple

	# Byte-order marks paired with the codec used to decode the bytes that
	# follow the mark.
	#
	# Ordering matters for any consumer that scans this list and stops at the
	# first BOM that prefixes the data: the UTF-32-LE BOM (FF FE 00 00) begins
	# with the UTF-16-LE BOM (FF FE), so the 4-byte UTF-32 marks must be listed
	# before the 2-byte UTF-16 marks or UTF-32-LE input is mistaken for UTF-16.
	BOMS: List[Tuple[bytes, str]] = [
	    (codecs.BOM_UTF32, "utf-32"),
	    (codecs.BOM_UTF32_BE, "utf-32-be"),
	    (codecs.BOM_UTF32_LE, "utf-32-le"),
	    (codecs.BOM_UTF8, "utf-8"),
	    (codecs.BOM_UTF16, "utf-16"),
	    (codecs.BOM_UTF16_BE, "utf-16-be"),
	    (codecs.BOM_UTF16_LE, "utf-16-le"),
	]

	# Matches a "coding: <name>" or "coding=<name>" declaration (PEP 263 style)
	# and captures the encoding name as group 1.
	ENCODING_RE = re.compile(br"coding[:=]\s*([-\w.]+)")


	def auto_decode(data: bytes) -> str:
	    """Decode *data* to text, detecting its encoding.

	    The encoding is chosen by the first rule that applies:

	    1. A leading byte-order mark listed in ``BOMS``: the BOM is stripped
	       and the remainder is decoded with the matching codec.
	    2. A ``coding`` declaration (as in PEP 263) inside a comment on one of
	       the first two lines.
	    3. ``locale.getpreferredencoding(False)``, like ``open()`` on Python 3,
	       falling back to ``sys.getdefaultencoding()``.

	    Raises:
	        UnicodeDecodeError: if the bytes are not valid in the chosen encoding.
	        LookupError: if a declared encoding name is unknown to Python.
	    """
	    # Try the longest BOMs first: the UTF-32-LE BOM (FF FE 00 00) starts with
	    # the UTF-16-LE BOM (FF FE), so matching in plain list order could
	    # misread UTF-32-LE input as UTF-16.  sorted() is stable, so entries of
	    # equal length keep their relative order from BOMS.
	    candidates = sorted(BOMS, key=lambda item: len(item[0]), reverse=True)
	    for bom, encoding in candidates:
	        if data.startswith(bom):
	            return data[len(bom):].decode(encoding)
	    # Let's check the first two lines as in PEP 263.  Only two lines are
	    # inspected, so there is no need to split the whole payload.
	    for line in data.split(b"\n", 2)[:2]:
	        if not line.startswith(b"#"):
	            continue
	        match = ENCODING_RE.search(line)
	        if match is not None:
	            # The pattern only captures ASCII word characters, "-" and ".".
	            return data.decode(match.group(1).decode("ascii"))
	    return data.decode(
	        locale.getpreferredencoding(False) or sys.getdefaultencoding(),
	    )