Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /test /unit /test_seekable_unicode_stream_reader.py

sunnychenxiwang

update nltk

d916065 over 1 year ago

raw

history blame

2.27 kB

	import os
	from io import BytesIO

	import pytest

	from nltk.corpus.reader import SeekableUnicodeStreamReader


	def check_reader(unicode_string, encoding):
	    """Round-trip ``unicode_string`` through a ``SeekableUnicodeStreamReader``.

	    The text is encoded with ``encoding``, wrapped in a ``BytesIO`` and read
	    back twice: once in bulk via ``readlines()`` and once one character at
	    a time via ``read(1)``. The reported stream position is asserted at the
	    start and after the bulk read.
	    """
	    raw = BytesIO(unicode_string.encode(encoding))
	    reader = SeekableUnicodeStreamReader(raw, encoding)

	    # A freshly created reader must report position zero.
	    assert reader.tell() == 0

	    # Bulk read: the joined lines must reproduce the input exactly.
	    lines = reader.readlines()
	    assert unicode_string == "".join(lines)

	    # With everything consumed, the reader's position must equal the end
	    # offset of the underlying byte stream.
	    raw.seek(0, os.SEEK_END)
	    assert reader.tell() == raw.tell()

	    # Rewind before the second pass.
	    reader.seek(0)

	    # Character-wise read: keep pulling single characters until the reader
	    # returns the empty string, then compare against the input.
	    pieces = list(iter(lambda: reader.read(1), ""))
	    assert unicode_string == "".join(pieces)


	# Call `check_reader` with a variety of input strings and encodings.
	# Codec names tried against every entry of STRINGS. `test_reader` skips a
	# codec for a given string when encoding raises UnicodeEncodeError.
	ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"]

	# Sample texts fed through the reader by `test_reader` (one parametrized
	# case per entry). The entries mix pure-ASCII text with escapes outside the
	# ASCII range, so not every string is encodable with every codec above.
	STRINGS = [
	# ASCII-only text; the literal starts and ends with a newline.
	"""
	This is a test file.
	It is fairly short.
	""",
	# Single line containing the non-ASCII escape \x83.
	"This file can be encoded with latin1. \x83",
	# The backslash right after the opening quotes suppresses the leading
	# newline. Includes an empty line and several non-ASCII escapes.
	"""\
	This is a test file.
	Here's a blank line:

	And here's some unicode: \xee \u0123 \uffe3
	""",
	# Non-ASCII escapes, including two adjacent ones with no space between.
	"""\
	This is a test file.
	Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
	""",
	# Longer input: trailing backslashes join the first three source lines
	# into one long line, and the whole literal is repeated 20 times.
	"""\
	This is a larger file. It has some lines that are longer \
	than 72 characters. It's got lots of repetition. Here's \
	some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345

	How fun! Let's repeat it twenty times.
	"""
	* 20,
	]


	@pytest.mark.parametrize("string", STRINGS)
	def test_reader(string):
	    """Run ``check_reader`` on ``string`` for each codec that can encode it.

	    Codecs from ``ENCODINGS`` that raise ``UnicodeEncodeError`` for this
	    string are skipped without failing the test.
	    """
	    for codec in ENCODINGS:
	        try:
	            # Probe only: the encoded bytes themselves are discarded.
	            string.encode(codec)
	        except UnicodeEncodeError:
	            # This codec cannot represent the string; move on to the next.
	            pass
	        else:
	            check_reader(string, codec)


	def test_reader_stream_closes_when_deleted():
	    """Invoking the reader's ``__del__`` must leave ``reader.stream`` closed."""
	    empty_source = BytesIO(b"")
	    stream_reader = SeekableUnicodeStreamReader(empty_source, "ascii")

	    # Precondition: the stream held by the reader is still open.
	    assert not stream_reader.stream.closed

	    # Call the finalizer directly instead of waiting for garbage collection.
	    stream_reader.__del__()
	    assert stream_reader.stream.closed


	def teardown_module(module=None):
	    """Module-level teardown: force a garbage-collection pass.

	    ``module`` is accepted to match the hook signature but is not used.
	    """
	    from gc import collect

	    collect()