# EasyDetect/pipeline/nltk/test/unit/test_seekable_unicode_stream_reader.py
# Last commit: d916065 "update nltk" (sunnychenxiwang); file size 2.27 kB.
import os
from io import BytesIO
import pytest
from nltk.corpus.reader import SeekableUnicodeStreamReader
def check_reader(unicode_string, encoding):
    """Round-trip ``unicode_string`` through a ``SeekableUnicodeStreamReader``.

    The text is encoded with ``encoding``, wrapped in an in-memory byte
    stream, and read back twice: once with ``readlines()`` and once one
    character at a time with ``read(1)``.  Both passes must reproduce the
    original text.  The reader must also report position 0 when first
    opened and, after the first full read, the same position as the end of
    the underlying byte stream.

    :param unicode_string: text to encode and read back.
    :param encoding: codec name used both to encode the text and to
        configure the reader.
    :raises AssertionError: if any of the checks fails.
    """
    raw = BytesIO(unicode_string.encode(encoding))
    reader = SeekableUnicodeStreamReader(raw, encoding)

    # A freshly constructed reader reports offset zero.
    assert reader.tell() == 0

    # Joining every line returned by `.readlines()` gives back the input.
    lines = reader.readlines()
    assert unicode_string == "".join(lines)

    # With everything consumed, the reader's offset must match the end of
    # the byte stream.  The stream is moved to its end first so that its
    # own `tell()` yields that end offset.
    raw.seek(0, os.SEEK_END)
    assert reader.tell() == raw.tell()

    # Rewind and pull one character per `.read(1)` call until the reader
    # returns the empty string, then compare with the input again.
    reader.seek(0)
    chars = "".join(iter(lambda: reader.read(1), ""))
    assert unicode_string == chars
# Test matrix for `check_reader`: every sample text below is tried with every
# codec name listed here.
ENCODINGS = [
    "ascii",
    "latin1",
    "greek",
    "hebrew",
    "utf-16",
    "utf-8",
]

# Sample texts.  Line breaks are spelled out as "\n" so the exact content of
# each sample is visible regardless of how this file is formatted.
STRINGS = [
    # Short, pure-ASCII text with a leading newline.
    (
        "\n"
        "This is a test file.\n"
        "It is fairly short.\n"
    ),
    # Single line ending in a non-ASCII code point (U+0083).
    "This file can be encoded with latin1. \x83",
    # Multi-line text mixing ASCII with a few non-ASCII code points.
    (
        "This is a test file.\n"
        "Here's a blank line:\n"
        "And here's some unicode: \xee \u0123 \uffe3\n"
    ),
    # Multi-line text with code points from several Unicode ranges.
    (
        "This is a test file.\n"
        "Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555\n"
    ),
    # A long first line plus a short second line, repeated twenty times.
    (
        "This is a larger file. It has some lines that are longer "
        "than 72 characters. It's got lots of repetition. Here's "
        "some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345\n"
        "How fun! Let's repeat it twenty times.\n"
    )
    * 20,
]
@pytest.mark.parametrize("string", STRINGS)
def test_reader(string):
    """Run ``check_reader`` on ``string`` for each codec in ``ENCODINGS``.

    Codecs that cannot represent ``string`` are silently passed over, so a
    given sample is only checked with the encodings able to encode it.

    :param string: one sample text from ``STRINGS``, supplied by the
        ``parametrize`` decorator.
    """
    for codec in ENCODINGS:
        try:
            # Probe only: the encoded bytes are discarded here.
            string.encode(codec)
        except UnicodeEncodeError:
            # This codec cannot represent the sample; nothing to check.
            pass
        else:
            check_reader(string, codec)
def test_reader_stream_closes_when_deleted():
    """Check that invoking the reader's ``__del__`` closes its stream.

    ``__del__`` is called explicitly, rather than relying on garbage
    collection, so the reader is still reachable and its ``stream``
    attribute can be inspected afterwards.
    """
    empty_source = BytesIO(b"")
    reader = SeekableUnicodeStreamReader(empty_source, "ascii")

    # Open before finalisation ...
    assert not reader.stream.closed

    reader.__del__()

    # ... and closed after it.
    assert reader.stream.closed
def teardown_module(module=None):
    """Run a full garbage-collection pass.

    The name and signature match pytest's module-level teardown convention.
    NOTE(review): presumably this forces finalisation of readers left over
    from the tests in this module -- confirm the intent.

    :param module: accepted for compatibility with the hook signature; not
        used.
    """
    from gc import collect

    collect()