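# NOTE: "Spaces: Sleeping / Sleeping" appears to be page-banner residue captured
# together with this file; it is not part of the module.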
import os
from io import BytesIO

import pytest
from nltk.corpus.reader import SeekableUnicodeStreamReader
def check_reader(unicode_string, encoding):
    """Round-trip ``unicode_string`` through a ``SeekableUnicodeStreamReader``.

    The string is encoded with ``encoding`` into an in-memory byte stream and a
    reader is opened on that stream with the same encoding.  The helper then
    asserts that:

    * the reader starts at offset 0;
    * joining ``reader.readlines()`` reproduces the original string;
    * after reading everything, ``reader.tell()`` equals the end-of-stream
      offset of the underlying byte stream;
    * after seeking back to 0, reading one character at a time also
      reproduces the original string.

    :param unicode_string: text to encode and read back.
    :param encoding: codec name used both for encoding the text and for
        constructing the reader.
    :raises AssertionError: if any of the checks above fails.
    :raises UnicodeEncodeError: if ``unicode_string`` cannot be represented in
        ``encoding`` (propagated from ``str.encode``).
    """
    encoded = unicode_string.encode(encoding)
    stream = BytesIO(encoded)
    reader = SeekableUnicodeStreamReader(stream, encoding)

    # Should open at the start of the file.
    assert reader.tell() == 0

    # Compare original string to contents from `.readlines()`.
    assert unicode_string == "".join(reader.readlines())

    # Should be at the end of the file now: move the raw stream to its end and
    # compare the reader's reported position against it.
    stream.seek(0, os.SEEK_END)
    assert reader.tell() == stream.tell()

    reader.seek(0)  # go back to start

    # Compare original string to contents from `.read()`.  Reading a single
    # character per call is deliberate: it exercises the reader's small-read
    # path.  `iter(callable, sentinel)` stops at the first empty string, i.e.
    # at end of stream.
    contents = "".join(iter(lambda: reader.read(1), ""))
    assert unicode_string == contents
# Call `check_reader` with a variety of input strings and encodings.

# Codecs every sample string is tried with; combinations that cannot be
# encoded are skipped by `test_reader`.
ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"]

# Sample texts, written with explicit escapes so their exact content
# (newlines, non-ASCII code points) is unambiguous.
STRINGS = [
    # Short text that starts with a newline.
    "\nThis is a test file.\nIt is fairly short.\n",
    # Single line with one non-ASCII code point (U+0083).
    "This file can be encoded with latin1. \x83",
    # NOTE(review): the text mentions a blank line but none is visible in the
    # captured source -- it may have been lost; confirm against upstream.
    (
        "This is a test file.\n"
        "Here's a blank line:\n"
        "And here's some unicode: \xee \u0123 \uffe3\n"
    ),
    # Code points from several different Unicode ranges.
    (
        "This is a test file.\n"
        "Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555\n"
    ),
    # Larger input: one long line plus one short line, repeated twenty times.
    (
        "This is a larger file. It has some lines that are longer "
        "than 72 characters. It's got lots of repetition. Here's "
        "some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345\n"
        "How fun! Let's repeat it twenty times.\n"
    )
    * 20,
]
@pytest.mark.parametrize("string", STRINGS)
def test_reader(string):
    """Run ``check_reader`` on ``string`` under every encoding that can hold it.

    The ``string`` argument is supplied by the parametrization over
    ``STRINGS``, so each sample text is reported as its own test case.

    :param string: one sample text from ``STRINGS``.
    """
    for encoding in ENCODINGS:
        # Skip strings that can't be encoded with the current encoding; only
        # the probe encode is guarded so that failures inside `check_reader`
        # are never swallowed.
        try:
            string.encode(encoding)
        except UnicodeEncodeError:
            continue
        check_reader(string, encoding)
def test_reader_stream_closes_when_deleted():
    """Check that the reader's finalizer closes the stream it wraps.

    A reader is opened on an empty in-memory byte stream; the stream must be
    open before ``__del__`` runs and closed afterwards.
    """
    reader = SeekableUnicodeStreamReader(BytesIO(b""), "ascii")
    assert not reader.stream.closed
    # Call the finalizer directly instead of `del reader`, so the check does
    # not depend on when the interpreter actually collects the object.
    reader.__del__()
    assert reader.stream.closed
def teardown_module(module=None):
    """Force a full garbage-collection pass.

    The name follows pytest's module-level teardown hook convention, under
    which pytest calls it after the tests in this module have run.

    :param module: accepted for the hook signature; not used here.
    """
    import gc

    gc.collect()