# Spaces: sunnychenxiwang / EasyDetect (Sleeping)
# File size: 2,265 Bytes
# d916065

import os
from io import BytesIO

import pytest

from nltk.corpus.reader import SeekableUnicodeStreamReader


def check_reader(unicode_string, encoding):
    """Round-trip ``unicode_string`` through a ``SeekableUnicodeStreamReader``.

    The text is encoded with ``encoding``, wrapped in an in-memory byte
    stream, and read back twice: once with ``readlines()`` and once a single
    character at a time with ``read(1)``. Both results must equal the input,
    and ``tell()`` is checked at the start and at the end of the stream.
    """
    raw = BytesIO(unicode_string.encode(encoding))
    reader = SeekableUnicodeStreamReader(raw, encoding)

    # A freshly created reader reports position zero.
    assert reader.tell() == 0

    # Joining every line returned by `.readlines()` must rebuild the input.
    lines = reader.readlines()
    assert unicode_string == "".join(lines)

    # Move the raw stream to its end; the reader must report the same offset.
    raw.seek(0, os.SEEK_END)
    assert reader.tell() == raw.tell()

    # Rewind, then pull one character per call until `.read(1)` yields "".
    reader.seek(0)
    one_by_one = "".join(iter(lambda: reader.read(1), ""))
    assert unicode_string == one_by_one


# Fixtures for `test_reader`: every string in STRINGS is passed to
# `check_reader` with each encoding in ENCODINGS that is able to encode it.
ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"]

STRINGS = [
    # Short, ASCII-only text.
    """

    This is a test file.

    It is fairly short.

    """,
    # Ends with U+0083, which is outside the ASCII range.
    "This file can be encoded with latin1. \x83",
    # Contains blank lines plus several non-ASCII code points.
    """\

    This is a test file.

    Here's a blank line:



    And here's some unicode: \xee \u0123 \uffe3

    """,
    # A second sample with a different set of non-ASCII code points.
    """\

    This is a test file.

    Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555

    """,
    # Larger input: uses backslash-newline continuations inside the literal
    # and repeats the whole text 20 times.
    """\

    This is a larger file.  It has some lines that are longer \

    than 72 characters.  It's got lots of repetition.  Here's \

    some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345



    How fun!  Let's repeat it twenty times.

    """
    * 20,
]


@pytest.mark.parametrize("string", STRINGS)
def test_reader(string):
    """Run `check_reader` on ``string`` for each encoding that can encode it."""
    for codec in ENCODINGS:
        try:
            string.encode(codec)
        except UnicodeEncodeError:
            # This codec cannot represent the string; nothing to check.
            pass
        else:
            check_reader(string, codec)


def test_reader_stream_closes_when_deleted():
    """Check that calling the reader's ``__del__`` closes its stream."""
    empty_stream = BytesIO(b"")
    reader = SeekableUnicodeStreamReader(empty_stream, "ascii")

    # The stream starts out open ...
    assert not reader.stream.closed

    # ... and is closed once the finalizer has been invoked explicitly.
    reader.__del__()
    assert reader.stream.closed


def teardown_module(module=None):
    """Module-level teardown hook: run a garbage-collection pass.

    ``module`` is accepted for compatibility with the hook signature and is
    not used.
    """
    from gc import collect

    collect()