Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /tokenize /api.py

sunnychenxiwang

update nltk

d916065 12 months ago

raw

history blame

2.36 kB

	# Natural Language Toolkit: Tokenizer Interface
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Edward Loper <[email protected]>
	# Steven Bird <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""
	Tokenizer Interface
	"""

	from abc import ABC, abstractmethod
	from typing import Iterator, List, Tuple

	from nltk.internals import overridden
	from nltk.tokenize.util import string_span_tokenize


	class TokenizerI(ABC):
	"""
	A processing interface for tokenizing a string.
	Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
	"""

	@abstractmethod
	def tokenize(self, s: str) -> List[str]:
	"""
	Return a tokenized copy of s.

	:rtype: List[str]
	"""
	if overridden(self.tokenize_sents):
	return self.tokenize_sents([s])[0]

	def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
	"""
	Identify the tokens using integer offsets ``(start_i, end_i)``,
	where ``s[start_i:end_i]`` is the corresponding token.

	:rtype: Iterator[Tuple[int, int]]
	"""
	raise NotImplementedError()

	def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
	"""
	Apply ``self.tokenize()`` to each element of ``strings``. I.e.:

	return [self.tokenize(s) for s in strings]

	:rtype: List[List[str]]
	"""
	return [self.tokenize(s) for s in strings]

	def span_tokenize_sents(
	self, strings: List[str]
	) -> Iterator[List[Tuple[int, int]]]:
	"""
	Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:

	return [self.span_tokenize(s) for s in strings]

	:yield: List[Tuple[int, int]]
	"""
	for s in strings:
	yield list(self.span_tokenize(s))


	class StringTokenizer(TokenizerI):
	"""A tokenizer that divides a string into substrings by splitting
	on the specified string (defined in subclasses).
	"""

	@property
	@abstractmethod
	def _string(self):
	raise NotImplementedError

	def tokenize(self, s):
	return s.split(self._string)

	def span_tokenize(self, s):
	yield from string_span_tokenize(s, self._string)