# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ewan Klein <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
""" | |
A reader for corpora that consist of Tweets. It is assumed that the Tweets | |
have been serialised into line-delimited JSON. | |
""" | |
import json
import os

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
from nltk.tokenize import TweetTokenizer
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of
    line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', fileids=r'.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, r'.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    CorpusView = StreamBackedCorpusView
    """
    The corpus view class used by this reader.
    """

    def __init__(self, root, fileids=None, word_tokenizer=None, encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.  If ``None``
            (the default), a fresh ``TweetTokenizer`` is created per reader
            instance.  (A ``None`` sentinel is used rather than
            ``TweetTokenizer()`` in the signature, which would be evaluated
            once at definition time and shared by every reader.)
        :param encoding: Character encoding of the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  Files
        # inside zip archives cannot be sized with os.path.getsize, so they
        # are skipped.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")

        self._word_tokenizer = (
            word_tokenizer if word_tokenizer is not None else TweetTokenizer()
        )

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :param fileids: A list or regexp specifying the fileids to read;
            ``None`` selects the whole corpus.
        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s).

        :param fileids: A list or regexp specifying the fileids to read;
            ``None`` selects the whole corpus.
        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono["text"]
                if isinstance(text, bytes):
                    # NOTE(review): json.loads yields str on Python 3, so
                    # this branch looks dead; also `self.encoding` appears to
                    # be the CorpusReader *method*, not a codec name — confirm
                    # before relying on this path.
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                # Lines without a 'text' field (e.g. status deletion
                # notices) are skipped rather than treated as errors.
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :param fileids: A list or regexp specifying the fileids to read;
            ``None`` selects the whole corpus.
        :return: the given file(s) as a list of the text content of Tweets
            as a list of words, screennames, hashtags, URLs and punctuation
            symbols.
        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.

        Reads at most 10 Tweets per call: ``StreamBackedCorpusView`` calls
        this block reader repeatedly to page through the file, so 10 is the
        view's block size, not a cap on the corpus.
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                # End of stream: return the (possibly short) final block.
                return tweets
            tweets.append(json.loads(line))
        return tweets