File size: 10,120 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ewan Klein <[email protected]>
#         Lorenzo Rubio <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

Utility functions for the `twitterclient` module which do not require
the `twython` library to have been installed.

"""
import csv
import gzip
import json

from nltk.internals import deprecated

HIER_SEPARATOR = "."


def extract_fields(tweet, fields):
    """
    Extract field values from a full tweet and return them as a list.

    Each entry of ``fields`` is either a plain key of ``tweet`` or a
    hierarchical key whose components are joined with ``HIER_SEPARATOR``
    (e.g. ``'user.id'``); hierarchical keys are resolved by descending into
    the nested objects.

    :param json tweet: The tweet in JSON format
    :param list fields: The fields to be extracted from the tweet
    :return: the extracted values, in the same order as ``fields``
    :rtype: list
    :raises RuntimeError: if a ``TypeError`` occurs while resolving a field
        (for instance when an intermediate value cannot be indexed by key);
        the original error is chained as ``__cause__``
    """
    out = []
    for field in fields:
        try:
            _add_field_to_out(tweet, field, out)
        except TypeError as e:
            # Build a single message string: passing the field name as a
            # second positional argument makes str(exc) render as a tuple.
            raise RuntimeError(
                f"Fatal error when extracting fields. Cannot find field {field}"
            ) from e
    return out


def _add_field_to_out(json, field, out):
    """
    Append the value addressed by ``field`` inside ``json`` to ``out``.

    A composed field (one containing ``HIER_SEPARATOR``) is resolved by
    recursing into the object stored under its first component; a plain
    field is looked up directly.
    """
    if not _is_composed_key(field):
        out.append(json[field])
        return
    head, remainder = _get_key_value_composed(field)
    _add_field_to_out(json[head], remainder, out)


def _is_composed_key(field):
    """Tell whether ``field`` contains ``HIER_SEPARATOR`` (a hierarchical key)."""
    has_separator = HIER_SEPARATOR in field
    return has_separator


def _get_key_value_composed(field):
    """
    Split a composed field into its first component and the remainder.

    Only the first ``HIER_SEPARATOR`` is consumed, so the remainder may
    itself still be a composed key (deeper levels are handled by the
    callers). With ``'.'`` as separator, ``'user.entities.urls'`` yields
    ``('user', 'entities.urls')``.
    """
    head, _, remainder = field.partition(HIER_SEPARATOR)
    return head, remainder


def _get_entity_recursive(json, entity):
    """
    Look up ``entity`` in a (possibly nested) Twitter JSON structure.

    For a dict, the value stored directly under ``entity`` is returned; the
    search only descends into the ``'entities'`` and ``'extended_entities'``
    wrappers, which contain other Twitter objects. See:
    https://dev.twitter.com/overview/api/entities-in-twitter-objects
    For a list, each element is searched in order. Any falsy input, or an
    input that is neither a dict nor a list, yields ``None``.

    :return: the first match found, or ``None`` when there is none
    """
    if not json:
        return None

    if isinstance(json, dict):
        for name, child in json.items():
            if name == entity:
                return child
            if name in ("entities", "extended_entities"):
                found = _get_entity_recursive(child, entity)
                if found is not None:
                    return found
        return None

    if isinstance(json, list):
        # Lazily search each element and stop at the first non-None result.
        hits = (_get_entity_recursive(element, entity) for element in json)
        return next((hit for hit in hits if hit is not None), None)

    return None


def json2csv(
    fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False
):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full tweets to be easily converted
    to a CSV file for easier processing. For example, just TweetIDs or
    just the text content of the Tweets can be extracted.

    Additionally, the function allows combinations of fields of other Twitter
    objects (mainly the users, see below).

    For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
    `json2csv_entities`

    :param fp: An iterable of lines (e.g. an open file); each line is parsed
        as one full tweet in JSON format

    :param str outfile: The name of the text file where results should be
        written

    :param list fields: The list of fields to be extracted. Useful examples
        are 'id_str' for the tweetID and 'text' for the text of the tweet. See
        <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
        e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
        Additionally, it allows IDs from other Twitter objects, e. g.,
        ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']

    :param str encoding: The text encoding used for the output file

    :param errors: Behaviour for encoding errors, see
        https://docs.python.org/3/library/codecs.html#codec-base-classes

    :param gzip_compress: if `True`, output files are compressed with gzip
    """
    (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
    # Close the output even when a line fails to parse or a field cannot be
    # extracted; otherwise the handle would leak.
    try:
        # write the list of fields as header
        writer.writerow(fields)
        # process the file
        for line in fp:
            tweet = json.loads(line)
            writer.writerow(extract_fields(tweet, fields))
    finally:
        outf.close()


@deprecated("Use open() and csv.writer() directly instead.")
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Get a CSV writer with optional compression.

    Thin wrapper that forwards its arguments to ``_outf_writer`` and returns
    the resulting ``(writer, outf)`` pair.
    """
    writer_and_handle = _outf_writer(
        outfile, encoding, errors, gzip_compress=gzip_compress
    )
    return writer_and_handle


def _outf_writer(outfile, encoding, errors, gzip_compress=False):
    """
    Open ``outfile`` for text writing and wrap it in a CSV writer.

    :param outfile: The path handed to ``gzip.open`` or ``open``
    :param encoding: The text encoding of the output
    :param errors: The encoding error handler of the output
    :param gzip_compress: if true, the file is opened through ``gzip.open``
    :return: a ``(writer, outf)`` tuple; this function does not close
        ``outf``, that is left to the caller
    """
    # newline="" lets the csv module control line endings itself.
    text_options = {"newline": "", "encoding": encoding, "errors": errors}
    if gzip_compress:
        handle = gzip.open(outfile, "wt", **text_options)
    else:
        handle = open(outfile, "w", **text_options)
    return (csv.writer(handle), handle)


def json2csv_entities(
    tweets_file,
    outfile,
    main_fields,
    entity_type,
    entity_fields,
    encoding="utf8",
    errors="replace",
    gzip_compress=False,
):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full Tweets to be easily converted
    to a CSV file for easier processing of Twitter entities. For example, the
    hashtags or media elements of a tweet can be extracted.

    It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
    there will be two lines in the output file, one per hashtag

    :param tweets_file: the file-like object containing full Tweets, one
        JSON document per line

    :param str outfile: The path of the text file where results should be
        written

    :param list main_fields: The list of fields to be extracted from the main
        object, usually the tweet. Useful examples: 'id_str' for the tweetID. See
        <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
        e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
        If `entity_type` is expressed with hierarchy, then it is the list of
        fields of the object that corresponds to the key of the entity_type,
        (e.g., for entity_type='user.urls', the fields in the main_fields list
        belong to the user object; for entity_type='place.bounding_box', the
        fields in the main_fields list belong to the place object of the tweet).

    :param str entity_type: The name of the entity: 'hashtags', 'media',
        'urls' and 'user_mentions' for the tweet object. For a user object,
        this needs to be expressed with a hierarchy: `'user.urls'`. For the
        bounding box of the Tweet location, use `'place.bounding_box'`.

    :param list entity_fields: The list of fields to be extracted from the
        entity. E.g. `['text']` (of the Tweet)

    :param str encoding: The text encoding used for the output file

    :param errors: Behaviour for encoding errors, see
        https://docs.python.org/3/library/codecs.html#codec-base-classes

    :param gzip_compress: if `True`, output files are compressed with gzip
    """
    (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
    # Close the output even if building the header or processing a tweet
    # raises; otherwise the handle would leak.
    try:
        header = get_header_field_list(main_fields, entity_type, entity_fields)
        writer.writerow(header)

        # The entity type does not change between tweets, so split it once.
        if _is_composed_key(entity_type):
            parent_key, entity_name = _get_key_value_composed(entity_type)
        else:
            parent_key, entity_name = None, entity_type

        for line in tweets_file:
            tweet = json.loads(line)
            if parent_key is None:
                # main_fields and the entity both belong to the tweet itself
                object_json = tweet
            else:
                object_json = _get_entity_recursive(tweet, parent_key)
                if not object_json:
                    # this can happen in the case of "place"
                    continue
            object_fields = extract_fields(object_json, main_fields)
            items = _get_entity_recursive(object_json, entity_name)
            _write_to_file(object_fields, items, entity_fields, writer)
    finally:
        outf.close()


def get_header_field_list(main_fields, entity_type, entity_fields):
    """
    Build the CSV header row for the given main and entity fields.

    Entity field names are prefixed with the entity name. When
    ``entity_type`` is hierarchical (e.g. ``'user.urls'``) and its first
    component is non-empty, the main field names are prefixed with that
    first component and the entity field names with the remainder;
    otherwise the main field names are used as given.

    :param list main_fields: names of the fields taken from the main object
    :param str entity_type: the entity name, optionally hierarchical
    :param list entity_fields: names of the fields taken from the entity
    :return: the main column names followed by the entity column names
    """
    if _is_composed_key(entity_type):
        parent, child = _get_key_value_composed(entity_type)
    else:
        parent, child = None, entity_type

    def qualify(prefix, names):
        # Prefix every name with ``prefix`` and the hierarchy separator.
        return [HIER_SEPARATOR.join((prefix, name)) for name in names]

    main_header = qualify(parent, main_fields) if parent else main_fields
    entity_header = qualify(child, entity_fields)
    return main_header + entity_header


def _write_to_file(object_fields, items, entity_fields, writer):
    """
    Write the CSV row(s) for one object and the entity extracted from it.

    :param list object_fields: values already extracted from the main
        object; they start every row and are not modified
    :param items: the entity value. A falsy value writes nothing, a dict
        writes a single row, anything else is iterated and writes one row
        per item
    :param list entity_fields: names of the fields to read from the entity;
        may contain composed keys (containing ``HIER_SEPARATOR``)
    :param writer: object whose ``writerow`` method receives each row
    :raises RuntimeError: in the dict case, if a composed entity field names
        a key whose value is not a dictionary
    """
    if not items:
        # it could be that the entity is just not present for the tweet
        # e.g. tweet hashtag is always present, even as [], however
        # tweet media may not be present
        return
    if isinstance(items, dict):
        # this happens e.g. for "place" of a tweet
        # Work on a copy: extending object_fields itself would silently
        # grow the caller's list.
        row = list(object_fields)
        # there might be composed keys in the list of required fields
        entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
        entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
        # NOTE: plain fields are emitted first and composed fields after
        # them, regardless of their relative order in entity_fields.
        for field in entity_field_values:
            value = items[field]
            if isinstance(value, list):
                # list values are flattened into the row
                row += value
            else:
                row.append(value)
        # now check required dictionaries
        for d in entity_field_composed:
            kd, vd = _get_key_value_composed(d)
            json_dict = items[kd]
            if not isinstance(json_dict, dict):
                # Single-line message: the previous triple-quoted literal
                # embedded a line break and indentation in the text.
                raise RuntimeError(
                    "Key {} does not contain a dictionary in the json file".format(kd)
                )
            row.append(json_dict[vd])
        writer.writerow(row)
        return
    # in general it is a list
    for item in items:
        row = object_fields + extract_fields(item, entity_fields)
        writer.writerow(row)