import re
import requests
import gradio as gr
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

def process_tweet(tweet):
    # remove links
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    # remove usernames
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # remove additional white spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # replace hashtags with words
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # trim
    tweet = tweet.strip('\'"')
    return tweet
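# Illustrative example (hypothetical input, not from the original data):
#   process_tweet('@user Flooding near downtown #Houston https://example.com/x')
# would return roughly 'Flooding near downtown Houston' (up to leftover spaces),
# since links and @-mentions are stripped and hashtags keep only the word.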

tokenizer = AutoTokenizer.from_pretrained(
    "azamat/geocoder_coordinates_model"
)

relevancy_pipeline = pipeline(
    "sentiment-analysis", model="azamat/geocoder_relevancy_model"
)

coordinates_model = AutoModelForSequenceClassification.from_pretrained(
    "azamat/geocoder_coordinates_model",
)

def predict_relevancy(text):
    outputs = relevancy_pipeline(text)
    return outputs[0]['label'], outputs[0]['score']
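# The pipeline returns a list of dicts such as [{'label': 'relevant', 'score': 0.97}]
# (the label names come from the model config; the score shown here is illustrative).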

def predict_coordinates(text):
    encoding = tokenizer(
        text, padding="max_length", truncation=True,
        max_length=128, return_tensors='pt'
    )
    outputs = coordinates_model(**encoding)
    # the two regression outputs are interpreted as latitude and longitude
    return round(outputs[0][0][0].item(), 3), round(outputs[0][0][1].item(), 3)

def reverse_geocode(lat, lon):
    payload = {
        'lat'             : lat, 
        'lon'             : lon, 
        'zoom'            : 12, 
        'format'          : 'jsonv2',
        'accept-language' : 'en'
    }
    try:
        r = requests.get('https://geocode.maps.co/reverse', params=payload, timeout=10)
        return r.json()['display_name']
    except Exception:
        return "No data"

def predict(text):
    text = process_tweet(text)
    data = {
        "relevancy_score"  : 0,
        "lat"              : 0,
        "lon"              : 0,
        "reversed lat/lon" : ""
    }
    relevancy_label, relevancy_score = predict_relevancy(text)
    if relevancy_label == 'relevant':
        data['relevancy_score'] = round(relevancy_score * 100, 2)
        
        lat, lon = predict_coordinates(text)
        data['lat'] = lat
        data['lon'] = lon
        
        reverse_geocoded = reverse_geocode(lat, lon)
        data['reverse geocoded address'] = reverse_geocoded
        
    return pd.DataFrame([data])
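# predict() always returns a single-row DataFrame with the columns
# relevancy_score / lat / lon / reverse geocoded address; for tweets scored as
# not relevant, the coordinate and address fields keep their default values.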

with gr.Blocks() as demo:

    gr.Markdown("# **<p align='center'>Twitter geocoding with 🤗 Transformers</p>**")
    gr.Markdown("### <div align='left'>Pipeline consists of:</div>")
    gr.Markdown("### <div align='left'>1) Relevancy scoring model - predicts whether a tweet has geocoding related information</div>")
    gr.Markdown("### <div align='left'>2) Coordinate predicting model - predicts exact latitude and longitude of user by tweet</div>")
    gr.Markdown("### <div align='left'>3) Nominatim API for reverse geocoding lat/lon - uses open street map to reverse geocode lat and lon</div>")

    inputs = gr.Textbox(placeholder="Enter a tweet")
    outputs = [gr.Dataframe(label="Geocoded data")]
    inputs.submit(predict, inputs=inputs, outputs=outputs)

if __name__ == "__main__":
    demo.launch()
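
# Running `python app.py` launches the Gradio server (on http://localhost:7860 by
# default); the two models are downloaded from the Hugging Face Hub on first use.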