File size: 2,982 Bytes
3e22f77
31ae1e9
3e22f77
7a321ee
3e22f77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a321ee
3e22f77
 
7a321ee
3e22f77
 
7a321ee
3e22f77
 
 
 
 
 
 
 
 
 
e0e99ed
3e22f77
31ae1e9
 
 
 
 
 
 
 
 
 
910a9eb
31ae1e9
910a9eb
31ae1e9
3e22f77
 
7a321ee
 
 
 
 
 
3e22f77
 
910a9eb
7a321ee
3e22f77
7a321ee
 
 
31ae1e9
7a321ee
 
 
 
 
3e22f77
7a321ee
3f23b29
 
 
 
 
e804a95
 
 
6534a76
7a321ee
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import requests
import gradio as gr
import pandas as pd
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

def process_tweet(tweet):
    """Normalize a raw tweet before it is handed to the models.

    Steps, in order: drop links (both ``www.<...>`` and ``http(s)://<...>``),
    drop ``@username`` mentions, collapse every whitespace run to a single
    space, turn ``#hashtag`` into ``hashtag``, and strip quote characters
    from both ends.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text. Only ``'`` and ``"`` are stripped from the ends;
        a single space left behind by a removed leading/trailing token is
        kept.
    """
    # remove links -- the www branch must consume NON-whitespace ([^\s]+);
    # the previous [\s]+ only matched "www." followed by blanks, so bare
    # www.example.com links were never removed
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    # remove usernames
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # remove additional white spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # replace hashtags with words
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # trim surrounding quote characters
    tweet = tweet.strip('\'"')
    return tweet

# Tokenizer for the coordinates checkpoint; used by predict_coordinates.
tokenizer = AutoTokenizer.from_pretrained("azamat/geocoder_coordinates_model")

# Classification pipeline whose top label/score is read by predict_relevancy.
relevancy_pipeline = pipeline(
    "sentiment-analysis",
    model="azamat/geocoder_relevancy_model",
)

# Sequence-classification model whose first two outputs are read as lat/lon.
coordinates_model = AutoModelForSequenceClassification.from_pretrained("azamat/geocoder_coordinates_model")

def predict_relevancy(text):
    """Classify *text* with the relevancy pipeline.

    Returns:
        A ``(label, score)`` tuple taken from the first result the
        pipeline produces for the input.
    """
    top_result = relevancy_pipeline(text)[0]
    return top_result['label'], top_result['score']

def predict_coordinates(text):
    """Predict a coordinate pair for *text* with the coordinates model.

    The text is tokenized (padded/truncated to 128 tokens) and passed
    through ``coordinates_model``; the first two values of the first output
    row are returned, rounded to three decimals. The caller unpacks them as
    ``lat, lon``.

    Args:
        text: Preprocessed tweet text.

    Returns:
        A ``(float, float)`` tuple.
    """
    # Local import keeps the file's top-level import block untouched; torch
    # is already required because the tokenizer returns 'pt' tensors.
    import torch

    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    # Inference only: skip autograd bookkeeping so no graph is built and
    # retained for every request.
    with torch.no_grad():
        outputs = coordinates_model(**encoding)
    first_row = outputs[0][0]
    return round(first_row[0].item(), 3), round(first_row[1].item(), 3)

def reverse_geocode(lat, lon, timeout=10):
    """Look up a human-readable place name for a coordinate pair.

    Sends a GET request to ``https://geocode.maps.co/reverse`` and returns
    the ``display_name`` field of the JSON response.

    Args:
        lat: Latitude to look up.
        lon: Longitude to look up.
        timeout: Seconds to wait for the service before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The ``display_name`` string, or ``"No data"`` when the request
        fails, the response is not valid JSON, or the field is missing.
    """
    payload = {
        'lat'             : lat,
        'lon'             : lon,
        'zoom'            : 12,
        'format'          : 'jsonv2',
        'accept-language' : 'en'
    }
    try:
        r = requests.get(
            'https://geocode.maps.co/reverse',
            params=payload,
            timeout=timeout,
        )
        # Treat HTTP error statuses as "no data" explicitly rather than
        # relying on the error body lacking a display_name key.
        r.raise_for_status()
        return r.json()['display_name']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: network/HTTP failures, undecodable JSON, or an
        # unexpected payload shape all degrade to the placeholder. Unlike a
        # bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
        return "No data"

def predict(text):
    """Run the full geocoding pipeline on one tweet.

    The tweet is cleaned with ``process_tweet`` and scored for relevancy.
    If the label is not ``'relevant'`` a row of placeholder values is
    returned; otherwise coordinates are predicted and reverse-geocoded.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns
        ``relevancy_score``, ``lat``, ``lon`` and ``reversed lat/lon``.
    """
    cleaned = process_tweet(text)
    label, score = predict_relevancy(cleaned)

    # Guard clause: irrelevant tweets get the default (empty) row.
    if label != 'relevant':
        return pd.DataFrame([{
            "relevancy_score"  : 0,
            "lat"              : 0,
            "lon"              : 0,
            "reversed lat/lon" : "",
        }])

    lat, lon = predict_coordinates(cleaned)
    row = {
        "relevancy_score"  : round(score * 100, 2),  # shown as a percentage
        "lat"              : lat,
        "lon"              : lon,
        "reversed lat/lon" : reverse_geocode(lat, lon),
    }
    return pd.DataFrame([row])

# Build the Gradio UI. Components are created inside the Blocks context in
# the order they should appear: header text, then the input box, then the
# result table.
with gr.Blocks() as demo:

    # Static header describing the three pipeline stages.
    gr.Markdown("# **<p align='center'>Twitter geocoding with 🤗 Transformers</p>**")
    gr.Markdown("# **<div align='center'>Pipeline consists of:</div>**")
    gr.Markdown("# **<div align='center'>1) Relevancy scoring model</div>**")
    gr.Markdown("# **<div align='center'>2) Coordinate predicting model</div>**")
    gr.Markdown("# **<div align='center'>3) Nominatim API for reverse geocoding lat/lon</div>**")

    # Single text input; its submit event runs predict() and the returned
    # DataFrame is rendered in the "Geocoded data" table.
    inputs = gr.Textbox(placeholder="Enter the tweet")
    outputs = [gr.Dataframe(label="Geocoded data")]
    inputs.submit(predict, inputs=inputs, outputs=outputs)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()