import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import GeoLMModel
import requests
import numpy as np
import pandas as pd
import scipy.spatial as sp
import streamlit as st
import folium
from streamlit.components.v1 import html
from haversine import haversine, Unit

dataset = None
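# High-level flow of this Streamlit app (see show_on_map() at the bottom of the file):
#   1. MLearningFormInput()  - GeoLM embeddings for every toponym detected in the input sentence.
#   2. getLocationName()     - human-readable toponym strings for the select box.
#   3. search_geonames()     - GeoNames API candidates sharing the selected name.
#   4. get50Neigbors()       - embed each candidate together with its nearest neighbors from geohash.csv.
#   5. mapping()/showing()   - cosine similarity -> sigmoid "probability" -> folium circle markers.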
def generate_human_readable(tokens, labels):
    ret = []
    for t, lab in zip(tokens, labels):
        if t == '[SEP]':
            continue
        if t.startswith("##"):
            assert len(ret) > 0
            ret[-1] = ret[-1] + t.strip('##')
        elif lab == 2:
            assert len(ret) > 0
            ret[-1] = ret[-1] + " " + t.strip('##')
        else:
            ret.append(t)
    return ret
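# Illustrative example: generate_human_readable(['Saint', 'Paul', 'Minn', '##esota'], [1, 2, 1, 1])
# returns ['Saint Paul', 'Minnesota'] - WordPieces are merged back together and I-Topo tokens are
# appended to the preceding toponym.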
def getSlice(tensor):
    # Collect consecutive indices labeled B-Topo (1) or I-Topo (2) into spans,
    # one span per detected toponym mention.
    result = []
    curr = []
    for index, value in enumerate(tensor[0]):
        if value == 1 or value == 2:
            curr.append(index)
        if value == 0 and curr != []:
            result.append(curr)
            curr = []
    if curr:
        # Close a span that runs to the very end of the sequence.
        result.append(curr)
    return result
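# Illustrative example: for predicted labels [[0, 1, 2, 0, 1, 0]], getSlice returns
# [[1, 2], [4]] - the token-index span of each toponym mention.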
def getIndex(input):
    # Model name from Hugging Face model hub
    model_name = "zekun-li/geolm-base-toponym-recognition"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Tokenize input sentence
    tokens = tokenizer.encode(input, return_tensors="pt")

    # Pass tokens through the model
    outputs = model(tokens)

    # Retrieve the predicted label id for each token
    # "id2label": { "0": "O", "1": "B-Topo", "2": "I-Topo" }
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    # One index span per detected toponym
    slices = getSlice(predicted_labels)
    return slices
def cutSlices(tensor, slicesList):
    # Pool each toponym span of the hidden states into a single 768-d vector.
    locationTensor = torch.zeros(1, len(slicesList), 768)
    curr = 0
    for span in slicesList:
        if len(span) == 1:
            # Single-token toponym: copy its hidden state directly.
            locationTensor[0][curr] = tensor[0][span[0]]
            curr = curr + 1
        if len(span) > 1:
            # Multi-token toponym: mean-pool the hidden states over the span.
            sliceTensor = tensor[0][span[0]:span[-1] + 1]
            # (len, 768) -> (1, len, 768)
            sliceTensor = sliceTensor.unsqueeze(0)
            mean = torch.mean(sliceTensor, dim=1, keepdim=True)
            locationTensor[0][curr] = mean[0]
            curr = curr + 1
    return locationTensor
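# Illustrative example: with slicesList = [[1, 2], [4]] and hidden states of shape
# (1, seq_len, 768), cutSlices returns a (1, 2, 768) tensor - one pooled vector per span.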
def MLearningFormInput(input):
    # Encode the whole sentence with GeoLM, using all-zero (dummy) spatial coordinates.
    model_name = "zekun-li/geolm-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = GeoLMModel.from_pretrained(model_name)

    tokens = tokenizer.encode(input, return_tensors="pt")
    outputs = model(tokens,
                    spatial_position_list_x=torch.zeros(tokens.shape),
                    spatial_position_list_y=torch.zeros(tokens.shape))

    # Index spans of the detected toponyms, then pool their hidden states:
    # (1, seq_len, 768) -> (1, num_toponyms, 768)
    slicesIndex = getIndex(input)
    res = cutSlices(outputs.last_hidden_state, slicesIndex)
    return res
def getLocationName(input_sentence):
    # Model name from Hugging Face model hub
    model_name = "zekun-li/geolm-base-toponym-recognition"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Tokenize input sentence
    tokens = tokenizer.encode(input_sentence, return_tensors="pt")

    # Pass tokens through the model
    outputs = model(tokens)

    # Retrieve the predicted label id for each token
    # "id2label": { "0": "O", "1": "B-Topo", "2": "I-Topo" }
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    # Keep only the tokens predicted as part of a toponym
    query_tokens = tokens[0][torch.where(predicted_labels[0] != 0)[0]]
    query_labels = predicted_labels[0][torch.where(predicted_labels[0] != 0)[0]]

    human_readable = generate_human_readable(tokenizer.convert_ids_to_tokens(query_tokens), query_labels)
    return human_readable
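# Illustrative example: getLocationName("Minneapolis is west of Saint Paul") is expected to
# return something like ['Minneapolis', 'Saint Paul'], depending on the model's predictions.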
def search_geonames(toponym, df):
    # GeoNames API endpoint
    api_endpoint = "http://api.geonames.org/searchJSON"
    username = "zekun"

    print(toponym)
    params = {
        'q': toponym,
        'username': username,
        'maxRows': 10
    }
    response = requests.get(api_endpoint, params=params)
    data = response.json()

    result = []
    lat = []
    lon = []

    global dataset
    if 'geonames' in data:
        for place_info in data['geonames']:
            latitude = float(place_info.get('lat', 0.0))
            longitude = float(place_info.get('lng', 0.0))
            lat.append(latitude)
            lon.append(longitude)

            print(latitude)
            print(longitude)

            # Embed this candidate together with its nearest GeoNames neighbors
            id = place_info.get('geonameId', '')
            print(id)
            res = get50Neigbors(id, dataset, k=50)
            result.append(res)
            print(res)

    df['lat'] = lat
    df['lon'] = lon

    # (1, 1, 768) per candidate -> (1, num_candidates, 768)
    result = torch.cat(result, dim=1).detach().numpy()
    return result
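# Note: the GeoNames search API returns at most `maxRows` candidates (10 here); each candidate's
# geonameId is then looked up in the local geohash.csv table by get50Neigbors(), so a candidate
# missing from that table would raise an IndexError there.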
def get50Neigbors(locationID, dataset, k=50):
    # Look up the candidate place in the local GeoNames snapshot.
    input_row = dataset.loc[dataset['GeonameID'] == locationID].iloc[0]

    lat, lon, geohash, name = input_row['Latitude'], input_row['Longitude'], input_row['Geohash'], input_row['Name']

    # Pre-filter by geohash prefix, then rank by haversine distance.
    filtered_dataset = dataset.loc[dataset['Geohash'].str.startswith(geohash[:5])].copy()
    filtered_dataset['distance'] = filtered_dataset.apply(
        lambda row: haversine((lat, lon), (row['Latitude'], row['Longitude']), Unit.KILOMETERS),
        axis=1
    )
    filtered_dataset = filtered_dataset.sort_values(by='distance')

    nearest_neighbors = filtered_dataset.head(k)[['Name']]
    neighbors = nearest_neighbors.values.tolist()

    model_name = "zekun-li/geolm-base-toponym-recognition"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
    cls_token_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)

    # Build a pseudo-sentence: [CLS] neighbor_1 [SEP] neighbor_2 [SEP] ... neighbor_k [SEP]
    neighbor_token_list = []
    neighbor_token_list.append(cls_token_id)

    target_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(name))

    for neighbor in neighbors:
        neighbor_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(neighbor[0]))
        neighbor_token_list.extend(neighbor_token)
        neighbor_token_list.append(sep_token_id)

    model = GeoLMModel.from_pretrained(model_name)
    tokens = torch.Tensor(neighbor_token_list).unsqueeze(0).long()

    # Feed the neighbor "sentence" through GeoLM with dummy (zero) spatial coordinates.
    outputs = model(tokens,
                    spatial_position_list_x=torch.zeros(tokens.shape),
                    spatial_position_list_y=torch.zeros(tokens.shape))

    # The nearest neighbor (distance 0) is the target place itself, so its tokens
    # occupy indices 1 .. len(target_token), right after [CLS].
    targetIndex = list(range(1, len(target_token) + 1))

    # Pool the target's hidden states: (1, len(target_token), 768) -> (1, 1, 768)
    res = cutSlices(outputs.last_hidden_state, [targetIndex])

    return res
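# Usage sketch (with `some_geoname_id` as a hypothetical placeholder): get50Neigbors(some_geoname_id, dataset)
# yields a (1, 1, 768) GeoLM embedding of that place conditioned on its 50 nearest neighbors.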
def cosine_similarity(target_feature, candidate_feature):
    target_feature = target_feature.squeeze()
    candidate_feature = candidate_feature.squeeze()

    dot_product = torch.dot(target_feature, candidate_feature)
    target = torch.norm(target_feature)
    candidate = torch.norm(candidate_feature)

    similarity = dot_product / (target * candidate)
    return similarity.item()
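# Illustrative example: cosine_similarity(torch.ones(768), torch.ones(768)) is ~1.0,
# while orthogonal vectors give ~0.0.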
@st.cache_data
def getCSV():
    dataset = pd.read_csv('geohash.csv')
    return dataset
def showing(df):
    # Center the map on the mean of the candidate coordinates.
    m = folium.Map(location=[df['lat'].mean(), df['lon'].mean()], zoom_start=5)

    size_scale = 100
    color_scale = 255

    for i in range(len(df)):
        lat, lon, prob = df.iloc[i]['lat'], df.iloc[i]['lon'], df.iloc[i]['prob']

        # Higher probability -> larger, redder circle.
        size = int(prob ** 2 * size_scale)
        color = int(prob ** 2 * color_scale)

        folium.CircleMarker(
            location=[lat, lon],
            radius=size,
            color=f'#{color:02X}0000',
            fill=True,
            fill_color=f'#{color:02X}0000'
        ).add_to(m)

    # Render the folium map inside the Streamlit page.
    m.save("map.html")
    with open("map.html", "r", encoding="utf-8") as f:
        map_html = f.read()
    st.components.v1.html(map_html, height=600)
def mapping(selected_place, locations, sentence_info):
    location_index = locations.index(selected_place)
    print(location_index)

    df = pd.DataFrame()

    # All GeoNames candidates sharing the selected name (e.g. every "Beijing").
    same_name_embedding = search_geonames(selected_place, df)

    sim_matrix = []
    same_name_embedding = torch.tensor(same_name_embedding)

    # Compare the in-sentence toponym embedding against each candidate embedding.
    for i in range(same_name_embedding.size(1)):
        print((sentence_info[:, location_index, :]).shape)
        print((same_name_embedding[:, i, :]).shape)
        similarities = cosine_similarity(sentence_info[:, location_index, :], same_name_embedding[:, i, :])
        sim_matrix.append(similarities)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Squash similarities into (0, 1) so they can drive marker size/color.
    prob_matrix = sigmoid(np.array(sim_matrix))
    df['prob'] = prob_matrix

    print(df)
    showing(df)
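# Illustrative note: sigmoid(cos_sim) maps similarities in [-1, 1] to roughly (0.27, 0.73),
# so the marker size/color in showing() vary within that band rather than the full [0, 1] range.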
def show_on_map():
    input = st.text_area("Enter a sentence:", height=200)
    st.button("Submit")

    # Avoid running the models on an empty sentence (e.g. on the first page load).
    if not input.strip():
        return

    # 1. sentence -> (1, num_toponyms, 768) embedding tensor
    sentence_info = MLearningFormInput(input)
    print("sentence info: ")
    print(sentence_info)
    print(sentence_info.shape)

    # 2. sentence -> list of detected toponym strings
    locations = getLocationName(input)

    selected_place = st.selectbox("Select a location:", locations)

    if selected_place is not None:
        mapping(selected_place, locations, sentence_info)


if __name__ == "__main__":
    dataset = getCSV()
    show_on_map()
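# To launch the app locally (assuming the dependencies and geohash.csv are available):
#   streamlit run app.py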
# Earlier command-line version of the pipeline, kept for reference (can be hidden):
# #len: 80
# input= 'Minneapolis, officially the City of Minneapolis, is a city in the state of Minnesota and the county seat of Hennepin County. making it the largest city in Minnesota and the 46th-most-populous in the United States. Nicknamed the "City of Lakes", Minneapolis is abundant in water, with thirteen lakes, wetlands, the Mississippi River, creeks, and waterfalls.'
# 1. input: a sentence -> output: tensor (1,num_locations,768)
# sentence_info= MLearningFormInput(input)
# print("sentence info: ")
# print(sentence_info)
# print(sentence_info.shape)
# # input: a sentence -> output : locations
# locations=getLocationName(input)
# print(locations)
# j=0
# k=0
# for location in locations:
# if k==0:
# # input: locations -> output: search in geoname(get top 10 items) -> loop each item -> num_location x 10 x (1,1,768)
# same_name_embedding=search_geonames(location)
# sim_matrix=[]
# print(sim_matrix)
# same_name_embedding=torch.tensor(same_name_embedding)
# # loop each "Beijing"
# for i in range(same_name_embedding.size(1)):
# # print((sentence_info[:, j, :]).shape)
# # print((same_name_embedding[:, i, :]).shape)
# similarities = cosine_similarity(sentence_info[:, j, :], same_name_embedding[:, i, :])
# sim_matrix.append(similarities)
# j=j+1
# print("Cosine Similarity Matrix:")
# print(sim_matrix)
# k=1
# else:
# break