radames committed on
Commit
9663a4b
·
1 Parent(s): 57535ba

add transformers embeddings and UMAP

Browse files
Files changed (3) hide show
  1. app.py +29 -84
  2. embeddings_encoder.py +45 -0
  3. umap_reducer.py +12 -22
app.py CHANGED
@@ -1,9 +1,9 @@
1
  from umap_reducer import UMAPReducer
 
2
  from flask import Flask, request, render_template, jsonify, make_response
3
  from flask_cors import CORS
4
  import os
5
  from dotenv import load_dotenv
6
- from transformers import pipeline
7
  import feedparser
8
  import json
9
  from dateutil import parser
@@ -13,12 +13,10 @@ import gzip
13
 
14
  load_dotenv()
15
 
16
- # Load Setiment Classifier
17
- # sentiment_analysis = pipeline(
18
- # "sentiment-analysis", model="siebert/sentiment-roberta-large-english")
19
  app = Flask(__name__, static_url_path='/static')
20
  reducer = UMAPReducer()
21
-
22
  CORS(app)
23
 
24
 
@@ -27,87 +25,34 @@ def index():
27
  return render_template('index.html')
28
 
29
 
30
- @app.route('/run-umap') # //methods=['POST'])
31
  def run_umap():
32
- data = np.random.rand(512, 4)
33
-
34
- # UMAP embeddings
35
- embeddings = reducer.embed(data)
36
-
37
- content = gzip.compress(json.dumps(embeddings.tolist()).encode('utf8'), 5)
38
- response = make_response(content)
39
- response.headers['Content-length'] = len(content)
40
- response.headers['Content-Encoding'] = 'gzip'
41
- return response
42
-
43
-
44
- # @app.route('/news')
45
- # def get_news():
46
- # feed_url = request.args.get('feed_url')
47
- # # check if string is a valid
48
-
49
- # # file name for cache
50
- # file_name = "".join(re.split(r"https://|\.|/", feed_url))
51
-
52
- # feed_entries = get_feed(feed_url)
53
- # # filter only titles for sentiment analysis
54
- # try:
55
- # with open(f'{file_name}_cache.json') as file:
56
- # cache = json.load(file)
57
- # except:
58
- # cache = {}
59
-
60
- # # if new homepage is newer than cache, update cache and return
61
- # print("new date", feed_entries['last_update'])
62
- # print("old date", cache['last_update']
63
- # if 'last_update' in cache else "None")
64
- # if not cache or parser.parse(feed_entries['last_update']) > parser.parse(cache['last_update']):
65
- # print("Updating cache with new preditions")
66
- # titles = [entry['title'] for entry in feed_entries['entries']]
67
- # # run sentiment analysis on titles
68
- # predictions = [sentiment_analysis(sentence) for sentence in titles]
69
- # # parse Negative and Positive, normalize to -1 to 1
70
- # predictions = [-prediction[0]['score'] if prediction[0]['label'] ==
71
- # 'NEGATIVE' else prediction[0]['score'] for prediction in predictions]
72
- # # merge rss data with predictions
73
- # entries_predicitons = [{**entry, 'sentiment': prediction}
74
- # for entry, prediction in zip(feed_entries['entries'], predictions)]
75
- # output = {'entries': entries_predicitons,
76
- # 'last_update': feed_entries['last_update']}
77
- # # update last precitions cache
78
- # with open(f'{file_name}_cache.json', 'w') as file:
79
- # json.dump(output, file)
80
- # # send back json
81
- # return jsonify(output)
82
- # else:
83
- # print("Returning cached predictions")
84
- # return jsonify(cache)
85
-
86
-
87
- # @ app.route('/predict', methods=['POST'])
88
- # def predict():
89
- # # get data from POST
90
- # if request.method == 'POST':
91
- # # get current news
92
- # # get post body data
93
- # data = request.get_json()
94
- # if data.get('sentences') is None:
95
- # return jsonify({'error': 'No text provided'})
96
- # # get post expeceted to be under {'sentences': ['text': '...']}
97
- # sentences = data.get('sentences')
98
- # # prencit sentiments
99
- # predictions = [sentiment_analysis(sentence) for sentence in sentences]
100
- # # parse Negative and Positive, normalize to -1 to 1
101
- # predictions = [-prediction[0]['score'] if prediction[0]['label'] ==
102
- # 'NEGATIVE' else prediction[0]['score'] for prediction in predictions]
103
- # output = [dict(sentence=sentence, sentiment=prediction)
104
- # for sentence, prediction in zip(sentences, predictions)]
105
- # # send back json
106
- # return jsonify(output)
107
 
108
 
109
- # def get_feed(feed_url):
110
- # feed = feedparser.parse(feed_url)
111
- # return {'entries': feed['entries'], 'last_update': feed["feed"]['updated']}
112
  if __name__ == '__main__':
113
  app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))
 
1
  from umap_reducer import UMAPReducer
2
+ from embeddings_encoder import EmbeddingsEncoder
3
  from flask import Flask, request, render_template, jsonify, make_response
4
  from flask_cors import CORS
5
  import os
6
  from dotenv import load_dotenv
 
7
  import feedparser
8
  import json
9
  from dateutil import parser
 
13
 
14
  load_dotenv()
15
 
16
+
 
 
17
  app = Flask(__name__, static_url_path='/static')
18
  reducer = UMAPReducer()
19
+ encoder = EmbeddingsEncoder()
20
  CORS(app)
21
 
22
 
 
25
  return render_template('index.html')
26
 
27
 
28
@app.route('/run-umap', methods=['POST'])
def run_umap():
    """Embed sentences, project them with UMAP, and cluster the projection.

    Expects a JSON request body shaped like::

        {"data": {"sentences": [...],
                  "umap_options": {...},
                  "cluster_options": {...}}}

    ``umap_options`` and ``cluster_options`` are optional; when omitted an
    empty dict is passed to ``reducer.setParams``.

    Returns:
        On success, a gzip-compressed JSON document with the keys
        ``"embeddings"`` and ``"clusters"``. On failure, a JSON object
        ``{"error": message}`` with status 400 (malformed request) or
        500 (encoding / reduction / clustering failed).
    """
    # silent=True yields None instead of raising on a missing or invalid
    # JSON body, so every malformed request takes the same 400 path below.
    input_data = request.get_json(silent=True)
    data = input_data.get('data') if isinstance(input_data, dict) else None
    if not isinstance(data, dict) or not data.get('sentences'):
        return jsonify(
            {"error": "Expected JSON body with a non-empty data.sentences list"}
        ), 400

    sentences = data['sentences']
    umap_options = data.get('umap_options') or {}
    cluster_options = data.get('cluster_options') or {}

    print("input options:", umap_options, cluster_options)
    try:
        embeddings = encoder.encode(sentences)
        # UMAP embeddings
        # NOTE: `reducer` is a module-level object, so options set here
        # stay in effect for later requests as well.
        reducer.setParams(umap_options, cluster_options)
        umap_embeddings = reducer.embed(embeddings)
        # HDBScan cluster analysis
        clusters = reducer.clusterAnalysis(umap_embeddings)
        payload = {
            "embeddings": umap_embeddings.tolist(),
            "clusters": clusters.labels_.tolist(),
        }
        content = gzip.compress(json.dumps(payload).encode('utf8'), 5)
        response = make_response(content)
        response.headers['Content-length'] = len(content)
        response.headers['Content-Encoding'] = 'gzip'
        return response
    except Exception as e:
        # Top-level request boundary: report the failure to the client with
        # a server-error status instead of 201 (Created).
        return jsonify({"error": str(e)}), 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
 
 
 
 
57
  if __name__ == '__main__':
58
  app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))
embeddings_encoder.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+
7
class EmbeddingsEncoder:
    """Sentence encoder built on a Hugging Face transformer checkpoint.

    Texts are tokenized, run through the model, mean-pooled over tokens
    (weighted by the attention mask) and L2-normalized.
    """

    # Checkpoint used when the caller does not ask for a specific one.
    DEFAULT_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

    def __init__(self, model_name=DEFAULT_MODEL):
        """Load the tokenizer and model from the HuggingFace Hub.

        Args:
            model_name: Hub identifier passed to both ``from_pretrained``
                calls, so tokenizer and model always match.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only unmasked positions.

        Args:
            model_output: Object exposing ``last_hidden_state`` (the token
                embeddings).
            attention_mask: Mask tensor; it is expanded to the shape of the
                token embeddings and used as per-token weights.

        Returns:
            The mask-weighted sum over dimension 1 divided by the mask total.
        """
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked row divides by a tiny number, not zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def encode(self, texts):
        """Encode ``texts`` into L2-normalized embeddings.

        Args:
            texts: Input passed directly to the tokenizer (with padding and
                truncation enabled).

        Returns:
            A torch tensor of pooled embeddings, normalized along dim 1.
        """
        # Tokenize sentences
        print("Tokenizing...")
        encoded_input = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings
        print("Computing embeddings...")
        with torch.no_grad():
            model_output = self.model(**encoded_input, return_dict=True)

        # Perform pooling
        print("Performing pooling...")
        embeddings = self.mean_pooling(
            model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        print("Normalizing embeddings...")
        embeddings = F.normalize(embeddings, p=2, dim=1)

        return embeddings
umap_reducer.py CHANGED
@@ -2,36 +2,26 @@ import umap
2
  import hdbscan
3
  import copy
4
 
 
5
  class UMAPReducer:
6
- def __init__(self, options={}):
7
 
8
  # set options with defaults
9
- options = {'n_components': 3, 'spread': 1, 'min_dist': 0.1, 'n_neighbors': 15,
10
- 'metric': 'hellinger', 'min_cluster_size': 60, 'min_samples': 15, **options}
11
-
12
- print(options)
13
- self.reducer = umap.UMAP(
14
- n_neighbors=options['n_neighbors'],
15
- min_dist=options['min_dist'],
16
- n_components=options['n_components'],
17
- metric=options['metric'],
18
- verbose=True)
19
- # cluster init
20
- self.clusterer = hdbscan.HDBSCAN(
21
- min_cluster_size=options['min_cluster_size'],
22
- min_samples=options['min_samples'],
23
- allow_single_cluster=True
24
- )
25
- self.cluster_params = copy.deepcopy(options)
26
 
27
- def setParams(self, options):
28
  # update params
29
- self.cluster_params = {**self.cluster_params, **options}
 
30
 
31
  def clusterAnalysis(self, data):
32
- clusters = self.clusterer.fit(data)
 
33
  return clusters
34
 
35
  def embed(self, data):
36
- result = self.reducer.fit_transform(data)
 
37
  return result
 
2
  import hdbscan
3
  import copy
4
 
5
+
6
class UMAPReducer:
    """Holds UMAP and HDBSCAN option dicts and runs both steps on demand."""

    def __init__(self, umap_options=None, cluster_options=None):
        """Store the option dicts, layering caller overrides on the defaults.

        Args:
            umap_options: Optional overrides for the UMAP keyword arguments.
            cluster_options: Optional overrides for the clustering options.
        """
        # `None` defaults (rather than `{}`) avoid sharing one mutable dict
        # across every call.
        # set options with defaults
        self.umap_options = {
            'n_components': 2,
            'spread': 1,
            'min_dist': 0.1,
            'n_neighbors': 15,
            'metric': 'cosine',
            "verbose": True,
            **(umap_options or {}),
        }
        self.cluster_options = {
            'allow_single_cluster': True,
            'min_cluster_size': 500,
            'min_samples': 10,
            **(cluster_options or {}),
        }

    def setParams(self, umap_options=None, cluster_options=None):
        """Merge new option values into the stored dicts.

        Keys that are not supplied keep their current values; passing
        ``None`` (or nothing) leaves the corresponding dict unchanged.
        """
        # update params
        self.umap_options = {**self.umap_options, **(umap_options or {})}
        self.cluster_options = {
            **self.cluster_options, **(cluster_options or {})}

    def clusterAnalysis(self, data):
        """Fit HDBSCAN on ``data`` and return the fitted clusterer."""
        print("Cluster params:", self.cluster_options)
        # NOTE(review): the stored cluster options are printed but NOT passed
        # to HDBSCAN; the clusterer runs with library defaults. This appears
        # to be deliberate (kwargs left commented out) - confirm before
        # enabling.
        clusters = hdbscan.HDBSCAN().fit(data)  # **self.cluster_options
        return clusters

    def embed(self, data):
        """Fit a fresh UMAP model with the stored options and transform ``data``."""
        print("UMAP params:", self.umap_options)
        result = umap.UMAP(**self.umap_options).fit_transform(data)
        return result