Moshe Ofer committed
Commit b8e37ed · Parent(s): 3ef4ce6

Initial commit for Hugging Face Space

Files changed (2):
  1. Dockerfile +5 -0
  2. app.py +45 -48
Dockerfile CHANGED

@@ -9,8 +9,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 RUN mkdir -p /app/cache && chmod -R 777 /app/cache
 ENV HF_HOME=/app/cache
+
 ENV PYTHONUNBUFFERED=1
 ENV EVENTLET_NO_GREENDNS=yes
+ENV EVENTLET_THREADPOOL_SIZE=32
+ENV EVENTLET_WEBSOCKET_MONITOR_TIMEOUT=60
 
 COPY . /app
 
@@ -25,5 +28,7 @@ CMD ["gunicorn", \
     "--timeout", "300", \
     "--keep-alive", "120", \
     "--log-level", "debug", \
+    "--worker-connections", "1000", \
+    "--backlog", "2048", \
     "--bind", "0.0.0.0:7860", \
     "app:app"]
app.py CHANGED

@@ -45,7 +45,7 @@ class WebSocketBeamStreamer(MultiBeamTextStreamer):
         """Send beam updates through websocket with delay"""
         self.beam_texts[beam_idx] = new_text
         if self.sleep_time > 0:
-            time.sleep(self.sleep_time / 1000)  # Convert milliseconds to seconds
+            eventlet.sleep(self.sleep_time / 1000)  # Convert milliseconds to seconds
         socketio.emit('beam_update', {
             'beam_idx': beam_idx,
             'text': new_text
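
The time.sleep to eventlet.sleep switch matters because time.sleep blocks the whole eventlet worker thread, stalling every connection, while eventlet.sleep suspends only the current greenlet. A standalone sketch of the difference (not part of the app):

import eventlet

def worker(name):
    for i in range(3):
        print(name, i)
        eventlet.sleep(0.01)  # yields to the hub; other greenlets run

# The two greenlets interleave their output because eventlet.sleep
# cooperatively yields; time.sleep here would run them back to back
# and freeze everything else in the process while it waits.
a = eventlet.spawn(worker, 'a')
b = eventlet.spawn(worker, 'b')
a.wait()
b.wait()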
 
@@ -65,58 +65,55 @@ def index():
 
 @socketio.on('generate')
 def handle_generation(data):
-    try:
-        app.logger.info("Generation started with data: %s", data)
-        socketio.emit('generation_started')
-
-        prompt = data['prompt']
-        num_beams = data.get('num_beams', 5)
-        max_new_tokens = data.get('max_tokens', 512)
-        sleep_time = data.get('sleep_time', 0)
-
-        app.logger.info("Processing with parameters: beams=%d, max_tokens=%d",
-                        num_beams, max_new_tokens)
-
-        messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
-        ]
-
-        text = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-        streamer = WebSocketBeamStreamer(
-            tokenizer=tokenizer,
-            num_beams=num_beams,
-            sleep_time=sleep_time,
-            skip_prompt=True
-        )
-
-        with torch.no_grad():
-            model.generate(
-                **model_inputs,
-                num_beams=num_beams,
-                num_return_sequences=num_beams,
-                max_new_tokens=max_new_tokens,
-                output_scores=True,
-                return_dict_in_generate=True,
-                early_stopping=True,
-                streamer=streamer
-            )
-
-        app.logger.info("Generation completed successfully")
-
-    except Exception as e:
-        app.logger.error("Generation error: %s", str(e), exc_info=True)
-        socketio.emit('generation_error', {'error': str(e)})
-    finally:
-        socketio.emit('generation_completed')
-
+    def generate_async():
+        try:
+            app.logger.info("Generation started with data: %s", data)
+            socketio.emit('generation_started', callback=lambda: eventlet.sleep(0))
+
+            prompt = data['prompt']
+            num_beams = data.get('num_beams', 5)
+            max_new_tokens = data.get('max_tokens', 512)
+            sleep_time = data.get('sleep_time', 0)
+
+            messages = [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": prompt}
+            ]
+
+            text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+
+            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+            streamer = WebSocketBeamStreamer(
+                tokenizer=tokenizer,
+                num_beams=num_beams,
+                sleep_time=sleep_time,
+                skip_prompt=True
+            )
+
+            with torch.no_grad():
+                model.generate(
+                    **model_inputs,
+                    num_beams=num_beams,
+                    num_return_sequences=num_beams,
+                    max_new_tokens=max_new_tokens,
+                    output_scores=True,
+                    return_dict_in_generate=True,
+                    early_stopping=True,
+                    streamer=streamer
+                )
+
+        except Exception as e:
+            app.logger.error("Generation error: %s", str(e), exc_info=True)
+            socketio.emit('generation_error', {'error': str(e)})
+        finally:
+            socketio.emit('generation_completed')
+
+    eventlet.spawn(generate_async)
 
 if __name__ == '__main__':
     socketio.run(
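
The handler now spawns the generation loop as a greenlet and returns immediately, so the Socket.IO event loop is free to flush beam_update emits while generation runs; the eventlet.sleep calls inside the streamer provide the yield points. A stripped-down sketch of the same spawn-and-return pattern, assuming Flask-SocketIO in eventlet async mode (the work/progress/done event names are placeholders):

import eventlet
from flask import Flask
from flask_socketio import SocketIO

app = Flask(__name__)
socketio = SocketIO(app, async_mode='eventlet')

@socketio.on('work')
def handle_work(data):
    def run():
        for i in range(5):
            socketio.emit('progress', {'step': i})
            eventlet.sleep(0.1)  # yield so each emit is actually flushed
        socketio.emit('done')
    eventlet.spawn(run)  # handler returns right away

Flask-SocketIO's socketio.start_background_task would be the portable equivalent if the async mode ever changes.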