intuitive262 committed on
Commit
d529377
·
1 Parent(s): 1fb71cb

Updated code files

Browse files
Files changed (1) hide show
  1. app.py +4 -4
app.py CHANGED
@@ -9,7 +9,7 @@ import re
9
  rag = RAGMultiModalModel.from_pretrained("vidore/colpali")
10
  vlm = Qwen2VLForConditionalGeneration.from_pretrained(
11
  "Qwen/Qwen2-VL-2B-Instruct",
12
- torch_dtype=torch.float16,
13
  trust_remote_code=True,
14
  device_map="auto",
15
  )
@@ -32,9 +32,9 @@ def extract_text(image, query):
32
  image_inputs, video_inputs = process_vision_info(messages)
33
  inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
34
  inputs = inputs.to("cpu")
35
-
36
- generated_ids = vlm.generate(**inputs, max_new_tokens=200, temperature=0.7, top_p=0.9)
37
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
40
  def search_text(text, query):
 
9
  rag = RAGMultiModalModel.from_pretrained("vidore/colpali")
10
  vlm = Qwen2VLForConditionalGeneration.from_pretrained(
11
  "Qwen/Qwen2-VL-2B-Instruct",
12
+ torch_dtype=torch.float32,
13
  trust_remote_code=True,
14
  device_map="auto",
15
  )
 
32
  image_inputs, video_inputs = process_vision_info(messages)
33
  inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
34
  inputs = inputs.to("cpu")
35
+ with torch.no_grad():
36
+ generated_ids = vlm.generate(**inputs, max_new_tokens=200, temperature=0.7, top_p=0.9)
37
+ generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
40
  def search_text(text, query):