Spaces:

Kazel
/

collarvision

Sleeping

App Files Files Community

Kazel committed on Mar 31, 2025

Commit

5fd9575

verified ·

1 Parent(s): 5faca27

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -70

app.py CHANGED Viewed

@@ -1,70 +1,57 @@
-import gradio as gr
-import cv2
-import ollama
-import threading
-# Initialize the webcam
-cap = cv2.VideoCapture(0)
-def query_the_image(query: str, image_list: list[str]):
-    try:
-        res = ollama.chat(
-            model='llava',
-            options={
-                'temperature': 0,
-                "top_k": 1,
-                'top_p': 0.1,
-                'mirostat_tau': 1.0,
-                'num_ctx': 1024,
-                'seed': 42,
-                'num_predict': 128
-            },
-            messages=[
-                {
-                    'role': 'system',
-                    'content': "You are a home surveillance system. Answer with very short sentences."
-                },
-                {
-                    'role': 'user',
-                    'content': query,
-                    'images': image_list,
-                }
-            ]
-        )
-        return res['message']['content']
-    except Exception as e:
-        return f"Error: {e}"
-def get_frame():
-    ret, frame = cap.read()
-    if not ret:
-        return None
-    _, buffer = cv2.imencode('.jpg', frame)
-    return buffer.tobytes()
-def process_image(prompt):
-    frame_data = get_frame()
-    if frame_data:
-        return query_the_image(prompt, [frame_data])
-    return "Error capturing image"
-def video_feed():
-    while True:
-        ret, frame = cap.read()
-        if ret:
-            yield cv2.imencode('.jpg', frame)[1].tobytes()
-        else:
-            break
-gui = gr.Blocks()
-with gui:
-    gr.Markdown("# Live Video AI Assistant")
-    with gr.Row():
-        video_component = gr.Video()
-        threading.Thread(target=video_feed, daemon=True).start()
-    prompt = gr.Textbox(label="Enter your question")
-    response = gr.Textbox(label="AI Response")
-    btn = gr.Button("Ask")
-    btn.click(process_image, inputs=prompt, outputs=response)
-gui.launch()

+import io
+import threading
+import cv2
+import gradio as gr
+import torch
+from PIL import Image
+from transformers import BlipForConditionalGeneration, BlipForQuestionAnswering, BlipProcessor
+# Initialize the webcam
+cap = cv2.VideoCapture(0)
+# Load the Hugging Face model and processor
+processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-vqa-base").to("cuda" if torch.cuda.is_available() else "cpu")
+def query_the_image(query: str, image_data: bytes):
+    try:
+        image = Image.open(io.BytesIO(image_data)).convert("RGB")
+        inputs = processor(image, query, return_tensors="pt").to(model.device)
+        output = model.generate(**inputs)
+        answer = processor.decode(output[0], skip_special_tokens=True)
+        return answer
+    except Exception as e:
+        return f"Error: {e}"
+def get_frame():
+    ret, frame = cap.read()
+    if not ret:
+        return None
+    _, buffer = cv2.imencode('.jpg', frame)
+    return buffer.tobytes()
+def process_image(prompt):
+    frame_data = get_frame()
+    if frame_data:
+        return query_the_image(prompt, frame_data)
+    return "Error capturing image"
+def video_feed():
+    while True:
+        ret, frame = cap.read()
+        if ret:
+            yield cv2.imencode('.jpg', frame)[1].tobytes()
+        else:
+            break
+# Build the UI: a title, a video component, a question box, an answer box and
+# a button that triggers one question per click.
+gui = gr.Blocks()
+with gui:
+    gr.Markdown("# Live Video AI Assistant")
+    with gr.Row():
+        video_component = gr.Video()
+        # NOTE(review): video_feed contains `yield`, so using it as a thread
+        # target only creates a generator object that nothing iterates, and
+        # video_component is not referenced again in the visible code. Looks
+        # like the live preview is not actually wired up - confirm.
+        threading.Thread(target=video_feed, daemon=True).start()
+    prompt = gr.Textbox(label="Enter your question")
+    response = gr.Textbox(label="AI Response")
+    btn = gr.Button("Ask")
+    # Each click passes the question text to process_image and routes its
+    # return value to the response box.
+    btn.click(process_image, inputs=prompt, outputs=response)
+# Start the Gradio app with default launch settings.
+gui.launch()