Spaces:

ahmed-masry
/

ColFlor-Demo

Running on Zero

App Files Files Community

ahmed-masry commited on 15 days ago

Commit

9f9c2cc

•

1 Parent(s): e8ad0b4

Create processing_utils.py

Browse files

Files changed (1) hide show

processing_utils.py +121 -0

processing_utils.py ADDED Viewed

	@@ -0,0 +1,121 @@

+from abc import ABC, abstractmethod
+from typing import List, Optional, Union
+import torch
+from PIL import Image
+from transformers import BatchEncoding, BatchFeature
+def get_torch_device(device: str = "auto") -> str:
+ """
+ Returns the device (string) to be used by PyTorch.
+ `device` arg defaults to "auto" which will use:
+ - "cuda:0" if available
+ - else "mps" if available
+ - else "cpu".
+ """
+ if device == "auto":
+ if torch.cuda.is_available():
+ device = "cuda:0"
+ elif torch.backends.mps.is_available(): # for Apple Silicon
+ device = "mps"
+ else:
+ device = "cpu"
+ logger.info(f"Using device: {device}")
+ return device
+class BaseVisualRetrieverProcessor(ABC):
+ """
+ Base class for visual retriever processors.
+ """
+ @abstractmethod
+ def process_images(
+ self,
+ images: List[Image.Image],
+ ) -> Union[BatchFeature, BatchEncoding]:
+ pass
+ @abstractmethod
+ def process_queries(
+ self,
+ queries: List[str],
+ max_length: int = 50,
+ suffix: Optional[str] = None,
+ ) -> Union[BatchFeature, BatchEncoding]:
+ pass
+ @abstractmethod
+ def score(
+ self,
+ qs: List[torch.Tensor],
+ ps: List[torch.Tensor],
+ device: Optional[Union[str, torch.device]] = None,
+ **kwargs,
+ ) -> torch.Tensor:
+ pass
+ @staticmethod
+ def score_single_vector(
+ qs: List[torch.Tensor],
+ ps: List[torch.Tensor],
+ device: Optional[Union[str, torch.device]] = None,
+ ) -> torch.Tensor:
+ """
+ Compute the dot product score for the given single-vector query and passage embeddings.
+ """
+ device = device or get_torch_device("auto")
+ if len(qs) == 0:
+ raise ValueError("No queries provided")
+ if len(ps) == 0:
+ raise ValueError("No passages provided")
+ qs_stacked = torch.stack(qs).to(device)
+ ps_stacked = torch.stack(ps).to(device)
+ scores = torch.einsum("bd,cd->bc", qs_stacked, ps_stacked)
+ assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
+ scores = scores.to(torch.float32)
+ return scores
+ @staticmethod
+ def score_multi_vector(
+ qs: List[torch.Tensor],
+ ps: List[torch.Tensor],
+ batch_size: int = 128,
+ device: Optional[Union[str, torch.device]] = None,
+ ) -> torch.Tensor:
+ """
+ Compute the MaxSim score (ColBERT-like) for the given multi-vector query and passage embeddings.
+ """
+ device = device or get_torch_device("auto")
+ if len(qs) == 0:
+ raise ValueError("No queries provided")
+ if len(ps) == 0:
+ raise ValueError("No passages provided")
+ scores_list: List[torch.Tensor] = []
+ for i in range(0, len(qs), batch_size):
+ scores_batch = []
+ qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(
+ device
+ )
+ for j in range(0, len(ps), batch_size):
+ ps_batch = torch.nn.utils.rnn.pad_sequence(
+ ps[j : j + batch_size], batch_first=True, padding_value=0
+ ).to(device)
+ scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
+ scores_batch = torch.cat(scores_batch, dim=1).cpu()
+ scores_list.append(scores_batch)
+ scores = torch.cat(scores_list, dim=0)
+ assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
+ scores = scores.to(torch.float32)
+ return scores