 """
 Wrapper for OWLv2 text-conditioned object detection models from HuggingFace Transformers.
 """
-from typing import Union, List
+from typing import Union, List, Dict, Any
 from transformers import Owlv2Processor, Owlv2ForObjectDetection
 import torch
 
-class OWLV2Wrapper:  # pylint: disable=too-few-public-methods
-    """Wrapper for OWLv2 text-conditioned object detection."""
-    def __init__(self, model_name="google/owlv2-base-patch16-ensemble", device="cpu"):
+# This is a deliberately focused wrapper with a single public method, so
+# Pylint's R0903 (too-few-public-methods) is expected and acceptable here.
+# pylint: disable=R0903
+
+class OWLV2Wrapper:
+    """
+    Wrapper for OWLv2 text-conditioned object detection.
+
+    This class handles the loading of OWLv2 models and processors,
+    and provides a method to detect objects based on text prompts.
+    It formats the output to include both the full label matched by OWLv2
+    and the original "core" prompt term provided by the user.
+    """
+    def __init__(self, model_name: str = "google/owlv2-base-patch16-ensemble",
+                 device: str = "cpu"):
+        """
+        Initialize the OWLV2Wrapper.
+
+        Args:
+            model_name (str): The Hugging Face model identifier for OWLv2.
+            device (str): The device to run the model on (e.g., "cpu", "cuda").
+        """
         self.device = device
         self.processor: Owlv2Processor = Owlv2Processor.from_pretrained(model_name)
-        self.model = Owlv2ForObjectDetection.from_pretrained(model_name).to(device)
+        self.model: Owlv2ForObjectDetection = Owlv2ForObjectDetection.from_pretrained(
+            model_name
+        ).to(self.device)
 
-    def detect(self, *, image, prompt: Union[str, List[str]], threshold=0.1):
+    def detect(self, *, image: Any, prompt: Union[str, List[str]],
+               threshold: float = 0.1) -> List[Dict[str, Any]]:
         """
-        Detect objects in the image matching the text prompt.
-        Returns a list of dict with keys: box, score, label.
+        Detect objects in the image matching the text prompt(s).
+
+        Args:
+            image (Any): The input image (e.g., a PIL Image).
+            prompt (Union[str, List[str]]): A single text prompt or a list of text prompts.
+            threshold (float): The confidence threshold for detections.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries, where each dictionary
+            represents a detected object and contains 'box', 'score', 'label'
+            (the full text matched by OWLv2), and 'core_prompt' (the original
+            user-provided term that led to this detection).
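+
+            Example element (illustrative values):
+                {"box": [x_min, y_min, x_max, y_max], "score": 0.31,
+                 "label": "a photo of cat", "core_prompt": "cat"}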
19 | 51 | """ |
20 | 52 | if isinstance(prompt, str): |
21 | | - processed_prompts = [f"a photo of {prompt}"] |
22 | | - else: # prompt is a list of strings |
23 | | - processed_prompts = [f"a photo of {p}" for p in prompt] |
| 53 | + original_prompt_terms: List[str] = [prompt] |
| 54 | + else: |
| 55 | + original_prompt_terms: List[str] = prompt |
24 | 56 |
-        text_labels = [processed_prompts]  # Batch size of 1, with potentially multiple queries
+        # OWLv2 typically expects prompts phrased like "a photo of <object>".
+        processed_prompts_for_owl: List[str] = [
+            f"a photo of {p}" for p in original_prompt_terms
+        ]
+        # The 'text' argument to the processor for multiple queries on a single
+        # image should be List[List[str]]; the outer list is the batch dimension.
+        text_labels_for_owl: List[List[str]] = [processed_prompts_for_owl]
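+        # e.g. prompt=["cat", "dog"] becomes [["a photo of cat", "a photo of dog"]]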
 
         inputs = self.processor(
-            text=text_labels, images=image, return_tensors="pt"
+            text=text_labels_for_owl, images=image, return_tensors="pt"
         ).to(self.device)
 
         with torch.no_grad():
             outputs = self.model(**inputs)
 
-        target_sizes = torch.tensor([(image.height, image.width)]).to(self.device)
+        # target_sizes must have shape (batch_size, 2), ordered (height, width).
+        target_sizes = torch.tensor([image.size[::-1]], device=self.device)
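+        # e.g. a 1024x768 (width x height) PIL image gives tensor([[768, 1024]])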
+
+        # Pass text_labels_for_owl to the post-processor for correct label association.
         results = self.processor.post_process_grounded_object_detection(
-            outputs=outputs, target_sizes=target_sizes,
-            threshold=threshold, text_labels=text_labels
+            outputs=outputs,
+            target_sizes=target_sizes,
+            threshold=threshold,
+            text_labels=text_labels_for_owl
         )
 
-        # Determine the list of original prompt terms for fallback in _format_detections
-        if isinstance(prompt, str):
-            fallback_labels = [prompt]
-        else:
-            fallback_labels = prompt
-
-        return self._format_detections(results, fallback_labels)
+        return self._format_detections(results, original_prompt_terms)
 
-    def _format_detections(self, results, fallback_prompts: List[str]):
+    def _format_detections(self, results: List[Dict[str, Any]],
+                           original_prompt_terms: List[str]) -> List[Dict[str, Any]]:
         """
-        Helper to format detection results into a list of dicts.
-        fallback_prompts: The list of original prompt terms used for searching.
+        Helper to format raw detection results into a structured list of dictionaries.
+
+        Args:
+            results (List[Dict[str, Any]]): Raw results from the OWLv2 processor's
+                post_process_grounded_object_detection method.
+            original_prompt_terms (List[str]): The list of original, user-provided
+                prompt terms (e.g., ["cat", "dog"]).
+
+        Returns:
+            List[Dict[str, Any]]: Formatted list of detections.
         """
-        detections = []
-        if results and results[0]:
-            result = results[0]
-            boxes = result["boxes"].cpu().numpy()
-            scores = result["scores"].cpu().numpy()
-            # 'text_labels' in result should be populated by post_process_object_detection
-            # with the specific query that matched each box (e.g., "a photo of cat").
-            # The processor.post_process_grounded_object_detection
-            # returns the text_labels as they were passed in.
-            returned_labels = result.get("text_labels", fallback_prompts * len(boxes))  # Fallback
-            for box, score, label_text in zip(boxes, scores, returned_labels):
-                detections.append({
-                    "box": [float(coord) for coord in box],
-                    "score": float(score),
-                    "label": label_text
-                })
+        detections: List[Dict[str, Any]] = []
+        if not results or not results[0]:
+            return detections
+
+        # results is a batch-indexed list; this wrapper processes one image at a time.
+        first_image_results = results[0]
+        boxes = first_image_results["boxes"].cpu().numpy()
+        scores = first_image_results["scores"].cpu().numpy()
+
+        # 'labels' holds integer indices into the list of queries for the current
+        # image, i.e. into text_labels_for_owl[0] as passed to
+        # post_process_grounded_object_detection.
+        prompt_indices = first_image_results.get(
+            "labels", torch.zeros(len(boxes), dtype=torch.long)
+        ).cpu().numpy()
+
+        # 'text_labels' holds the actual prompt strings that matched each box.
+        owl_matched_full_labels = first_image_results.get("text_labels", [])
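+        # e.g. ["a photo of cat", "a photo of dog"], one entry per detected box
+        # (illustrative; alignment with boxes/scores comes from the post-processor)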
+
+        for i, (current_box, current_score) in enumerate(zip(boxes, scores)):
+            try:
+                core_prompt = original_prompt_terms[prompt_indices[i]]
+            except IndexError:
+                core_prompt = "unknown_prompt_term"
+                print(
+                    f"Warning: prompt index {prompt_indices[i]} is out of bounds "
+                    f"for {len(original_prompt_terms)} prompt term(s)"
+                )
+
+            full_owl_label = (
+                owl_matched_full_labels[i]
+                if i < len(owl_matched_full_labels)
+                else f"a photo of {core_prompt}"
+            )
+
+            detections.append({
+                "box": [float(coord) for coord in current_box],
+                "score": float(current_score),
+                "label": full_owl_label,
+                "core_prompt": core_prompt,
+            })
         return detections
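
A minimal usage sketch of the new interface (assumptions: OWLV2Wrapper is
imported from this module, and "example.jpg" is a hypothetical local image):

    from PIL import Image

    wrapper = OWLV2Wrapper(device="cpu")  # downloads weights on first use
    image = Image.open("example.jpg").convert("RGB")
    detections = wrapper.detect(image=image, prompt=["cat", "dog"], threshold=0.2)
    for det in detections:
        # det["label"] is the full matched query; det["core_prompt"] the original term
        print(det["core_prompt"], round(det["score"], 3), det["box"])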