diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
index 9d70c21aa8cd..a29fd436149d 100644
--- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
+++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
@@ -15,7 +15,6 @@
 
 import random
 import tempfile
-import unittest
 
 import numpy as np
 import PIL
@@ -34,21 +33,16 @@
 from ..test_modular_pipelines_common import ModularPipelineTesterMixin
 
 
-class FluxModularTests:
+class TestFluxModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = FluxModularPipeline
     pipeline_blocks_class = FluxAutoBlocks
     repo = "hf-internal-testing/tiny-flux-modular"
 
-    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
-        pipeline = self.pipeline_blocks_class().init_pipeline(self.repo, components_manager=components_manager)
-        pipeline.load_components(torch_dtype=torch_dtype)
-        return pipeline
+    params = frozenset(["prompt", "height", "width", "guidance_scale"])
+    batch_params = frozenset(["prompt"])
 
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
         inputs = {
             "prompt": "A painting of a squirrel eating a burger",
             "generator": generator,
@@ -57,36 +51,47 @@ def get_dummy_inputs(self, device, seed=0):
             "height": 8,
             "width": 8,
             "max_sequence_length": 48,
-            "output_type": "np",
+            "output_type": "pt",
         }
         return inputs
 
 
-class FluxModularPipelineFastTests(FluxModularTests, ModularPipelineTesterMixin, unittest.TestCase):
-    params = frozenset(["prompt", "height", "width", "guidance_scale"])
-    batch_params = frozenset(["prompt"])
-
+class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
+    pipeline_class = FluxModularPipeline
+    pipeline_blocks_class = FluxAutoBlocks
+    repo = "hf-internal-testing/tiny-flux-modular"
 
-class FluxImg2ImgModularPipelineFastTests(FluxModularTests, ModularPipelineTesterMixin, unittest.TestCase):
     params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
     batch_params = frozenset(["prompt", "image"])
 
     def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
         pipeline = super().get_pipeline(components_manager, torch_dtype)
+
         # Override `vae_scale_factor` here as currently, `image_processor` is initialized with
         # fixed constants instead of
         # https://github.com/huggingface/diffusers/blob/d54622c2679d700b425ad61abce9b80fc36212c0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L230C9-L232C10
         pipeline.image_processor = VaeImageProcessor(vae_scale_factor=2)
         return pipeline
 
-    def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
-        image = image / 2 + 0.5
-        inputs["image"] = image
-        inputs["strength"] = 0.8
-        inputs["height"] = 8
-        inputs["width"] = 8
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 4,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 48,
+            "output_type": "pt",
+        }
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(torch_device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = PIL.Image.fromarray(np.uint8(image)).convert("RGB")
+
+        inputs["image"] = init_image
+        inputs["strength"] = 0.5
+
         return inputs
 
     def test_save_from_pretrained(self):
@@ -96,6 +101,7 @@ def test_save_from_pretrained(self):
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             base_pipe.save_pretrained(tmpdirname)
+
             pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
             pipe.load_components(torch_dtype=torch.float32)
             pipe.to(torch_device)
@@ -105,26 +111,62 @@ def test_save_from_pretrained(self):
 
         image_slices = []
         for pipe in pipes:
-            inputs = self.get_dummy_inputs(torch_device)
+            inputs = self.get_dummy_inputs()
             image = pipe(**inputs, output="images")
 
             image_slices.append(image[0, -3:, -3:, -1].flatten())
 
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
 
 
-class FluxKontextModularPipelineFastTests(FluxImg2ImgModularPipelineFastTests):
+class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = FluxKontextModularPipeline
     pipeline_blocks_class = FluxKontextAutoBlocks
     repo = "hf-internal-testing/tiny-flux-kontext-pipe"
 
-    def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
+    batch_params = frozenset(["prompt", "image"])
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 48,
+            "output_type": "pt",
+        }
         image = PIL.Image.new("RGB", (32, 32), 0)
-        _ = inputs.pop("strength")
+
         inputs["image"] = image
-        inputs["height"] = 8
-        inputs["width"] = 8
-        inputs["max_area"] = 8 * 8
+        inputs["max_area"] = inputs["height"] * inputs["width"]
         inputs["_auto_resize"] = False
+
         return inputs
+
+    def test_save_from_pretrained(self):
+        pipes = []
+        base_pipe = self.get_pipeline().to(torch_device)
+        pipes.append(base_pipe)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            base_pipe.save_pretrained(tmpdirname)
+
+            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
+            pipe.load_components(torch_dtype=torch.float32)
+            pipe.to(torch_device)
+            pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)
+
+        pipes.append(pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs()
+            image = pipe(**inputs, output="images")
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
index 22347aa5589c..ea54b2bdff47 100644
--- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
+++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import random
-import unittest
 from typing import Any, Dict
 
 import numpy as np
@@ -32,63 +31,26 @@
 enable_full_determinism()
 
 
-class SDXLModularTests:
+class SDXLModularTesterMixin:
     """
     This mixin defines method to create pipeline, base input and base test across all SDXL modular tests.
     """
 
-    pipeline_class = StableDiffusionXLModularPipeline
-    pipeline_blocks_class = StableDiffusionXLAutoBlocks
-    repo = "hf-internal-testing/tiny-sdxl-modular"
-    params = frozenset(
-        [
-            "prompt",
-            "height",
-            "width",
-            "negative_prompt",
-            "cross_attention_kwargs",
-            "image",
-            "mask_image",
-        ]
-    )
-    batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
-
-    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
-        pipeline = self.pipeline_blocks_class().init_pipeline(self.repo, components_manager=components_manager)
-        pipeline.load_components(torch_dtype=torch_dtype)
-        return pipeline
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-
     def _test_stable_diffusion_xl_euler(self, expected_image_shape, expected_slice, expected_max_diff=1e-2):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         sd_pipe = self.get_pipeline()
-        sd_pipe = sd_pipe.to(device)
+        sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
 
-        inputs = self.get_dummy_inputs(device)
+        inputs = self.get_dummy_inputs()
         image = sd_pipe(**inputs, output="images")
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == expected_image_shape
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < expected_max_diff, (
-            "Image Slice does not match expected slice"
-        )
+        max_diff = torch.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < expected_max_diff, f"Image slice does not match expected slice. Max Difference: {max_diff}"
 
 
-class SDXLModularIPAdapterTests:
+class SDXLModularIPAdapterTesterMixin:
     """
     This mixin is designed to test IP Adapter.
     """
@@ -127,7 +89,7 @@ def _modify_inputs_for_ip_adapter_test(self, inputs: Dict[str, Any]):
         if "image" in parameters and "strength" in parameters:
             inputs["num_inference_steps"] = 4
 
-        inputs["output_type"] = "np"
+        inputs["output_type"] = "pt"
         return inputs
 
     def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=None):
@@ -152,7 +114,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         cross_attention_dim = pipe.unet.config.get("cross_attention_dim")
 
         # forward pass without ip adapter
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         if expected_pipe_slice is None:
             output_without_adapter = pipe(**inputs, output="images")
         else:
@@ -163,7 +125,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         pipe.unet._load_ip_adapter_weights(adapter_state_dict)
 
         # forward pass with single ip adapter, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         pipe.set_ip_adapter_scale(0.0)
@@ -172,7 +134,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
             output_without_adapter_scale = output_without_adapter_scale[0, -3:, -3:, -1].flatten()
 
         # forward pass with single ip adapter, but with scale of adapter weights
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         pipe.set_ip_adapter_scale(42.0)
@@ -180,8 +142,8 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         if expected_pipe_slice is not None:
             output_with_adapter_scale = output_with_adapter_scale[0, -3:, -3:, -1].flatten()
 
-        max_diff_without_adapter_scale = np.abs(output_without_adapter_scale - output_without_adapter).max()
-        max_diff_with_adapter_scale = np.abs(output_with_adapter_scale - output_without_adapter).max()
+        max_diff_without_adapter_scale = torch.abs(output_without_adapter_scale - output_without_adapter).max()
+        max_diff_with_adapter_scale = torch.abs(output_with_adapter_scale - output_without_adapter).max()
 
         assert max_diff_without_adapter_scale < expected_max_diff, (
             "Output without ip-adapter must be same as normal inference"
@@ -194,7 +156,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         pipe.unet._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
 
         # forward pass with multi ip adapter, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         pipe.set_ip_adapter_scale([0.0, 0.0])
@@ -203,7 +165,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
             output_without_multi_adapter_scale = output_without_multi_adapter_scale[0, -3:, -3:, -1].flatten()
 
         # forward pass with multi ip adapter, but with scale of adapter weights
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         pipe.set_ip_adapter_scale([42.0, 42.0])
@@ -211,10 +173,10 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         if expected_pipe_slice is not None:
             output_with_multi_adapter_scale = output_with_multi_adapter_scale[0, -3:, -3:, -1].flatten()
 
-        max_diff_without_multi_adapter_scale = np.abs(
+        max_diff_without_multi_adapter_scale = torch.abs(
             output_without_multi_adapter_scale - output_without_adapter
         ).max()
-        max_diff_with_multi_adapter_scale = np.abs(output_with_multi_adapter_scale - output_without_adapter).max()
+        max_diff_with_multi_adapter_scale = torch.abs(output_with_multi_adapter_scale - output_without_adapter).max()
         assert max_diff_without_multi_adapter_scale < expected_max_diff, (
             "Output without multi-ip-adapter must be same as normal inference"
         )
@@ -223,7 +185,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         )
 
 
-class SDXLModularControlNetTests:
+class SDXLModularControlNetTesterMixin:
     """
     This mixin is designed to test ControlNet.
     """
@@ -262,24 +224,26 @@ def test_controlnet(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         pipe.set_progress_bar_config(disable=None)
 
         # forward pass without controlnet
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         output_without_controlnet = pipe(**inputs, output="images")
         output_without_controlnet = output_without_controlnet[0, -3:, -3:, -1].flatten()
 
         # forward pass with single controlnet, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         inputs["controlnet_conditioning_scale"] = 0.0
         output_without_controlnet_scale = pipe(**inputs, output="images")
         output_without_controlnet_scale = output_without_controlnet_scale[0, -3:, -3:, -1].flatten()
 
         # forward pass with single controlnet, but with scale of adapter weights
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         inputs["controlnet_conditioning_scale"] = 42.0
         output_with_controlnet_scale = pipe(**inputs, output="images")
         output_with_controlnet_scale = output_with_controlnet_scale[0, -3:, -3:, -1].flatten()
 
-        max_diff_without_controlnet_scale = np.abs(output_without_controlnet_scale - output_without_controlnet).max()
-        max_diff_with_controlnet_scale = np.abs(output_with_controlnet_scale - output_without_controlnet).max()
+        max_diff_without_controlnet_scale = torch.abs(
+            output_without_controlnet_scale - output_without_controlnet
+        ).max()
+        max_diff_with_controlnet_scale = torch.abs(output_with_controlnet_scale - output_without_controlnet).max()
 
         assert max_diff_without_controlnet_scale < expected_max_diff, (
             "Output without controlnet must be same as normal inference"
@@ -295,21 +259,21 @@ def test_controlnet_cfg(self):
         guider = ClassifierFreeGuidance(guidance_scale=1.0)
         pipe.update_components(guider=guider)
 
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         out_no_cfg = pipe(**inputs, output="images")
 
         # forward pass with CFG applied
         guider = ClassifierFreeGuidance(guidance_scale=7.5)
         pipe.update_components(guider=guider)
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         out_cfg = pipe(**inputs, output="images")
 
         assert out_cfg.shape == out_no_cfg.shape
-        max_diff = np.abs(out_cfg - out_no_cfg).max()
+        max_diff = torch.abs(out_cfg - out_no_cfg).max()
         assert max_diff > 1e-2, "Output with CFG must be different from normal inference"
 
 
-class SDXLModularGuiderTests:
+class SDXLModularGuiderTesterMixin:
     def test_guider_cfg(self):
         pipe = self.get_pipeline()
         pipe = pipe.to(torch_device)
@@ -319,13 +283,13 @@ def test_guider_cfg(self):
         guider = ClassifierFreeGuidance(guidance_scale=1.0)
         pipe.update_components(guider=guider)
 
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         out_no_cfg = pipe(**inputs, output="images")
 
         # forward pass with CFG applied
         guider = ClassifierFreeGuidance(guidance_scale=7.5)
         pipe.update_components(guider=guider)
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         out_cfg = pipe(**inputs, output="images")
 
         assert out_cfg.shape == out_no_cfg.shape
@@ -333,30 +297,57 @@ def test_guider_cfg(self):
         assert max_diff > 1e-2, "Output with CFG must be different from normal inference"
 
 
-class SDXLModularPipelineFastTests(
-    SDXLModularTests,
-    SDXLModularIPAdapterTests,
-    SDXLModularControlNetTests,
-    SDXLModularGuiderTests,
+class TestSDXLModularPipelineFast(
+    SDXLModularTesterMixin,
+    SDXLModularIPAdapterTesterMixin,
+    SDXLModularControlNetTesterMixin,
+    SDXLModularGuiderTesterMixin,
     ModularPipelineTesterMixin,
-    unittest.TestCase,
 ):
     """Test cases for Stable Diffusion XL modular pipeline fast tests."""
 
+    pipeline_class = StableDiffusionXLModularPipeline
+    pipeline_blocks_class = StableDiffusionXLAutoBlocks
+    repo = "hf-internal-testing/tiny-sdxl-modular"
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "negative_prompt",
+            "cross_attention_kwargs",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+    expected_image_output_shape = (1, 3, 64, 64)
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "pt",
+        }
+        return inputs
+
     def test_stable_diffusion_xl_euler(self):
         self._test_stable_diffusion_xl_euler(
-            expected_image_shape=(1, 64, 64, 3),
-            expected_slice=[
-                0.5966781,
-                0.62939394,
-                0.48465094,
-                0.51573336,
-                0.57593524,
-                0.47035995,
-                0.53410417,
-                0.51436996,
-                0.47313565,
-            ],
+            expected_image_shape=self.expected_image_output_shape,
+            expected_slice=torch.tensor(
+                [
+                    0.5966781,
+                    0.62939394,
+                    0.48465094,
+                    0.51573336,
+                    0.57593524,
+                    0.47035995,
+                    0.53410417,
+                    0.51436996,
+                    0.47313565,
+                ],
+                device=torch_device,
+            ),
             expected_max_diff=1e-2,
         )
 
@@ -364,39 +355,65 @@ def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=3e-3)
 
 
-class SDXLImg2ImgModularPipelineFastTests(
-    SDXLModularTests,
-    SDXLModularIPAdapterTests,
-    SDXLModularControlNetTests,
-    SDXLModularGuiderTests,
+class TestSDXLImg2ImgModularPipelineFast(
+    SDXLModularTesterMixin,
+    SDXLModularIPAdapterTesterMixin,
+    SDXLModularControlNetTesterMixin,
+    SDXLModularGuiderTesterMixin,
     ModularPipelineTesterMixin,
-    unittest.TestCase,
 ):
     """Test cases for Stable Diffusion XL image-to-image modular pipeline fast tests."""
 
-    def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
-        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-        image = image / 2 + 0.5
-        inputs["image"] = image
-        inputs["strength"] = 0.8
+    pipeline_class = StableDiffusionXLModularPipeline
+    pipeline_blocks_class = StableDiffusionXLAutoBlocks
+    repo = "hf-internal-testing/tiny-sdxl-modular"
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "negative_prompt",
+            "cross_attention_kwargs",
+            "image",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt", "image"])
+    expected_image_output_shape = (1, 3, 64, 64)
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 4,
+            "output_type": "pt",
+        }
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(torch_device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        inputs["image"] = init_image
+        inputs["strength"] = 0.5
 
         return inputs
 
     def test_stable_diffusion_xl_euler(self):
         self._test_stable_diffusion_xl_euler(
-            expected_image_shape=(1, 64, 64, 3),
-            expected_slice=[
-                0.56943184,
-                0.4702148,
-                0.48048905,
-                0.6235963,
-                0.551138,
-                0.49629188,
-                0.60031277,
-                0.5688907,
-                0.43996853,
-            ],
+            expected_image_shape=self.expected_image_output_shape,
+            expected_slice=torch.tensor(
+                [
+                    0.56943184,
+                    0.4702148,
+                    0.48048905,
+                    0.6235963,
+                    0.551138,
+                    0.49629188,
+                    0.60031277,
+                    0.5688907,
+                    0.43996853,
+                ],
+                device=torch_device,
+            ),
             expected_max_diff=1e-2,
         )
 
@@ -405,20 +422,43 @@ def test_inference_batch_single_identical(self):
 
 
 class SDXLInpaintingModularPipelineFastTests(
-    SDXLModularTests,
-    SDXLModularIPAdapterTests,
-    SDXLModularControlNetTests,
-    SDXLModularGuiderTests,
+    SDXLModularTesterMixin,
+    SDXLModularIPAdapterTesterMixin,
+    SDXLModularControlNetTesterMixin,
+    SDXLModularGuiderTesterMixin,
     ModularPipelineTesterMixin,
-    unittest.TestCase,
 ):
     """Test cases for Stable Diffusion XL inpainting modular pipeline fast tests."""
 
+    pipeline_class = StableDiffusionXLModularPipeline
+    pipeline_blocks_class = StableDiffusionXLAutoBlocks
+    repo = "hf-internal-testing/tiny-sdxl-modular"
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "negative_prompt",
+            "cross_attention_kwargs",
+            "image",
+            "mask_image",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+    expected_image_output_shape = (1, 3, 64, 64)
+
     def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 4,
+            "output_type": "pt",
+        }
         image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
         image = image.cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
         # create mask
         image[8:, 8:, :] = 255
         mask_image = Image.fromarray(np.uint8(image)).convert("L").resize((64, 64))
@@ -431,18 +471,21 @@ def get_dummy_inputs(self, device, seed=0):
 
     def test_stable_diffusion_xl_euler(self):
         self._test_stable_diffusion_xl_euler(
-            expected_image_shape=(1, 64, 64, 3),
-            expected_slice=[
-                0.40872607,
-                0.38842705,
-                0.34893104,
-                0.47837183,
-                0.43792963,
-                0.5332134,
-                0.3716843,
-                0.47274873,
-                0.45000193,
-            ],
+            expected_image_shape=self.expected_image_output_shape,
+            expected_slice=torch.tensor(
+                [
+                    0.40872607,
+                    0.38842705,
+                    0.34893104,
+                    0.47837183,
+                    0.43792963,
+                    0.5332134,
+                    0.3716843,
+                    0.47274873,
+                    0.45000193,
+                ],
+                device=torch_device,
+            ),
             expected_max_diff=1e-2,
         )
 
diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py
index d309fcf35339..1325e5c1de3c 100644
--- a/tests/modular_pipelines/test_modular_pipelines_common.py
+++ b/tests/modular_pipelines/test_modular_pipelines_common.py
@@ -1,9 +1,7 @@
 import gc
 import tempfile
-import unittest
 from typing import Callable, Union
 
-import numpy as np
 import torch
 
 import diffusers
@@ -19,17 +17,9 @@
 )
 
 
-def to_np(tensor):
-    if isinstance(tensor, torch.Tensor):
-        tensor = tensor.detach().cpu().numpy()
-
-    return tensor
-
-
 @require_torch
 class ModularPipelineTesterMixin:
     """
-    This mixin is designed to be used with unittest.TestCase classes.
     It provides a set of common tests for each modular pipeline,
     including:
     - test_pipeline_call_signature: check if the pipeline's __call__ method has all required parameters
@@ -57,9 +47,8 @@ class ModularPipelineTesterMixin:
         ]
     )
 
-    def get_generator(self, seed):
-        device = torch_device if torch_device != "mps" else "cpu"
-        generator = torch.Generator(device).manual_seed(seed)
+    def get_generator(self, seed=0):
+        generator = torch.Generator("cpu").manual_seed(seed)
         return generator
 
     @property
@@ -82,13 +71,7 @@ def pipeline_blocks_class(self) -> Union[Callable, ModularPipelineBlocks]:
             "See existing pipeline tests for reference."
         )
 
-    def get_pipeline(self):
-        raise NotImplementedError(
-            "You need to implement `get_pipeline(self)` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    def get_dummy_inputs(self, device, seed=0):
+    def get_dummy_inputs(self, seed=0):
         raise NotImplementedError(
-            "You need to implement `get_dummy_inputs(self, device, seed)` in the child test class. "
+            "You need to implement `get_dummy_inputs(self, seed=0)` in the child test class. "
             "See existing pipeline tests for reference."
@@ -123,20 +106,23 @@ def batch_params(self) -> frozenset:
             "See existing pipeline tests for reference."
         )
 
-    def setUp(self):
+    def setup_method(self):
         # clean up the VRAM before each test
-        super().setUp()
         torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
 
-    def tearDown(self):
+    def teardown_method(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
-        super().tearDown()
         torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
 
+    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
+        pipeline = self.pipeline_blocks_class().init_pipeline(self.repo, components_manager=components_manager)
+        pipeline.load_components(torch_dtype=torch_dtype)
+        return pipeline
+
     def test_pipeline_call_signature(self):
         pipe = self.get_pipeline()
         input_parameters = pipe.blocks.input_names
@@ -156,7 +142,7 @@ def test_inference_batch_consistent(self, batch_sizes=[2], batch_generator=True)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         inputs["generator"] = self.get_generator(0)
 
         logger = logging.get_logger(pipe.__module__)
@@ -196,7 +182,7 @@ def test_inference_batch_single_identical(
         pipe = self.get_pipeline()
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
 
         # Reset generator in case it is has been used in self.get_dummy_inputs
         inputs["generator"] = self.get_generator(0)
@@ -226,10 +212,9 @@ def test_inference_batch_single_identical(
 
         assert output_batch.shape[0] == batch_size
 
-        max_diff = np.abs(to_np(output_batch[0]) - to_np(output[0])).max()
+        max_diff = torch.abs(output_batch[0] - output[0]).max()
         assert max_diff < expected_max_diff, "Batch inference results different from single inference results"
 
-    @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
     @require_accelerator
     def test_float16_inference(self, expected_max_diff=5e-2):
         pipe = self.get_pipeline()
@@ -240,13 +225,13 @@ def test_float16_inference(self, expected_max_diff=5e-2):
         pipe_fp16.to(torch_device, torch.float16)
         pipe_fp16.set_progress_bar_config(disable=None)
 
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         # Reset generator in case it is used inside dummy inputs
         if "generator" in inputs:
             inputs["generator"] = self.get_generator(0)
         output = pipe(**inputs, output="images")
 
-        fp16_inputs = self.get_dummy_inputs(torch_device)
+        fp16_inputs = self.get_dummy_inputs()
         # Reset generator in case it is used inside dummy inputs
         if "generator" in fp16_inputs:
             fp16_inputs["generator"] = self.get_generator(0)
@@ -283,8 +268,8 @@ def test_inference_is_not_nan_cpu(self):
         pipe.set_progress_bar_config(disable=None)
         pipe.to("cpu")
 
-        output = pipe(**self.get_dummy_inputs("cpu"), output="images")
-        assert np.isnan(to_np(output)).sum() == 0, "CPU Inference returns NaN"
+        output = pipe(**self.get_dummy_inputs(), output="images")
+        assert torch.isnan(output).sum() == 0, "CPU Inference returns NaN"
 
     @require_accelerator
     def test_inference_is_not_nan(self):
@@ -292,8 +277,8 @@ def test_inference_is_not_nan(self):
         pipe.set_progress_bar_config(disable=None)
         pipe.to(torch_device)
 
-        output = pipe(**self.get_dummy_inputs(torch_device), output="images")
-        assert np.isnan(to_np(output)).sum() == 0, "Accelerator Inference returns NaN"
+        output = pipe(**self.get_dummy_inputs(), output="images")
+        assert torch.isnan(output).sum() == 0, "Accelerator Inference returns NaN"
 
     def test_num_images_per_prompt(self):
         pipe = self.get_pipeline()
@@ -309,7 +294,7 @@ def test_num_images_per_prompt(self):
 
         for batch_size in batch_sizes:
             for num_images_per_prompt in num_images_per_prompts:
-                inputs = self.get_dummy_inputs(torch_device)
+                inputs = self.get_dummy_inputs()
 
                 for key in inputs.keys():
                     if key in self.batch_params:
@@ -329,12 +314,12 @@ def test_components_auto_cpu_offload_inference_consistent(self):
 
         image_slices = []
         for pipe in [base_pipe, offload_pipe]:
-            inputs = self.get_dummy_inputs(torch_device)
+            inputs = self.get_dummy_inputs()
             image = pipe(**inputs, output="images")
 
             image_slices.append(image[0, -3:, -3:, -1].flatten())
 
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
 
     def test_save_from_pretrained(self):
         pipes = []
@@ -351,9 +336,9 @@ def test_save_from_pretrained(self):
 
         image_slices = []
         for pipe in pipes:
-            inputs = self.get_dummy_inputs(torch_device)
+            inputs = self.get_dummy_inputs()
             image = pipe(**inputs, output="images")
 
             image_slices.append(image[0, -3:, -3:, -1].flatten())
 
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3