
Conversation

@Mel-Chen
Contributor

Pre-commit test #143498

@llvmbot
Member

llvmbot commented Jun 10, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Mel Chen (Mel-Chen)

Changes

Pre-commit test #143498


Full diff: https://github.com/llvm/llvm-project/pull/143552.diff

4 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+10-3)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.h (+5)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/cost-model.ll (+1-1)
  • (added) llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll (+70)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dc3c7bfe5cd1a..c91ca9b90e90b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1180,6 +1180,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
 
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   // Try to narrow wide and replicating recipes to single scalar recipes,
   // based on VPlan analysis. Only process blocks in the loop region for now,
   // without traversing into nested regions, as recipes in replicate regions
@@ -1194,13 +1195,19 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
         continue;
 
       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+      Instruction *UI = RepOrWidenR->getUnderlyingInstr();
+      if (!UI)
+        continue;
+
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
       if (!vputils::isSingleScalar(RepOrWidenR) ||
-          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
-            return !U->usesScalars(RepOrWidenR);
-          }))
+          any_of(RepOrWidenR->users(),
+                 [RepOrWidenR](VPUser *U) {
+                   return !U->usesScalars(RepOrWidenR);
+                 }) ||
+          UI->getType() != TypeInfo.inferScalarType(RepOrWidenR))
         continue;
 
       auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8a4e6fda89c12..7fc801a466dce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -75,6 +75,11 @@ inline bool isSingleScalar(const VPValue *VPV) {
     return PreservesUniformity(WidenR->getOpcode()) &&
            all_of(WidenR->operands(), isSingleScalar);
   }
+  if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
+    return PreservesUniformity(CastR->getOpcode()) &&
+           all_of(CastR->operands(), isSingleScalar);
+  }
+
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (PreservesUniformity(VPI->getOpcode()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 2c6fe4f5c808e..641ed90b6de34 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -379,7 +379,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 0
 ; CHECK-NEXT:    store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META12:![0-9]+]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
new file mode 100644
index 0000000000000..1ac4fe70921e6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
+; CHECK-LABEL: define void @minbw_cast(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[BOOL1:%.*]], i1 [[BOOL2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[BOOL1_EXT:%.*]] = zext i1 [[BOOL1]] to i32
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BOOL2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[BOOL1_EXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT:    store i8 [[TMP3]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[BOOL2_EXT:%.*]] = zext i1 [[BOOL2]] to i32
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[BOOL1_EXT]], [[BOOL2_EXT]]
+; CHECK-NEXT:    [[XOR_TRUNC:%.*]] = trunc i32 [[XOR]] to i8
+; CHECK-NEXT:    store i8 [[XOR_TRUNC]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %bool1.ext = zext i1 %bool1 to i32
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %bool2.ext = zext i1 %bool2 to i32
+  %xor = xor i32 %bool1.ext, %bool2.ext
+  %xor.trunc = trunc i32 %xor to i8
+  store i8 %xor.trunc, ptr %dst, align 1
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ult i64 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

@github-actions

github-actions bot commented Jun 10, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Contributor

@fhahn fhahn left a comment

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.

IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623
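
For reference, the guard this PR adds encodes exactly that constraint. A condensed sketch, paraphrasing the diff above rather than quoting it verbatim:

  // If truncateToMinimalBitwidths changed the VPlan-inferred scalar type
  // away from the underlying IR instruction's type, cloning that
  // instruction into a VPReplicateRecipe would reintroduce the wider
  // type, so skip the recipe.
  Instruction *UI = RepOrWidenR->getUnderlyingInstr();
  if (!UI || UI->getType() != TypeInfo.inferScalarType(RepOrWidenR))
    continue;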

@Mel-Chen
Contributor Author

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.

IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently running into some issues and working to resolve them.

Contributor

@fhahn fhahn left a comment

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.
IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently running into some issues and working to resolve them.

To use VPInstructions instead of VPReplicateRecipe, I think #140623 and its follow-ups will be needed.

But I am missing the issue with VPWidenRecipe; the type of the result is defined by the type of the operands, so it should gracefully handle the operands being extended or truncated?

@Mel-Chen
Contributor Author

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.
IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently running into some issues and working to resolve them.

To use VPInstructions instead of VPReplicateRecipe, I think #140623 and its follow-ups will be needed.

Yes, #140623 is necessary if we want to solve the issue by emitting a VPInstruction, as I’m currently trying to do.

But I am missing the issue with VPWidenRecipe; the type of the result is defined by the type of the operands, so it should gracefully handle the operands being extended or truncated?

When executing the recipe, the type of VPWidenRecipe isn't derived from the underlying value; it seems to default to a vector type. However, what we need to produce here is actually a scalar type.

; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
Contributor Author

This is caused by the fact that we haven't narrowed VPWidenCastRecipe yet.
#166514 improves the issue.
I'm not sure whether we actually need to split this into two patches (this one and #166514), or if we should just go straight to applying the latter.

Contributor

I think this is still probably profitable for most targets, happy to land this first and then further improve it in #166514

Contributor

Extracts from a vector to an integer scalar are very expensive on AArch64 in general, I think, so any cases where we introduce additional extracts are likely to have a negative impact. If we can avoid those together with #166514, then that's preferable.

Contributor

@lukel97 lukel97 left a comment

LGTM, just a nit

                                     all_of(Rep->operands(), isSingleScalar));
  }
  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
          VPWidenSelectRecipe>(VPV))
Contributor

A VPWidenCastRecipe can only ever have an Instruction::CastOps opcode, so PreservesUniformity will always return true; can we just move it into the case here?
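
A minimal sketch of that suggestion, assuming the existing isa<> case simply checks all operands with isSingleScalar (the exact in-tree body may differ):

  // Every VPWidenCastRecipe opcode is an Instruction::CastOps, for which
  // PreservesUniformity always holds, so the dedicated dyn_cast branch can
  // fold into the generic recipe-kind check.
  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
          VPWidenSelectRecipe, VPWidenCastRecipe>(VPV))
    return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);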

Contributor

@artagnon artagnon left a comment

I had a similar patch at #162340, but @fhahn pointed out the regression. I now see your follow-up, and we can probably merge that with tweaks?

@lukel97
Contributor

lukel97 commented Nov 5, 2025

I had a similar patch at #162340, but @fhahn pointed out the regression. I now see your follow-up, and we can probably merge that with tweaks?

Oh whoops, I didn't notice your PR, sorry about that. But as for the test change, I'm pretty sure VectorCombine will do some similar sort of scalarization anyway, so we might as well model it in VPlan too.
