
Conversation

@Mel-Chen
Contributor

Pre-commit test #143498

@llvmbot
Member

llvmbot commented Jun 10, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Mel Chen (Mel-Chen)

Changes

Pre-commit test #143498


Full diff: https://github.com/llvm/llvm-project/pull/143552.diff

4 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+10-3)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.h (+5)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/cost-model.ll (+1-1)
  • (added) llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll (+70)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dc3c7bfe5cd1a..c91ca9b90e90b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1180,6 +1180,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
 
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   // Try to narrow wide and replicating recipes to single scalar recipes,
   // based on VPlan analysis. Only process blocks in the loop region for now,
   // without traversing into nested regions, as recipes in replicate regions
@@ -1194,13 +1195,19 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
         continue;
 
       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+      Instruction *UI = RepOrWidenR->getUnderlyingInstr();
+      if (!UI)
+        continue;
+
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
       if (!vputils::isSingleScalar(RepOrWidenR) ||
-          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
-            return !U->usesScalars(RepOrWidenR);
-          }))
+          any_of(RepOrWidenR->users(),
+                 [RepOrWidenR](VPUser *U) {
+                   return !U->usesScalars(RepOrWidenR);
+                 }) ||
+          UI->getType() != TypeInfo.inferScalarType(RepOrWidenR))
         continue;
 
       auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8a4e6fda89c12..7fc801a466dce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -75,6 +75,11 @@ inline bool isSingleScalar(const VPValue *VPV) {
     return PreservesUniformity(WidenR->getOpcode()) &&
            all_of(WidenR->operands(), isSingleScalar);
   }
+  if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
+    return PreservesUniformity(CastR->getOpcode()) &&
+           all_of(CastR->operands(), isSingleScalar);
+  }
+
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (PreservesUniformity(VPI->getOpcode()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 2c6fe4f5c808e..641ed90b6de34 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -379,7 +379,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 0
 ; CHECK-NEXT:    store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META12:![0-9]+]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
new file mode 100644
index 0000000000000..1ac4fe70921e6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
+; CHECK-LABEL: define void @minbw_cast(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[BOOL1:%.*]], i1 [[BOOL2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[BOOL1_EXT:%.*]] = zext i1 [[BOOL1]] to i32
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BOOL2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[BOOL1_EXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT:    store i8 [[TMP3]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[BOOL2_EXT:%.*]] = zext i1 [[BOOL2]] to i32
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[BOOL1_EXT]], [[BOOL2_EXT]]
+; CHECK-NEXT:    [[XOR_TRUNC:%.*]] = trunc i32 [[XOR]] to i8
+; CHECK-NEXT:    store i8 [[XOR_TRUNC]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %bool1.ext = zext i1 %bool1 to i32
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %bool2.ext = zext i1 %bool2 to i32
+  %xor = xor i32 %bool1.ext, %bool2.ext
+  %xor.trunc = trunc i32 %xor to i8
+  store i8 %xor.trunc, ptr %dst, align 1
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ult i64 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

@github-actions

github-actions bot commented Jun 10, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Contributor

@fhahn fhahn left a comment

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.

IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623
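
For reference, the guard this PR adds encodes exactly that constraint. A condensed sketch, paraphrasing the diff above rather than quoting it verbatim:

  // If truncateToMinimalBitwidths changed the VPlan-inferred scalar type
  // away from the underlying IR instruction's type, cloning that
  // instruction into a VPReplicateRecipe would reintroduce the wider
  // type, so skip the recipe.
  Instruction *UI = RepOrWidenR->getUnderlyingInstr();
  if (!UI || UI->getType() != TypeInfo.inferScalarType(RepOrWidenR))
    continue;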

@Mel-Chen
Contributor Author

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.

IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently running into some issues and working to resolve them.

Contributor

@fhahn fhahn left a comment

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.
IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently running into some issues and working to resolve them.

To use VPInstructions instead of VPReplicateRecipe, I think #140623 and its follow-ups will be needed.

But I am missing the issue with VPWidenRecipe; the type of the result is defined by the type of the operands, so it should gracefully handle the operands being extended or truncated?

@Mel-Chen
Contributor Author

Could you add a brief description of the problem? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.
IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently running into some issues and working to resolve them.

To use VPInstructions instead of VPReplicateRecipe, I think #140623 and its follow-ups will be needed.

Yes, #140623 is necessary if we want to solve the issue by emitting a VPInstruction, as I’m currently trying to do.

But I am missing the issue with VPWidenRecipe; the type of the result is defined by the type of the operands, so it should gracefully handle the operands being extended or truncated?

When executing the recipe, the type of VPWidenRecipe isn't derived from the underlying value; it seems to default to a vector type. However, what we need to produce here is actually a scalar type.

; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
Contributor Author

This is caused by the fact that we haven't narrowed VPWidenCastRecipe yet.
#166514 improves the issue.
I'm not sure whether we actually need to split this into two patches (this one and #166514), or if we should just go straight to applying the latter.

Contributor

I think this is still probably profitable for most targets, happy to land this first and then further improve it in #166514

Contributor

Extracts from a vector to an integer scalar are very expensive on AArch64 in general, I think, so any cases where we introduce additional extracts are likely to have a negative impact. If we can avoid those together with #166514, then that's preferable.

Contributor

@lukel97 lukel97 left a comment

LGTM, just a nit

                                     all_of(Rep->operands(), isSingleScalar));
  }
  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
          VPWidenSelectRecipe>(VPV))
Contributor

A VPWidenCastRecipe can only ever have an Instruction::CastOps opcode, so PreservesUniformity will always return true; can we just move it into the case here?
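
A minimal sketch of that suggestion, assuming the existing isa<> case simply checks all operands with isSingleScalar (the exact in-tree body may differ):

  // Every VPWidenCastRecipe opcode is an Instruction::CastOps, for which
  // PreservesUniformity always holds, so the dedicated dyn_cast branch can
  // fold into the generic recipe-kind check.
  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
          VPWidenSelectRecipe, VPWidenCastRecipe>(VPV))
    return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);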

Contributor

@artagnon artagnon left a comment

I had a similar patch at #162340, but @fhahn pointed out the regression. I now see your follow-up, and we can probably merge that with tweaks?

@lukel97
Contributor

lukel97 commented Nov 5, 2025

I had a similar patch at #162340, but @fhahn pointed out the regression. I now see your follow-up, and we can probably merge that with tweaks?

Oh whoops, I didn't notice your PR, sorry about that. But as for the test change, I'm pretty sure VectorCombine will do some similar sort of scalarization anyway, so we might as well model it in VPlan too.
