[VPlan] Narrow VPWidenCastRecipe to scalar cast recipe. #166514
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms

Author: Mel Chen (Mel-Chen)

Changes

This patch narrows uniform VPWidenCastRecipe into scalar cast recipes. Since cast operations involve type conversions, directly cloning them would produce an incorrect destination type. Therefore, we emit scalar cast recipes instead of using VPReplicateRecipe.

Based on #143552

Patch is 30.05 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/166514.diff

8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14530539..2740fcd1a21fc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1386,7 +1386,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
+ if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenSelectRecipe,
+ VPReplicateRecipe>(&R))
continue;
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
@@ -1422,6 +1423,15 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
}))
continue;
+ if (auto *CastR = dyn_cast<VPWidenCastRecipe>(RepOrWidenR)) {
+ VPBuilder Builder(CastR);
+ auto *Clone = Builder.createScalarCast(
+ CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
+ CastR->getDebugLoc());
+ CastR->replaceAllUsesWith(Clone);
+ CastR->eraseFromParent();
+ continue;
+ }
auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
RepOrWidenR->operands(),
true /*IsSingleScalar*/);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c6380d30ab2e2..6dd4e5ac697c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -171,6 +171,10 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
return PreservesUniformity(WidenR->getOpcode()) &&
all_of(WidenR->operands(), isSingleScalar);
}
+ if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
+ return PreservesUniformity(CastR->getOpcode()) &&
+ all_of(CastR->operands(), isSingleScalar);
+ }
if (auto *VPI = dyn_cast<VPInstruction>(VPV))
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
(PreservesUniformity(VPI->getOpcode()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
index b430efc9e5283..a5f7764898055 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
@@ -492,18 +492,18 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
@@ -520,7 +520,7 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
index 1dcd665817196..4642144f8d6d4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -66,63 +66,150 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 0, i32 [[TMP2]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 4, i8 8, i8 12>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64>
-; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE46:.*]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE46]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE46]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i8 [[TMP25]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[TMP29]], 1
+; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[VEC_IND]] to <16 x i64>
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
-; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0
+; CHECK-NEXT: [[TMP102:%.*]] = extractelement <16 x i64> [[TMP27]], i32 0
; CHECK-NEXT: [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
-; CHECK-NEXT: [[TMP105:%.*]] = or i64 [[TMP104]], 1
-; CHECK-NEXT: store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
; CHECK: [[PRED_STORE_IF17]]:
-; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x i64> [[TMP27]], i32 1
; CHECK-NEXT: [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
-; CHECK-NEXT: [[TMP111:%.*]] = or i64 [[TMP110]], 1
-; CHECK-NEXT: store i64 [[TMP111]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
; CHECK: [[PRED_STORE_CONTINUE18]]:
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
; CHECK: [[PRED_STORE_IF19]]:
-; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2
+; CHECK-NEXT: [[TMP114:%.*]] = extractelement <16 x i64> [[TMP27]], i32 2
; CHECK-NEXT: [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
-; CHECK-NEXT: [[TMP117:%.*]] = or i64 [[TMP116]], 1
-; CHECK-NEXT: store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]]
; CHECK: [[PRED_STORE_CONTINUE20]]:
-; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
; CHECK: [[PRED_STORE_IF21]]:
-; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3
+; CHECK-NEXT: [[TMP120:%.*]] = extractelement <16 x i64> [[TMP27]], i32 3
; CHECK-NEXT: [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
-; CHECK-NEXT: [[TMP123:%.*]] = or i64 [[TMP122]], 1
-; CHECK-NEXT: store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
; CHECK: [[PRED_STORE_CONTINUE22]]:
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 4
+; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; CHECK: [[PRED_STORE_IF23]]:
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i64> [[TMP27]], i32 4
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP41]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP42]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]]
+; CHECK: [[PRED_STORE_CONTINUE24]]:
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 5
+; CHECK-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; CHECK: [[PRED_STORE_IF25]]:
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i64> [[TMP27]], i32 5
+; CHECK-NEXT: [[TMP45:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP44]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP45]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]]
+; CHECK: [[PRED_STORE_CONTINUE26]]:
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 6
+; CHECK-NEXT: br i1 [[TMP46]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; CHECK: [[PRED_STORE_IF27]]:
+; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x i64> [[TMP27]], i32 6
+; CHECK-NEXT: [[TMP77:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP76]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP77]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]]
+; CHECK: [[PRED_STORE_CONTINUE28]]:
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 7
+; CHECK-NEXT: br i1 [[TMP49]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; CHECK: [[PRED_STORE_IF29]]:
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i64> [[TMP27]], i32 7
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP50]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP51]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]]
+; CHECK: [[PRED_STORE_CONTINUE30]]:
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 8
+; CHECK-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]]
+; CHECK: [[PRED_STORE_IF31]]:
+; CHECK-NEXT: [[TMP53:%.*]] = extractelement <16 x i64> [[TMP27]], i32 8
+; CHECK-NEXT: [[TMP54:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP53]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP54]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE32]]
+; CHECK: [[PRED_STORE_CONTINUE32]]:
+; CHECK-NEXT: [[TMP55:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 9
+; CHECK-NEXT: br i1 [[TMP55]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]]
+; CHECK: [[PRED_STORE_IF33]]:
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <16 x i64> [[TMP27]], i32 9
+; CHECK-NEXT: [[TMP57:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP56]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP57]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE34]]
+; CHECK: [[PRED_STORE_CONTINUE34]]:
+; CHECK-NEXT: [[TMP58:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 10
+; CHECK-NEXT: br i1 [[TMP58]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36:.*]]
+; CHECK: [[PRED_STORE_IF35]]:
+; CHECK-NEXT: [[TMP59:%.*]] = extractelement <16 x i64> [[TMP27]], i32 10
+; CHECK-NEXT: [[TMP60:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP59]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP60]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE36]]
+; CHECK: [[PRED_STORE_CONTINUE36]]:
+; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 11
+; CHECK-NEXT: br i1 [[TMP61]], label %[[PRED_STORE_IF37:.*]], label %[[PRED_STORE_CONTINUE38:.*]]
+; CHECK: [[PRED_STORE_IF37]]:
+; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i64> [[TMP27]], i32 11
+; CHECK-NEXT: [[TMP63:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP62]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP63]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE38]]
+; CHECK: [[PRED_STORE_CONTINUE38]]:
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 12
+; CHECK-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF39:.*]], label %[[PRED_STORE_CONTINUE40:.*]]
+; CHECK: [[PRED_STORE_IF39]]:
+; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i64> [[TMP27]], i32 12
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP65]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP66]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE40]]
+; CHECK: [[PRED_STORE_CONTINUE40]]:
+; CHECK-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 13
+; CHECK-NEXT: br i1 [[TMP67]], label %[[PRED_STORE_IF41:.*]], label %[[PRED_STORE_CONTINUE42:.*]]
+; CHECK: [[PRED_STORE_IF41]]:
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x i64> [[TMP27]], i32 13
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP68]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP69]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE42]]
+; CHECK: [[PRED_STORE_CONTINUE42]]:
+; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 14
+; CHECK-NEXT: br i1 [[TMP70]], label %[[PRED_STORE_IF43:.*]], label %[[PRED_STORE_CONTINUE44:.*]]
+; CHECK: [[PRED_STORE_IF43]]:
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <16 x i64> [[TMP27]], i32 14
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP71]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP72]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE44]]
+; CHECK: [[PRED_STORE_CONTINUE44]]:
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 15
+; CHECK-NEXT: br i1 [[TMP73]], label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46]]
+; CHECK: [[PRED_STORE_IF45]]:
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <16 x i64> [[TMP27]], i32 15
+; CHECK-NEXT: [[TMP75:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP74]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP75]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE46]]
+; CHECK: [[PRED_STORE_CONTINUE46]]:
; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP48:%.*]] = xor i1 [[TMP47]], true
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 64)
; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/L...
[truncated]
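To illustrate the narrowing on generated IR: in the partial-reduce-constant-ops.ll hunk above, a uniform widened cast is replaced by an extract of lane 0 followed by a scalar cast. A minimal before/after sketch, simplified from the CHECK lines (value names are illustrative, not taken from the test):

Before:

```llvm
%ext  = sext <8 x i16> %b.splat to <8 x i32>
%ext2 = sext <8 x i32> %ext to <8 x i64>      ; widened cast, all lanes equal
```

After:

```llvm
%ext    = sext <8 x i16> %b.splat to <8 x i32>
%lane0  = extractelement <8 x i32> %ext, i32 0 ; take the single scalar
%ext2.s = sext i32 %lane0 to i64               ; scalar cast replaces the vector one
```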
I would have thought that a VPWidenCastRecipe could have been replicated with its original underlying instruction, but I see now that it's sometimes used for synthetic instructions with no underlying value. We should probably finish moving those over to VPInstructionWithType
```cpp
if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
        VPWidenSelectRecipe>(VPV))
```
I think you can add WidenCast to this isa.
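Mechanically, that suggestion would fold the cast case into the existing check in vputils::isSingleScalar, roughly as sketched below. The return statement is assumed from the surrounding context, not quoted from the patch:

```cpp
// Sketch of the suggestion: handle widened casts in the existing isa<>
// check instead of adding a dedicated branch (body assumed from context).
if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
        VPWidenSelectRecipe, VPWidenCastRecipe>(VPV))
  return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);
```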
```cpp
        CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
        CastR->getDebugLoc());
    CastR->replaceAllUsesWith(Clone);
    CastR->eraseFromParent();
```
I don't think we need this: removeDeadRecipes should automatically take care of this.
simplifyRecipes runs after this and before removeDeadRecipes, and it is sensitive to the number of users, so we should remove the dead recipe here if we can.
We can then guard it with isDeadRecipe as it is done for the replicate below?
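Put together, the thread's suggestion would keep the explicit cleanup but guard it, along these lines (a sketch, not the committed code; isDeadRecipe is the static helper in VPlanTransforms.cpp that the reviewer refers to):

```cpp
CastR->replaceAllUsesWith(Clone);
// Erase eagerly only if the old recipe is already dead, mirroring the
// guard used for the replicate clone below.
if (isDeadRecipe(*CastR))
  CastR->eraseFromParent();
```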
```cpp
if (auto *CastR = dyn_cast<VPWidenCastRecipe>(RepOrWidenR)) {
  VPBuilder Builder(CastR);
  auto *Clone = Builder.createScalarCast(
      CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
      CastR->getDebugLoc());
  CastR->replaceAllUsesWith(Clone);
  CastR->eraseFromParent();
  continue;
}
```
I think vputils::getSingleScalarClone would be cleaner, but I'm happy to land this first. #161667.
I think you dropped the flags and metadata.
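One hypothetical way to address that would be to build the scalar clone directly, carrying the widened recipe's flags over; the constructor shape shown here is an assumption about VPInstructionWithType's overloads, not the committed fix:

```cpp
// Hypothetical sketch: propagate the widened cast's IR flags (e.g. nneg on
// zext) to the scalar clone instead of dropping them; metadata would need
// similar treatment. The constructor signature is assumed.
auto *Clone = new VPInstructionWithType(
    CastR->getOpcode(), {CastR->getOperand(0)}, CastR->getResultType(),
    /*Flags=*/*CastR, CastR->getDebugLoc());
Clone->insertBefore(*CastR);
CastR->replaceAllUsesWith(Clone);
```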