diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -129,13 +129,50 @@ SinkCandidate->mayReadOrWriteMemory()) continue; - // All recipe users of the sink candidate must be in the same block SinkTo. - if (any_of(SinkCandidate->users(), [SinkTo](VPUser *U) { - auto *UI = dyn_cast(U); - return !UI || UI->getParent() != SinkTo; - })) + bool NeedsDuplicating = false; + // All recipe users of the sink candidate must be in the same block SinkTo + // or all users outside of SinkTo must be uniform-after-vectorization ( + // i.e., only first lane is used) . In the latter case, we need to duplicate + // SinkCandidate. At the moment, we identify such UAV's by looking for the + // address operands of widened memory recipes. + auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, + SinkCandidate](VPUser *U) { + auto *UI = dyn_cast(U); + if (!UI) + return false; + if (UI->getParent() == SinkTo) + return true; + auto *WidenI = dyn_cast(UI); + if (WidenI && WidenI->getAddr() == SinkCandidate) { + NeedsDuplicating = true; + return true; + } + return false; + }; + if (!all_of(SinkCandidate->users(), CanSinkWithUser)) continue; + if (NeedsDuplicating) { + Instruction *I = cast(SinkCandidate->getUnderlyingValue()); + auto *Clone = + new VPReplicateRecipe(I, SinkCandidate->operands(), true, false); + // TODO: add ".cloned" suffix to name of Clone's VPValue. + + Clone->insertBefore(SinkCandidate); + SmallVector Users(SinkCandidate->user_begin(), + SinkCandidate->user_end()); + for (auto *U : Users) { + auto *UI = cast(U); + if (UI->getParent() == SinkTo) + continue; + + for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) { + if (UI->getOperand(Idx) != SinkCandidate) + continue; + UI->setOperand(Idx, Clone); + } + } + } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) WorkList.insert(std::make_pair(SinkTo, Op)); diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -555,7 +555,8 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF]]: -; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP1]], float* [[TMP2]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP1]], float* [[GEP0]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE]]: ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -124,33 +124,34 @@ ; VEC: pred.store.if: ; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 ; VEC-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP6]], 20 -; VEC-NEXT: store i32 [[TMP7]], i32* [[TMP1]], align 4 +; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP0]] +; VEC-NEXT: store i32 [[TMP7]], i32* [[TMP8]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 -; VEC-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if1: -; VEC-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP9]] +; VEC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 ; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 ; VEC-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP11]], 20 -; VEC-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +; VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP10]] +; VEC-NEXT: store i32 [[TMP12]], i32* [[TMP13]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; VEC-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VEC-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; VEC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; VEC: for.body: ; VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 128, [[MIDDLE_BLOCK]] ] ; VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] -; VEC-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; VEC-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP14]], 100 +; VEC-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; VEC-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], 100 ; VEC-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; VEC: if.then: -; VEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], 20 +; VEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 20 ; VEC-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 ; VEC-NEXT: br label [[FOR_INC]] ; VEC: for.inc: @@ -360,23 +361,24 @@ ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 ; VEC-NEXT: br i1 undef, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if: -; VEC-NEXT: store i32 2, i32* [[TMP5]], align 4 +; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* undef, i64 0, i64 [[TMP4]] +; VEC-NEXT: store i32 2, i32* [[TMP8]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: -; VEC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[VEC_PHI]], -; VEC-NEXT: [[PREDPHI]] = select <2 x i1> undef, <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP8]] +; VEC-NEXT: [[TMP9:%.*]] = add <2 x i32> [[VEC_PHI]], +; VEC-NEXT: [[PREDPHI]] = select <2 x i1> undef, <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP9]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VEC: middle.block: -; VEC-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]]) +; VEC-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]]) ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] -; VEC-NEXT: [[TMP11:%.*]] = xor i1 [[CMP_N]], true -; VEC-NEXT: call void @llvm.assume(i1 [[TMP11]]) +; VEC-NEXT: [[TMP12:%.*]] = xor i1 [[CMP_N]], true +; VEC-NEXT: call void @llvm.assume(i1 [[TMP12]]) ; VEC-NEXT: br label [[SCALAR_PH]] ; VEC: scalar.ph: ; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY:%.*]] ] -; VEC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; VEC-NEXT: br label [[FOR_BODY14:%.*]] ; VEC: for.body14: ; VEC-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -559,23 +561,24 @@ ; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 ; VEC-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32 ; VEC-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 -; VEC-NEXT: store i8 [[TMP8]], i8* [[TMP2]], align 1 +; VEC-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] +; VEC-NEXT: store i8 [[TMP8]], i8* [[TMP9]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; VEC: pred.store.if2: -; VEC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* undef, i64 [[TMP10]] +; VEC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 ; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 ; VEC-NEXT: [[TMP13:%.*]] = zext i8 [[TMP12]] to i32 ; VEC-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8 -; VEC-NEXT: store i8 [[TMP14]], i8* [[TMP11]], align 1 +; VEC-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* undef, i64 [[TMP11]] +; VEC-NEXT: store i8 [[TMP14]], i8* [[TMP15]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE3]] ; VEC: pred.store.continue3: ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; VEC-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VEC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef +; VEC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -1057,15 +1057,16 @@ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: store float 1.000000e+01, float* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP0]] +; CHECK-NEXT: store float 1.000000e+01, float* [[TMP7]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP8]] -; CHECK-NEXT: store float 1.000000e+01, float* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP9]] +; CHECK-NEXT: store float 1.000000e+01, float* [[TMP10]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -908,7 +908,7 @@ ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: loop.header: ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%addr>, ir<%iv> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%addr>, ir<%iv> ; CHECK-NEXT: Successor(s): loop.body ; CHECK-EMPTY: ; CHECK-NEXT: loop.body: @@ -927,6 +927,7 @@ ; CHECK-NEXT: CondBit: vp<%4> (then) ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%addr>, ir<%iv> ; CHECK-NEXT: REPLICATE store ir<1.000000e+01>, ir<%gep> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: