diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -178,6 +178,8 @@ ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); } }; + + DenseMap BB2VPBB; }; /// TODO: The following VectorizationFactor was pulled out of diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8128,6 +8128,11 @@ VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); + VPBuilder::InsertPointGuard Guard(Builder); + VPBasicBlock *SrcVPBB = Builder.BB2VPBB[Src]; + assert(SrcVPBB && "Cannot find corresponding VPBB for the BB"); + Builder.setInsertPoint(SrcVPBB, SrcVPBB->end()); + if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); @@ -8203,6 +8208,11 @@ continue; } + VPBuilder::InsertPointGuard Guard(Builder); + VPBasicBlock *VPBB = Builder.BB2VPBB[BB]; + assert(VPBB && "Cannot find corresponding VPBB for the BB"); + Builder.setInsertPoint(VPBB, VPBB->end()); + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } @@ -8924,6 +8934,8 @@ VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); + Builder.BB2VPBB[BB]= VPBB; + // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. for (Instruction &I : BB->instructionsWithoutDebug(false)) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -181,9 +181,9 @@ ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFNONE-NEXT: [[TMP5:%.*]] = icmp ugt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFNONE-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_LOAD]], [[TMP5]]) -; TFNONE-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], zeroinitializer, [[TMP6]] +; TFNONE-NEXT: [[TMP6:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFNONE-NEXT: [[TMP7:%.*]] = call @foo_vector( [[WIDE_LOAD]], [[TMP5]]) +; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP6]], zeroinitializer, [[TMP7]] ; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] ; TFNONE-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 ; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() @@ -236,12 +236,12 @@ ; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFALWAYS-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) ; TFALWAYS-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; TFALWAYS-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) -; TFALWAYS-NEXT: [[TMP9:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFALWAYS-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], zeroinitializer, [[TMP8]] +; TFALWAYS-NEXT: [[TMP8:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFALWAYS-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer +; TFALWAYS-NEXT: [[TMP10:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) +; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], zeroinitializer, [[TMP10]] ; TFALWAYS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[TMP12:%.*]] = or [[TMP7]], [[TMP10]] +; TFALWAYS-NEXT: [[TMP12:%.*]] = or [[TMP7]], [[TMP9]] ; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP11]], i32 8, [[TMP12]]) ; TFALWAYS-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 @@ -295,12 +295,12 @@ ; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) ; TFFALLBACK-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; TFFALLBACK-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) -; TFFALLBACK-NEXT: [[TMP9:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFFALLBACK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], zeroinitializer, [[TMP8]] +; TFFALLBACK-NEXT: [[TMP8:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFFALLBACK-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer +; TFFALLBACK-NEXT: [[TMP10:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) +; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], zeroinitializer, [[TMP10]] ; TFFALLBACK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[TMP12:%.*]] = or [[TMP7]], [[TMP10]] +; TFFALLBACK-NEXT: [[TMP12:%.*]] = or [[TMP7]], [[TMP9]] ; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP11]], i32 8, [[TMP12]]) ; TFFALLBACK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 @@ -441,12 +441,12 @@ ; TFALWAYS-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) ; TFALWAYS-NEXT: [[TMP7:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFALWAYS-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer -; TFALWAYS-NEXT: [[TMP9:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) -; TFALWAYS-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; TFALWAYS-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP10]]) -; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[TMP9]], [[TMP11]] +; TFALWAYS-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; TFALWAYS-NEXT: [[TMP10:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) +; TFALWAYS-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP9]]) +; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[TMP10]], [[TMP11]] ; TFALWAYS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[TMP13:%.*]] = or [[TMP8]], [[TMP10]] +; TFALWAYS-NEXT: [[TMP13:%.*]] = or [[TMP8]], [[TMP9]] ; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP12]], i32 8, [[TMP13]]) ; TFALWAYS-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 @@ -504,12 +504,12 @@ ; TFFALLBACK-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) ; TFFALLBACK-NEXT: [[TMP7:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFFALLBACK-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer -; TFFALLBACK-NEXT: [[TMP9:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) -; TFFALLBACK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; TFFALLBACK-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP10]]) -; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[TMP9]], [[TMP11]] +; TFFALLBACK-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; TFFALLBACK-NEXT: [[TMP10:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) +; TFFALLBACK-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP9]]) +; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[TMP10]], [[TMP11]] ; TFFALLBACK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[TMP13:%.*]] = or [[TMP8]], [[TMP10]] +; TFFALLBACK-NEXT: [[TMP13:%.*]] = or [[TMP8]], [[TMP9]] ; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP12]], i32 8, [[TMP13]]) ; TFFALLBACK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -1119,11 +1119,11 @@ ; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison) -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, [[TMP7]], poison) +; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] ; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd [[VEC_PHI]], [[PREDPHI]] ; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 @@ -1180,11 +1180,11 @@ ; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison) -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, [[TMP7]], poison) +; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] ; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[PREDPHI]]) ; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 @@ -1248,14 +1248,14 @@ ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP14]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP18]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or [[TMP15]], [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP17]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, [[TMP14]], poison) +; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or [[TMP14]], [[TMP16]] ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED ; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED @@ -5,32 +6,106 @@ ; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_strict -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[LOAD_VEC]], %[[VEC_PHI]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP3]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP6]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; CHECK-ORDERED-NEXT: [[TMP3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP5]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -51,32 +126,106 @@ ; Same as above but where fadd has a fast-math flag. define float @fadd_strict_fmf(ptr noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_fmf -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX:%.*]], %vector.body ] -; CHECK-ORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[RDX]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[LOAD_VEC]]) -; CHECK-ORDERED: for.end: -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fadd_strict_fmf -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ , %vector.ph ], [ [[FADD_VEC:%.*]], %vector.body ] -; CHECK-UNORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[FADD_VEC]] = fadd nnan <8 x float> [[LOAD_VEC]], [[VEC_PHI]] -; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[FADD_VEC]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr -; CHECK-UNORDERED: [[FADD:%.*]] = fadd nnan float [[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_fmf -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict_fmf +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict_fmf +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP3]] = fadd nnan <8 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP6]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict_fmf +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; CHECK-ORDERED-NEXT: [[TMP3]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP5]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -96,51 +245,142 @@ } define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_unroll -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ] -; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]]) -; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]]) -; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]]) -; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_strict_unroll -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_LOAD1]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_LOAD2]], %[[VEC_PHI2]] -; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_LOAD3]], %[[VEC_PHI3]] -; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]] -; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]] -; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]] -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict_unroll +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict_unroll +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP10]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd <8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd <8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP13]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP14]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP15]], [[BIN_RDX7]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP18]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict_unroll +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 16 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP10]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 24 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP12]], <8 x float> [[WIDE_LOAD1]]) +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP13]], <8 x float> [[WIDE_LOAD2]]) +; CHECK-ORDERED-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP14]], <8 x float> [[WIDE_LOAD3]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -168,63 +408,166 @@ ; return sum; define float @fadd_strict_unroll_last_val(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_unroll_last_val -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ] -; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]]) -; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]]) -; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]]) -; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]]) -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ {{.*}}, %scalar.ph ] -; CHECK-ORDERED: %[[LOAD5:.*]] = load float, ptr -; CHECK-ORDERED: %[[FADD]] = fadd float %[[SUM_PHI]], %[[LOAD5]] -; CHECK-ORDERED: for.cond.cleanup -; CHECK-ORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX4]], %middle.block ] -; CHECK-ORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01 -; CHECK-ORDERED: store float %[[FADD_42]], ptr %b -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ] -; CHECK-ORDERED: ret float %[[SUM_LCSSA]] - -; CHECK-UNORDERED-LABEL: @fadd_strict_unroll_last_val -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_PHI1]], %[[VEC_LOAD1]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_PHI2]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_PHI3]], %[[VEC_LOAD3]] -; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_PHI4]], %[[VEC_LOAD4]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]] -; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]] -; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]] -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]] -; CHECK-UNORDERED: for.cond.cleanup -; CHECK-UNORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01 -; CHECK-UNORDERED: store float %[[FADD_42]], ptr %b -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ] -; CHECK-UNORDERED: ret float %[[SUM_LCSSA]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_strict_unroll_last_val +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-NOT-VECTORIZED: for.body.preheader: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.cond.cleanup: +; CHECK-NOT-VECTORIZED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01 +; CHECK-NOT-VECTORIZED-NEXT: store float [[FADD2]], ptr [[B]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[SUM_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_strict_unroll_last_val +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-UNORDERED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-UNORDERED: for.body.preheader: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP10]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <8 x float> [[VEC_PHI1]], [[WIDE_LOAD4]] +; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd <8 x float> [[VEC_PHI2]], [[WIDE_LOAD5]] +; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd <8 x float> [[VEC_PHI3]], [[WIDE_LOAD6]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP13]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP14]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP15]], [[BIN_RDX7]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP18]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-UNORDERED: for.cond.cleanup: +; CHECK-UNORDERED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01 +; CHECK-UNORDERED-NEXT: store float [[FADD2]], ptr [[B]], align 4 +; CHECK-UNORDERED-NEXT: br label [[FOR_END]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: ret float [[SUM_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_strict_unroll_last_val +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-ORDERED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-ORDERED: for.body.preheader: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 16 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP10]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 24 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP12]], <8 x float> [[WIDE_LOAD1]]) +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP13]], <8 x float> [[WIDE_LOAD2]]) +; CHECK-ORDERED-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP14]], <8 x float> [[WIDE_LOAD3]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP17]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-ORDERED: for.cond.cleanup: +; CHECK-ORDERED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01 +; CHECK-ORDERED-NEXT: store float [[FADD2]], ptr [[B]], align 4 +; CHECK-ORDERED-NEXT: br label [[FOR_END]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: ret float [[SUM_LCSSA]] +; + + entry: %cmp = icmp sgt i64 %n, 0 @@ -252,55 +595,162 @@ } define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_strict_interleave -; CHECK-ORDERED: entry -; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, ptr %a, i64 1 -; CHECK-ORDERED: %[[LOAD1:.*]] = load float, ptr %a -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr %[[ARRAYIDX]] -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ] -; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ] -; CHECK-ORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-ORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]]) -; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: ret void - -; CHECK-UNORDERED-LABEL: @fadd_strict_interleave -; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, ptr %a, i64 1 -; CHECK-UNORDERED: %[[LOADA1:.*]] = load float, ptr %a -; CHECK-UNORDERED: %[[LOADA2:.*]] = load float, ptr %[[ARRAYIDX]] -; CHECK-UNORDERED: vector.ph -; CHECK-UNORDERED: %[[INS2:.*]] = insertelement <4 x float> , float %[[LOADA2]], i32 0 -; CHECK-UNORDERED: %[[INS1:.*]] = insertelement <4 x float> , float %[[LOADA1]], i32 0 -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <4 x float> [ %[[INS2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <4 x float> [ %[[INS1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-UNORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> -; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <4 x float> %[[STRIDED1:.*]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[STRIDED2:.*]], %[[VEC_PHI2]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]]) -; CHECK-UNORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], {{.*}} -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD2]], {{.*}} -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[RDX1]], %middle.block ] -; CHECK-UNORDERED: %[[SUM2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX2]], %middle.block ] -; CHECK-UNORDERED: store float %[[SUM1]] -; CHECK-UNORDERED: store float %[[SUM2]] -; CHECK-UNORDERED: ret void - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define void @fadd_strict_interleave +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NOT-VECTORIZED-NEXT: [[A1:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[A2:%.*]] = load float, ptr [[ARRAYIDXA]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD1]] = fadd float [[TMP0]], [[ADD_PHI2]] +; CHECK-NOT-VECTORIZED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[OR]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD2]] = fadd float [[TMP1]], [[ADD_PHI1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: ret void +; +; CHECK-UNORDERED-LABEL: define void @fadd_strict_interleave +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-UNORDERED-NEXT: [[A1:%.*]] = load float, ptr [[A]], align 4 +; CHECK-UNORDERED-NEXT: [[A2:%.*]] = load float, ptr [[ARRAYIDXA]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[A2]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = insertelement <4 x float> , float [[A1]], i32 0 +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-UNORDERED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd <4 x float> [[STRIDED_VEC]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd <4 x float> [[STRIDED_VEC2]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP8]]) +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP9]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD1]] = fadd float [[TMP13]], [[ADD_PHI2]] +; CHECK-UNORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[OR]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD2]] = fadd float [[TMP14]], [[ADD_PHI1]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 +; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 +; CHECK-UNORDERED-NEXT: ret void +; +; CHECK-ORDERED-LABEL: define void @fadd_strict_interleave +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-ORDERED-NEXT: [[A1:%.*]] = load float, ptr [[A]], align 4 +; CHECK-ORDERED-NEXT: [[A2:%.*]] = load float, ptr [[ARRAYIDXA]], align 4 +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-ORDERED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> +; CHECK-ORDERED-NEXT: [[TMP6]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[STRIDED_VEC2]]) +; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI1]], <4 x float> [[STRIDED_VEC]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 +; CHECK-ORDERED-NEXT: [[ADD1]] = fadd float [[TMP9]], [[ADD_PHI2]] +; CHECK-ORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[OR]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD2]] = fadd float [[TMP10]], [[ADD_PHI1]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 +; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 +; CHECK-ORDERED-NEXT: ret void +; + + entry: %arrayidxa = getelementptr inbounds float, ptr %a, i64 1 @@ -330,42 +780,147 @@ } define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_of_sum -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr -; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, ptr -; CHECK-ORDERED: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]] -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]]) -; CHECK-ORDERED: for.end.loopexit -; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ] -; CHECK-ORDERED: ret float %[[PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_of_sum -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <4 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <4 x float>, ptr -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <4 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <4 x float> %[[VEC_LOAD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[VEC_PHI]], %[[VEC_FADD1]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]] -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]] -; CHECK-UNORDERED: for.end.loopexit -; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ] -; CHECK-UNORDERED: ret float %[[SUM]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_of_sum +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-NOT-VECTORIZED: for.body.preheader: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]] +; CHECK-NOT-VECTORIZED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end.loopexit: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RES]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_of_sum +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-UNORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-UNORDERED: for.body.preheader: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[TMP7]] = fadd <4 x float> [[VEC_PHI]], [[TMP6]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP7]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP10]], [[TMP11]] +; CHECK-UNORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-UNORDERED: for.end.loopexit: +; CHECK-UNORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_END]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-UNORDERED-NEXT: ret float [[RES]] +; +; CHECK-ORDERED-LABEL: define float @fadd_of_sum +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01 +; CHECK-ORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-ORDERED: for.body.preheader: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP6]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP9]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-ORDERED: for.end.loopexit: +; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_END]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-ORDERED-NEXT: ret float [[RES]] +; + + entry: %arrayidx = getelementptr inbounds float, ptr %a, i64 1 @@ -392,63 +947,214 @@ } define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_conditional -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr -; CHECK-ORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer -; CHECK-ORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0 -; CHECK-ORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue -; CHECK-ORDERED: pred.load.continue6 -; CHECK-ORDERED: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ] -; CHECK-ORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], -; CHECK-ORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> , <4 x float> %[[PHI1]] -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]]) -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ] -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-ORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00 -; CHECK-ORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc -; CHECK-ORDERED: if.then -; CHECK-ORDERED: %[[LOAD3:.*]] = load float, ptr -; CHECK-ORDERED: br label %for.inc -; CHECK-ORDERED: for.inc -; CHECK-ORDERED: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ] -; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[RDX_PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_conditional -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[PHI:.*]] = phi <4 x float> [ , %vector.ph ], [ %[[VEC_FADD:.*]], %pred.load.continue6 ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr -; CHECK-UNORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer -; CHECK-UNORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0 -; CHECK-UNORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue -; CHECK-UNORDERED: pred.load.continue6 -; CHECK-UNORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], -; CHECK-UNORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> , <4 x float> %[[PRED_PHI:.*]] -; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[PHI]], %[[PRED]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ] -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00 -; CHECK-UNORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc -; CHECK-UNORDERED: if.then -; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, ptr -; CHECK-UNORDERED: for.inc -; CHECK-UNORDERED: %[[PHI:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ] -; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RDX_PHI]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_conditional +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-NOT-VECTORIZED: if.then: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_INC]] +; CHECK-NOT-VECTORIZED: for.inc: +; CHECK-NOT-VECTORIZED-NEXT: [[PHI:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_conditional +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 +; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-UNORDERED: pred.load.if: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-UNORDERED: pred.load.continue: +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 +; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-UNORDERED: pred.load.if1: +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-UNORDERED: pred.load.continue2: +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-UNORDERED: pred.load.if3: +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP17]] +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK-UNORDERED: pred.load.continue4: +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 +; CHECK-UNORDERED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK-UNORDERED: pred.load.if5: +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK-UNORDERED: pred.load.continue6: +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ] +; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> , <4 x float> [[TMP27]] +; CHECK-UNORDERED-NEXT: [[TMP28]] = fadd <4 x float> [[VEC_PHI]], [[PREDPHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP28]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP31]], 0.000000e+00 +; CHECK-UNORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-UNORDERED: if.then: +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: br label [[FOR_INC]] +; CHECK-UNORDERED: for.inc: +; CHECK-UNORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP32]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_conditional +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 +; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-ORDERED: pred.load.if: +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-ORDERED: pred.load.continue: +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 +; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-ORDERED: pred.load.if1: +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-ORDERED: pred.load.continue2: +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-ORDERED: pred.load.if3: +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP17]] +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK-ORDERED: pred.load.continue4: +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 +; CHECK-ORDERED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK-ORDERED: pred.load.if5: +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4 +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK-ORDERED: pred.load.continue6: +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ] +; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> , <4 x float> [[TMP27]] +; CHECK-ORDERED-NEXT: [[TMP28]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[PREDPHI]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP30]], 0.000000e+00 +; CHECK-ORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-ORDERED: if.then: +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: br label [[FOR_INC]] +; CHECK-ORDERED: for.inc: +; CHECK-ORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP31]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; + + entry: br label %for.body @@ -480,44 +1186,150 @@ ; Test to check masking correct, using the "llvm.loop.vectorize.predicate.enable" attribute define float @fadd_predicated(ptr noalias nocapture %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_predicated -; CHECK-ORDERED: vector.ph -; CHECK-ORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1 -; CHECK-ORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i64 0 -; CHECK-ORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ] -; CHECK-ORDERED: pred.load.continue2 -; CHECK-ORDERED: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ] -; CHECK-ORDERED: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> -; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]]) -; CHECK-ORDERED: for.end: -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[RES_PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_predicated -; CHECK-UNORDERED: vector.ph -; CHECK-UNORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1 -; CHECK-UNORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i64 0 -; CHECK-UNORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <2 x float> [ , %vector.ph ], [ %[[FADD:.*]], %pred.load.continue2 ] -; CHECK-UNORDERED: %[[ICMP:.*]] = icmp ule <2 x i64> %vec.ind, %[[SPLAT]] -; CHECK-UNORDERED: pred.load.continue2 -; CHECK-UNORDERED: %[[FADD]] = fadd <2 x float> %[[RDX_PHI]], {{.*}} -; CHECK-UNORDERED: %[[MASK:.*]] = select <2 x i1> %[[ICMP]], <2 x float> %[[FADD]], <2 x float> %[[RDX_PHI]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %[[MASK]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[LOAD]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[SUM]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_predicated +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[L2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[L3:%.*]] = load float, ptr [[L2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[SUM_0_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_predicated +; CHECK-UNORDERED-SAME: (ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-UNORDERED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-UNORDERED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ , [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-UNORDERED-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-UNORDERED: pred.load.if: +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-UNORDERED: pred.load.continue: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-UNORDERED-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK-UNORDERED: pred.load.if1: +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1 +; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-UNORDERED: pred.load.continue2: +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <2 x float> [[VEC_PHI]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP13]], <2 x float> [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[TMP14]]) +; CHECK-UNORDERED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-UNORDERED-NEXT: [[L2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[L3:%.*]] = load float, ptr [[L2]], align 4 +; CHECK-UNORDERED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[SUM_0_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_predicated +; CHECK-ORDERED-SAME: (ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-ORDERED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-ORDERED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-ORDERED-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-ORDERED: pred.load.if: +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-ORDERED: pred.load.continue: +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-ORDERED-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK-ORDERED: pred.load.if1: +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1 +; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-ORDERED: pred.load.continue2: +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP12]], <2 x float> +; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI]], <2 x float> [[TMP13]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-ORDERED-NEXT: [[L2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[L3:%.*]] = load float, ptr [[L2]], align 4 +; CHECK-ORDERED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[SUM_0_LCSSA]] +; + + entry: br label %for.body @@ -539,30 +1351,96 @@ ; Negative test - loop contains multiple fadds which we cannot safely reorder define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_multiple -; CHECK-ORDERED-NOT: vector.body - -; CHECK-UNORDERED-LABEL: @fadd_multiple -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]] -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]] -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RET]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_multiple +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_multiple +; CHECK-UNORDERED-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP6]] = fadd <8 x float> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP9]] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_multiple +; CHECK-ORDERED-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; + + entry: br label %for.body @@ -588,30 +1466,96 @@ ; Negative test - loop contains two fadds and only one fadd has the fast flag, ; which we cannot safely reorder. define float @fadd_multiple_one_flag(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag -; CHECK-ORDERED-NOT: vector.body - -; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]] -; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd fast <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]] -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]] -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD2]] = fadd fast float %[[FADD1]], %[[LOAD2]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RET]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_multiple_one_flag +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP1]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_multiple_one_flag +; CHECK-UNORDERED-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP6]] = fadd fast <8 x float> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP9]] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[RDX]] +; +; CHECK-ORDERED-LABEL: define float @fadd_multiple_one_flag +; CHECK-ORDERED-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP1]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[RDX]] +; + + entry: br label %for.body @@ -653,14 +1597,71 @@ ; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even ; with the -hints-allow-reordering flag set to true. define float @induction_and_reduction(ptr nocapture readonly %values, float %init, ptr noalias nocapture %A, i64 %N) { -; CHECK-ORDERED-LABEL: @induction_and_reduction -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @induction_and_reduction +; CHECK-NOT-VECTORIZED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[X_014:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[X_014]], ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @induction_and_reduction +; CHECK-UNORDERED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[X_014:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: store float [[X_014]], ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @induction_and_reduction +; CHECK-ORDERED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[X_014:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: store float [[X_014]], ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD3_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @induction_and_reduction -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -685,50 +1686,142 @@ ; As above, but with the FP induction being unordered (fast) the loop can be vectorized with strict reductions define float @fast_induction_and_reduction(ptr nocapture readonly %values, float %init, ptr noalias nocapture %A, i64 %N) { -; CHECK-ORDERED-LABEL: @fast_induction_and_reduction -; CHECK-ORDERED: vector.ph -; CHECK-ORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ] -; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr -; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]]) -; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ] -; CHECK-ORDERED: store float %[[IND_SUM_PHI]], ptr -; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00 -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ] -; CHECK-ORDERED: ret float %[[RES_PHI]] - -; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction -; CHECK-UNORDERED: vector.ph -; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ , %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ] -; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr -; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]] -; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ] -; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ] -; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], ptr -; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00 -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES_PHI]] - -; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fast_induction_and_reduction +; CHECK-NOT-VECTORIZED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[X_014:%.*]] = phi fast float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[X_014]], ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fast_induction_and_reduction +; CHECK-UNORDERED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[DOTCAST]] +; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 +; CHECK-UNORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-UNORDERED-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP3]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP6]] = fadd <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP6]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: store float [[X_014]], ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP9]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD3_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fast_induction_and_reduction +; CHECK-ORDERED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[DOTCAST]] +; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 +; CHECK-ORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-ORDERED-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP3]], align 4 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP6]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: store float [[X_014]], ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD3_LCSSA]] +; + + entry: br label %for.body @@ -755,15 +1848,83 @@ ; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even ; with the -hints-allow-reordering flag set to true. define float @fast_induction_unordered_reduction(ptr nocapture readonly %values, float %init, ptr noalias nocapture %A, ptr noalias nocapture %B, i64 %N) { +; CHECK-NOT-VECTORIZED-LABEL: define float @fast_induction_unordered_reduction +; CHECK-NOT-VECTORIZED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[X_021:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[X_021]], ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD6]] +; +; CHECK-UNORDERED-LABEL: define float @fast_induction_unordered_reduction +; CHECK-UNORDERED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[X_021:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: store float [[X_021]], ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]] +; CHECK-UNORDERED-NEXT: ret float [[ADD6]] +; +; CHECK-ORDERED-LABEL: define float @fast_induction_unordered_reduction +; CHECK-ORDERED-SAME: (ptr nocapture readonly [[VALUES:%.*]], float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], ptr noalias nocapture [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[X_021:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: store float [[X_021]], ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[VALUES]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]] +; CHECK-ORDERED-NEXT: ret float [[ADD6]] +; -; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction -; CHECK-ORDERED-NOT: vector.body - -; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -791,59 +1952,133 @@ ; Test reductions for a VF of 1 and a UF > 1. define float @fadd_scalar_vf(ptr noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_scalar_vf -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, {{.*}} ], [ %[[FADD4:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD1:.*]] = load float, ptr -; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-ORDERED: %[[LOAD3:.*]] = load float, ptr -; CHECK-ORDERED: %[[LOAD4:.*]] = load float, ptr -; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[VEC_PHI]], %[[LOAD1]] -; CHECK-ORDERED: %[[FADD2:.*]] = fadd float %[[FADD1]], %[[LOAD2]] -; CHECK-ORDERED: %[[FADD3:.*]] = fadd float %[[FADD2]], %[[LOAD3]] -; CHECK-ORDERED: %[[FADD4]] = fadd float %[[FADD3]], %[[LOAD4]] -; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-ORDERED: scalar.ph -; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[FADD4]], %middle.block ] -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ] -; CHECK-ORDERED: %[[LOAD5:.*]] = load float, ptr -; CHECK-ORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]] -; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[FADD4]], %middle.block ] -; CHECK-ORDERED: ret float %[[RES_PHI]] - -; CHECK-UNORDERED-LABEL: @fadd_scalar_vf -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD1:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD3:.*]], %vector.body ] -; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD4:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr -; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr -; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, ptr -; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD1]] = fadd float %[[LOAD1]], %[[VEC_PHI1]] -; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[LOAD2]], %[[VEC_PHI2]] -; CHECK-UNORDERED: %[[FADD3]] = fadd float %[[LOAD3]], %[[VEC_PHI3]] -; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]] -; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]] -; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]] -; CHECK-UNORDERED: scalar.ph -; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ] -; CHECK-UNORDERED: %[[LOAD5:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]] -; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: ret float %[[RES_PHI]] - -; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf -; CHECK-NOT-VECTORIZED-NOT: @vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_scalar_vf +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_scalar_vf +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd float [[TMP8]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd float [[TMP9]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd float [[TMP10]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd float [[TMP11]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP13]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX4:%.*]] = fadd float [[TMP14]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX5:%.*]] = fadd float [[TMP15]], [[BIN_RDX4]] +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_scalar_vf +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd float [[VEC_PHI]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = fadd float [[TMP12]], [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = fadd float [[TMP13]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP15]] = fadd float [[TMP14]], [[TMP11]] +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + + entry: br label %for.body @@ -864,59 +2099,134 @@ ; Same as above but where fadd has a fast-math flag. define float @fadd_scalar_vf_fmf(ptr noalias nocapture readonly %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_scalar_vf_fmf -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ] -; CHECK-ORDERED: [[LOAD1:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD2:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD3:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD4:%.*]] = load float, ptr -; CHECK-ORDERED: [[FADD1:%.*]] = fadd nnan float [[VEC_PHI]], [[LOAD1]] -; CHECK-ORDERED: [[FADD2:%.*]] = fadd nnan float [[FADD1]], [[LOAD2]] -; CHECK-ORDERED: [[FADD3:%.*]] = fadd nnan float [[FADD2]], [[LOAD3]] -; CHECK-ORDERED: [[FADD4]] = fadd nnan float [[FADD3]], [[LOAD4]] -; CHECK-ORDERED-NOT: @llvm.vector.reduce.fadd -; CHECK-ORDERED: scalar.ph: -; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD4]], %middle.block ] -; CHECK-ORDERED: for.body: -; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ] -; CHECK-ORDERED: [[LOAD5:%.*]] = load float, ptr -; CHECK-ORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]] -; CHECK-ORDERED: for.end: -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[FADD4]], %middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fadd_scalar_vf_fmf -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD1:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI4:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ] -; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, ptr -; CHECK-UNORDERED: [[FADD1]] = fadd nnan float [[LOAD1]], [[VEC_PHI1]] -; CHECK-UNORDERED: [[FADD2]] = fadd nnan float [[LOAD2]], [[VEC_PHI2]] -; CHECK-UNORDERED: [[FADD3]] = fadd nnan float [[LOAD3]], [[VEC_PHI3]] -; CHECK-UNORDERED: [[FADD4]] = fadd nnan float [[LOAD4]], [[VEC_PHI4]] -; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan float [[FADD2]], [[FADD1]] -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan float [[FADD3]], [[BIN_RDX1]] -; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd nnan float [[FADD4]], [[BIN_RDX2]] -; CHECK-UNORDERED: scalar.ph: -; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, ptr -; CHECK-UNORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_scalar_vf_fmf +; CHECK-NOT-VECTORIZED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]] +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_scalar_vf_fmf +; CHECK-UNORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd nnan float [[TMP8]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd nnan float [[TMP9]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd nnan float [[TMP10]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd nnan float [[TMP11]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan float [[TMP13]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX4:%.*]] = fadd nnan float [[TMP14]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX5:%.*]] = fadd nnan float [[TMP15]], [[BIN_RDX4]] +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP17]], [[SUM_07]] +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fadd_scalar_vf_fmf +; CHECK-ORDERED-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd nnan float [[VEC_PHI]], [[TMP8]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = fadd nnan float [[TMP12]], [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = fadd nnan float [[TMP13]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP15]] = fadd nnan float [[TMP14]], [[TMP11]] +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP17]], [[SUM_07]] +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] +; + ; CHECK-UORDERED: for.end -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[BIN_RDX3]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] -; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf_fmf -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -937,30 +2247,100 @@ ; Test case where the reduction step is a first-order recurrence. define double @reduction_increment_by_first_order_recurrence() { -; CHECK-ORDERED-LABEL: @reduction_increment_by_first_order_recurrence( -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[RED:%.*]] = phi double [ 0.000000e+00, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ] -; CHECK-ORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ] -; CHECK-ORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double> -; CHECK-ORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> -; CHECK-ORDERED: [[RED_NEXT]] = call double @llvm.vector.reduce.fadd.v4f64(double [[RED]], <4 x double> [[TMP1]]) -; CHECK-ORDERED: scalar.ph: -; CHECK-ORDERED: = phi double [ 0.000000e+00, %entry ], [ [[RED_NEXT]], %middle.block ] -; -; CHECK-UNORDERED-LABEL: @reduction_increment_by_first_order_recurrence( -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[RED:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ] -; CHECK-UNORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double> -; CHECK-UNORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> -; CHECK-UNORDERED: [[RED_NEXT]] = fadd <4 x double> [[TMP1]], [[RED]] -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[RDX:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[RED_NEXT]]) -; CHECK-UNORDERED: scalar.ph: -; CHECK-UNORDERED: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[RDX]], %middle.block ] -; -; CHECK-NOT-VECTORIZED-LABEL: @reduction_increment_by_first_order_recurrence( -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define double @reduction_increment_by_first_order_recurrence() { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[LOOP:%.*]] +; CHECK-NOT-VECTORIZED: loop: +; CHECK-NOT-VECTORIZED-NEXT: [[RED:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FOR:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RED_NEXT]] = fadd double [[FOR]], [[RED]] +; CHECK-NOT-VECTORIZED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NOT-VECTORIZED: exit: +; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret double [[RES]] +; +; CHECK-UNORDERED-LABEL: define double @reduction_increment_by_first_order_recurrence() { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> +; CHECK-UNORDERED-NEXT: [[TMP2]] = fadd <4 x double> [[TMP1]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-UNORDERED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 +; CHECK-UNORDERED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3 +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[LOOP:%.*]] +; CHECK-UNORDERED: loop: +; CHECK-UNORDERED-NEXT: [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-UNORDERED-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-UNORDERED-NEXT: [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]] +; CHECK-UNORDERED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-UNORDERED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-UNORDERED: exit: +; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret double [[RES]] +; +; CHECK-ORDERED-LABEL: define double @reduction_increment_by_first_order_recurrence() { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> +; CHECK-ORDERED-NEXT: [[TMP2]] = call double @llvm.vector.reduce.fadd.v4f64(double [[VEC_PHI]], <4 x double> [[TMP1]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-ORDERED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 +; CHECK-ORDERED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3 +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[LOOP:%.*]] +; CHECK-ORDERED: loop: +; CHECK-ORDERED-NEXT: [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-ORDERED-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-ORDERED-NEXT: [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]] +; CHECK-ORDERED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-ORDERED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-ORDERED: exit: +; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret double [[RES]] ; entry: br label %loop @@ -983,14 +2363,76 @@ ; We should not mark the fadd as an ordered reduction here as there are ; more than 2 uses of the instruction define float @fadd_multiple_use(i64 %n) { -; CHECK-ORDERED-LABEL: @fadd_multiple_use +; CHECK-NOT-VECTORIZED-LABEL: define float @fadd_multiple_use +; CHECK-NOT-VECTORIZED-SAME: (i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00 +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] +; CHECK-NOT-VECTORIZED: bb1: +; CHECK-NOT-VECTORIZED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[PHI2]] +; CHECK-NOT-VECTORIZED: bb2: +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[PHI3]] +; +; CHECK-UNORDERED-LABEL: define float @fadd_multiple_use +; CHECK-UNORDERED-SAME: (i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ] +; CHECK-UNORDERED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ] +; CHECK-UNORDERED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ] +; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00 +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1 +; CHECK-UNORDERED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] +; CHECK-UNORDERED: bb1: +; CHECK-UNORDERED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[PHI2]] +; CHECK-UNORDERED: bb2: +; CHECK-UNORDERED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ] +; CHECK-UNORDERED-NEXT: ret float [[PHI3]] +; +; CHECK-ORDERED-LABEL: define float @fadd_multiple_use +; CHECK-ORDERED-SAME: (i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ] +; CHECK-ORDERED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ] +; CHECK-ORDERED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ] +; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00 +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1 +; CHECK-ORDERED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] +; CHECK-ORDERED: bb1: +; CHECK-ORDERED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[PHI2]] +; CHECK-ORDERED: bb2: +; CHECK-ORDERED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ] +; CHECK-ORDERED-NEXT: ret float [[PHI3]] +; ; CHECK-ORDERED-LABEL-NOT: vector.body -; CHECK-UNORDERED-LABEL: @fadd_multiple_use ; CHECK-UNORDERED-LABEL-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1019,59 +2461,176 @@ ; Test case where the loop has a call to the llvm.fmuladd intrinsic. define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_strict -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ] -; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr -; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]]) -; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]]) -; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]]) -; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]]) -; CHECK-ORDERED: for.body: -; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-ORDERED: [[LOAD:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD1:%.*]] = load float, ptr -; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX3]], %middle.block ] - -; CHECK-UNORDERED-LABEL: @fmuladd_strict -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ , %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] -; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr -; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]]) -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_strict +; CHECK-NOT-VECTORIZED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_strict +; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP10]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr [[TMP16]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x float>, ptr [[TMP17]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x float>, ptr [[TMP18]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x float>, ptr [[TMP19]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP20]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP21]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP22]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP23]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP21]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <8 x float> [[TMP22]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP23]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP26]], float [[TMP27]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_strict +; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 16 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP10]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 24 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP16]], align 4 +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP17]], align 4 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP18]], align 4 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr [[TMP19]], align 4 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[TMP20]]) +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP24]], <8 x float> [[TMP21]]) +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP25]], <8 x float> [[TMP22]]) +; CHECK-ORDERED-NEXT: [[TMP27]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP26]], <8 x float> [[TMP23]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; + + entry: br label %for.body @@ -1094,73 +2653,159 @@ ; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic. define float @fmuladd_scalar_vf(ptr %a, ptr %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf -; CHECK-ORDERED: vector.body: -; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ] -; CHECK-ORDERED: [[LOAD:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD1:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD2:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD3:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD4:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD5:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD6:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD7:%.*]] = load float, ptr -; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]] -; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]] -; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]] -; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]] -; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]] -; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]] -; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]] -; CHECK-ORDERED: [[FADD3]] = fadd float [[FADD2]], [[FMUL3]] -; CHECK-ORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-ORDERED: scalar.ph -; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD3]], %middle.block ] -; CHECK-ORDERED: for.body -; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-ORDERED: [[LOAD8:%.*]] = load float, ptr -; CHECK-ORDERED: [[LOAD9:%.*]] = load float, ptr -; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]]) -; CHECK-ORDERED: for.end -; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], %middle.block ] -; CHECK-ORDERED: ret float [[RES]] - -; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, ptr -; CHECK-UNORDERED: [[FMULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]]) -; CHECK-UNORDERED: [[FMULADD1]] = tail call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]]) -; CHECK-UNORDERED: [[FMULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]]) -; CHECK-UNORDERED: [[FMULADD3]] = tail call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]] -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]] -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]] -; CHECK-UNORDERED: scalar.ph: -; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ] -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, ptr -; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]]) -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_scalar_vf +; CHECK-NOT-VECTORIZED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_scalar_vf +; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP12]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP13]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP14]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP15]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP20]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP16]], float [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP21]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP17]], float [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP22]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float [[TMP18]], float [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP23]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP19]], float [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP21]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX4:%.*]] = fadd float [[TMP22]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX5:%.*]] = fadd float [[TMP23]], [[BIN_RDX4]] +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[BIN_RDX5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_scalar_vf +; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP12]], align 4 +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP13]], align 4 +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP14]], align 4 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP15]], align 4 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = fmul float [[TMP8]], [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = fmul float [[TMP9]], [[TMP17]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = fmul float [[TMP10]], [[TMP18]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = fmul float [[TMP11]], [[TMP19]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = fadd float [[VEC_PHI]], [[TMP20]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = fadd float [[TMP24]], [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = fadd float [[TMP25]], [[TMP22]] +; CHECK-ORDERED-NEXT: [[TMP27]] = fadd float [[TMP26]], [[TMP23]] +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP29]], float [[TMP30]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; + + entry: br label %for.body @@ -1183,14 +2828,65 @@ ; Test case where the reduction phi is one of the mul operands of the fmuladd. define float @fmuladd_phi_is_mul_operand(ptr %a, ptr %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_phi_is_mul_operand +; CHECK-NOT-VECTORIZED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_phi_is_mul_operand +; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_phi_is_mul_operand +; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1213,14 +2909,59 @@ ; Test case where the reduction phi is two operands of the fmuladd. define float @fmuladd_phi_is_two_operands(ptr %a, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_phi_is_two_operands +; CHECK-NOT-VECTORIZED-SAME: (ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_phi_is_two_operands +; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_phi_is_two_operands +; CHECK-ORDERED-SAME: (ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1242,37 +2983,129 @@ ; Test case with multiple calls to llvm.fmuladd, which is not safe to reorder ; so is only vectorized in the unordered (fast) case. define float @fmuladd_multiple(ptr %a, ptr %b, i64 %n) { -; CHECK-ORDERED-LABEL: @fmuladd_multiple -; CHECK-ORDERED-NOT: vector.body: - -; CHECK-UNORDERED-LABEL: @fmuladd_multiple -; CHECK-UNORDERED: vector.body: -; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ , %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] -; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: [[FMULADD:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]]) -; CHECK-UNORDERED: [[FMULADD2]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[FMULADD]]) -; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float> -; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]]) -; CHECK-UNORDERED: for.body: -; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD2:%.*]], %for.body ] -; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr -; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr -; CHECK-UNORDERED: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]]) -; CHECK-UNORDERED: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[MULADD]]) -; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD2]], %for.body ], [ [[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float [[RES]] - -; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple -; CHECK-NOT-VECTORIZED-NOT: vector.body: +; CHECK-NOT-VECTORIZED-LABEL: define float @fmuladd_multiple +; CHECK-NOT-VECTORIZED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @fmuladd_multiple +; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 16 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP10]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 24 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr [[TMP16]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x float>, ptr [[TMP17]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x float>, ptr [[TMP18]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x float>, ptr [[TMP19]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[TMP24]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[TMP20]]) +; CHECK-UNORDERED-NEXT: [[TMP25]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[TMP21]]) +; CHECK-UNORDERED-NEXT: [[TMP26]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[TMP22]]) +; CHECK-UNORDERED-NEXT: [[TMP27]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[TMP23]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP25]], [[TMP24]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <8 x float> [[TMP26]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP27]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]]) +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP30]], float [[TMP31]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP30]], float [[TMP31]], float [[MULADD]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @fmuladd_multiple +; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; + + entry: br label %for.body @@ -1296,14 +3129,68 @@ ; Same as above but the first fmuladd is one of the mul operands of the second fmuladd. define float @multiple_fmuladds_mul_operand(ptr %a, ptr %b, i64 %n) { -; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @multiple_fmuladds_mul_operand +; CHECK-NOT-VECTORIZED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @multiple_fmuladds_mul_operand +; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @multiple_fmuladds_mul_operand +; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1327,14 +3214,68 @@ ; Same as above but the first fmuladd is two of the operands of the second fmuladd. define float @multiple_fmuladds_two_operands(ptr %a, ptr %b, i64 %n) { -; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands -; CHECK-ORDERED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define float @multiple_fmuladds_two_operands +; CHECK-NOT-VECTORIZED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]]) +; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK-NOT-VECTORIZED: for.end: +; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-UNORDERED-LABEL: define float @multiple_fmuladds_two_operands +; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]]) +; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; +; CHECK-ORDERED-LABEL: define float @multiple_fmuladds_two_operands +; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]]) +; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]]) +; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]] +; CHECK-ORDERED: for.end: +; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]] +; -; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands -; CHECK-UNORDERED-NOT: vector.body -; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands -; CHECK-NOT-VECTORIZED-NOT: vector.body entry: br label %for.body @@ -1360,38 +3301,122 @@ ; Test case with invariant store where fadd is strict. define void @reduction_store_to_invariant_address(ptr %dst, ptr readonly %src) { -; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address( -; CHECK-ORDERED: entry -; CHECK-ORDERED: %[[DEST_PTR:.*]] = getelementptr inbounds float, ptr %dst, i64 42 -; CHECK-ORDERED: vector.body -; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] -; CHECK-ORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, ptr -; CHECK-ORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD_VEC]]) -; CHECK-ORDERED: middle.block -; CHECK-ORDERED: store float %[[RDX]], ptr %[[DEST_PTR]] -; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[LOAD:.*]] = load float, ptr -; CHECK-ORDERED: %[[FADD:.*]] = fadd float %{{.*}}, %[[LOAD]] -; CHECK-ORDERED: store float %[[FADD]], ptr %[[DEST_PTR]] - -; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address( -; CHECK-UNORDERED: entry -; CHECK-UNORDERED: %[[DEST_PTR:.*]] = getelementptr inbounds float, ptr %dst, i64 42 -; CHECK-UNORDERED: vector.body -; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ] -; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, ptr -; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[VEC_PHI]], %[[LOAD_VEC]] -; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd -; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]]) -; CHECK-UNORDERED: store float %[[RDX]], ptr %[[DEST_PTR]] -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]] -; CHECK-UNORDERED: store float %[[FADD]], ptr %[[DEST_PTR]] - -; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address( -; CHECK-NOT-VECTORIZED-NOT: vector.body +; CHECK-NOT-VECTORIZED-LABEL: define void @reduction_store_to_invariant_address +; CHECK-NOT-VECTORIZED-SAME: (ptr [[DST:%.*]], ptr readonly [[SRC:%.*]]) { +; CHECK-NOT-VECTORIZED-NEXT: entry: +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 42 +; CHECK-NOT-VECTORIZED-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT-VECTORIZED: for.body: +; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[TMP1]] +; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 +; CHECK-NOT-VECTORIZED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000 +; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; CHECK-NOT-VECTORIZED: for.cond.cleanup: +; CHECK-NOT-VECTORIZED-NEXT: ret void +; +; CHECK-UNORDERED-LABEL: define void @reduction_store_to_invariant_address +; CHECK-UNORDERED-SAME: (ptr [[DST:%.*]], ptr readonly [[SRC:%.*]]) { +; CHECK-UNORDERED-NEXT: entry: +; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 42 +; CHECK-UNORDERED-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-UNORDERED: vector.memcheck: +; CHECK-UNORDERED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 172 +; CHECK-UNORDERED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4000 +; CHECK-UNORDERED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARRAYIDX]], [[SCEVGEP1]] +; CHECK-UNORDERED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-UNORDERED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-UNORDERED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-UNORDERED: vector.ph: +; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[TMP0]] +; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !alias.scope !40 +; CHECK-UNORDERED-NEXT: [[TMP3]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-UNORDERED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-UNORDERED-NEXT: store float [[TMP5]], ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MEMCHECK]] ], [ 0.000000e+00, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-UNORDERED-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 +; CHECK-UNORDERED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-UNORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000 +; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK-UNORDERED: for.cond.cleanup: +; CHECK-UNORDERED-NEXT: ret void +; +; CHECK-ORDERED-LABEL: define void @reduction_store_to_invariant_address +; CHECK-ORDERED-SAME: (ptr [[DST:%.*]], ptr readonly [[SRC:%.*]]) { +; CHECK-ORDERED-NEXT: entry: +; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 42 +; CHECK-ORDERED-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-ORDERED: vector.memcheck: +; CHECK-ORDERED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 172 +; CHECK-ORDERED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4000 +; CHECK-ORDERED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARRAYIDX]], [[SCEVGEP1]] +; CHECK-ORDERED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-ORDERED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-ORDERED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-ORDERED: vector.ph: +; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[TMP0]] +; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !alias.scope !36 +; CHECK-ORDERED-NEXT: [[TMP3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-ORDERED-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-ORDERED: middle.block: +; CHECK-ORDERED-NEXT: store float [[TMP3]], ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED: scalar.ph: +; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MEMCHECK]] ], [ 0.000000e+00, [[ENTRY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-ORDERED: for.body: +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP5]], [[TMP6]] +; CHECK-ORDERED-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-ORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000 +; CHECK-ORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; CHECK-ORDERED: for.cond.cleanup: +; CHECK-ORDERED-NEXT: ret void +; + + entry: %arrayidx = getelementptr inbounds float, ptr %dst, i64 42 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -128,23 +128,23 @@ ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or i32 [[TMP7]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or i32 [[TMP8]], 1 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1 ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() @@ -307,11 +307,11 @@ ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP7]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext [[TMP10]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP13]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext [[TMP11]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 @@ -477,16 +477,16 @@ ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext [[TMP6]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext [[TMP6]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP10]], i32 1, [[TMP8]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext [[TMP12]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP15]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext [[TMP13]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP15]], i32 1, [[TMP12]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -297,14 +297,14 @@ ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[TMP17:%.*]] = xor [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP18:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, [[TMP14]], poison) +; CHECK-NEXT: [[TMP19:%.*]] = xor [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], [[TMP19]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP20]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() @@ -369,11 +369,11 @@ ; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i64 0), poison, zeroinitializer) -; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] -; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0 -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) -; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD1]], zeroinitializer +; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] +; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i32 0 +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[TMP14]], poison) +; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select [[TMP14]], [[WIDE_MASKED_LOAD1]], zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP17]]) ; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[TMP18]], [[VEC_PHI]] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -476,8 +476,8 @@ ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison) ; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -passes=loop-vectorize -scalable-vectorization=off -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s ; NOTE: These tests aren't really target-specific, but it's convenient to target AArch64 @@ -8,28 +9,50 @@ ; The original loop had an unconditional uniform load. Let's make sure ; we don't artificially create new predicated blocks for the load. define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 { -; CHECK-LABEL: @uniform_load( +; CHECK-LABEL: define void @uniform_load +; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr noalias readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK: [[N_MINUS_VF:%.*]] = sub i64 %n, [[VSCALE_X_VF:.*]] -; CHECK: [[CMP:%.*]] = icmp ugt i64 %n, [[VSCALE_X_VF]] -; CHECK: [[N2:%.*]] = select i1 [[CMP]], i64 [[N_MINUS_VF]], i64 0 -; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n) +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0 -; CHECK-NEXT: [[LOAD_VAL:%.*]] = load i32, ptr %src, align 4 -; CHECK-NOT: load i32, ptr %src, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr %dst, i64 [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]]) -; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4 -; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], -; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NOT_ACTIVE_LANE_MASK]], i32 0 -; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %middle.block, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -52,20 +75,80 @@ ; However, we at least ensure the mask is the overlap of the loop predicate ; and the original condition. define void @cond_uniform_load(ptr nocapture %dst, ptr nocapture readonly %src, ptr nocapture readonly %cond, i64 %n) #0 { -; CHECK-LABEL: @cond_uniform_load( +; CHECK-LABEL: define void @cond_uniform_load +; CHECK-SAME: (ptr nocapture [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr nocapture readonly [[COND:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n) -; CHECK: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr %src, i64 0 -; CHECK-NEXT: [[SRC_SPLAT:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[N]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[SRC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0 -; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> poison) +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope !4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 4, <4 x i1> [[TMP9]], <4 x i32> poison), !alias.scope !7 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[PREDPHI]], ptr [[TMP13]], i32 4, <4 x i1> [[TMP12]]), !alias.scope !9, !noalias !11 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX6]], i64 [[TMP3]]) +; CHECK-NEXT: [[INDEX_NEXT7]] = add i64 [[INDEX6]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP14]], i32 0 +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP17]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -37,11 +37,11 @@ ; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 ; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] -; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 -; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) -; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] +; VLENUNK-NEXT: [[TMP12:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] +; VLENUNK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 +; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, [[TMP11]], poison) +; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], zeroinitializer, [[WIDE_MASKED_LOAD]] ; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]] ; VLENUNK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S \ ; RUN: < %s | FileCheck %s ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 \ @@ -6,61 +7,109 @@ target triple = "riscv64" define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { -; CHECK-LABEL: @select_icmp +; CHECK-LABEL: define i32 @select_icmp +; CHECK-SAME: (i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP7]], [[X]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; -; SCALABLE-LABEL: @select_icmp +; SCALABLE-LABEL: define i32 @select_icmp +; SCALABLE-SAME: (i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP7:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP8]] = select [[TMP7]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] +; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], zeroinitializer -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP8]], zeroinitializer +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; SCALABLE-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], [[X]] +; SCALABLE-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE: for.end: +; SCALABLE-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[COND_LCSSA]] ; entry: br label %for.body @@ -81,61 +130,109 @@ } define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { -; CHECK-LABEL: @select_fcmp +; CHECK-LABEL: define i32 @select_fcmp +; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP7]], [[X]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; -; SCALABLE-LABEL: @select_fcmp +; SCALABLE-LABEL: define i32 @select_fcmp +; SCALABLE-SAME: (float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[X:%.*]], i64 0 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[X]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP7:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP8]] = select [[TMP7]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] +; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], zeroinitializer -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP8]], zeroinitializer +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] +; SCALABLE-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP13]], [[X]] +; SCALABLE-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE: for.end: +; SCALABLE-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[COND_LCSSA]] ; entry: br label %for.body @@ -156,53 +253,101 @@ } define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_i32_from_icmp +; CHECK-LABEL: define i32 @select_const_i32_from_icmp +; CHECK-SAME: (ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 7, i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 7 +; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_const_i32_from_icmp +; SCALABLE-LABEL: define i32 @select_const_i32_from_icmp +; SCALABLE-SAME: (ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP8]] = select [[TMP7]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 7, i32 3 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP8]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 7, i32 3 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP13:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP13]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3 +; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 7 +; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP13]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[DOTLCSSA]] ; entry: br label %for.body @@ -223,65 +368,113 @@ } define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 { -; CHECK-LABEL: @select_i32_from_icmp +; CHECK-LABEL: define i32 @select_i32_from_icmp +; CHECK-SAME: (ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 ; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[B]], i32 [[A]] +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[B]], i32 [[A]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 [[B]] +; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_i32_from_icmp +; SCALABLE-LABEL: define i32 @select_i32_from_icmp +; SCALABLE-SAME: (ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[A:%.*]], i64 0 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 ; SCALABLE-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[B:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[B]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP8]] = select [[TMP7]], [[VEC_PHI]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 ; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], [[DOTSPLAT]] -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[B]], i32 [[A]] +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP8]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[B]], i32 [[A]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP13:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP13]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3 +; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 [[B]] +; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP13]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[DOTLCSSA]] ; entry: br label %for.body @@ -302,53 +495,101 @@ } define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_i32_from_fcmp +; CHECK-LABEL: define i32 @select_const_i32_from_fcmp +; CHECK-SAME: (ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 2 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 1, i32 2 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 +; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_const_i32_from_fcmp +; SCALABLE-LABEL: define i32 @select_const_i32_from_fcmp +; SCALABLE-SAME: (ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast ueq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[TMP7:%.*]] = fcmp fast ueq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP8]] = select [[TMP7]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 2 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP8]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 1, i32 2 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP13:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP13]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 +; SCALABLE-NEXT: [[TMP17:%.*]] = fcmp fast ueq float [[TMP16]], 3.000000e+00 +; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 1 +; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP13]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[DOTLCSSA]] ; entry: br label %for.body @@ -369,11 +610,41 @@ } define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_f32_from_icmp -; CHECK-NOT: vector.body +; CHECK-LABEL: define float @select_const_f32_from_icmp +; CHECK-SAME: (ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3 +; CHECK-NEXT: [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00 +; CHECK-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]] +; CHECK-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ] +; CHECK-NEXT: ret float [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_const_f32_from_icmp -; SCALABLE-NOT: vector.body +; SCALABLE-LABEL: define float @select_const_f32_from_icmp +; SCALABLE-SAME: (ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] +; SCALABLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3 +; SCALABLE-NEXT: [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00 +; SCALABLE-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 +; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ] +; SCALABLE-NEXT: ret float [[DOTLCSSA]] ; entry: br label %for.body @@ -394,65 +665,129 @@ } define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 { -; CHECK-LABEL: @pred_select_const_i32_from_icmp +; CHECK-LABEL: define i32 @pred_select_const_i32_from_icmp +; CHECK-SAME: (ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> , <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP3]], <4 x i32> poison) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[PREDPHI]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 1, i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i32 1, i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 35 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP12]], 2 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[R_1_LCSSA]] ; -; SCALABLE-LABEL: @pred_select_const_i32_from_icmp +; SCALABLE-LABEL: define i32 @pred_select_const_i32_from_icmp +; SCALABLE-SAME: (ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 35, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 35, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP8:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0 -; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, [[TMP8]], poison) -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP13:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), [[VEC_PHI]] -; SCALABLE-NEXT: [[TMP14:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[PREDPHI]] = select [[TMP8]], [[TMP13]], [[VEC_PHI]] -; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, [[TMP7]], poison) +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP12:%.*]] = select [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), [[VEC_PHI]] +; SCALABLE-NEXT: [[PREDPHI]] = select [[TMP7]], [[TMP12]], [[VEC_PHI]] +; SCALABLE-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[PREDPHI]], zeroinitializer -; SCALABLE-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP18]], i32 1, i32 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP16]], i32 1, i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; SCALABLE-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; SCALABLE-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP17]], 35 +; SCALABLE-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALABLE: if.then: +; SCALABLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; SCALABLE-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; SCALABLE-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP18]], 2 +; SCALABLE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] +; SCALABLE-NEXT: br label [[FOR_INC]] +; SCALABLE: for.inc: +; SCALABLE-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; SCALABLE-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE: for.end.loopexit: +; SCALABLE-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[R_1_LCSSA]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s ; Make sure that integer poison-generating flags (i.e., nuw/nsw, exact and inbounds) @@ -19,18 +20,55 @@ ; Drop poison-generating flags from 'sub' and 'getelementptr' feeding a masked load. ; Test for PR52111. define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, - ptr %output) local_unnamed_addr #0 { -; CHECK-LABEL: @drop_scalar_nuw_nsw( +; CHECK-LABEL: define void @drop_scalar_nuw_nsw +; CHECK-SAME: (ptr noalias nocapture readonly [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[INPUT:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP8]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP5]], i32 4, <4 x i1> [[TMP2]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i64 [[I27]] +; CHECK-NEXT: [[I30:%.*]] = load float, ptr [[I29]], align 4, !invariant.load !0 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; + ptr %output) local_unnamed_addr #0 { entry: br label %loop.header @@ -60,18 +98,55 @@ ; Drop poison-generating flags from 'sub' and 'getelementptr' feeding a masked load. ; In this case, 'sub' and 'getelementptr' are not guarded by the predicate. define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input, - ptr %output) local_unnamed_addr #0 { -; CHECK-LABEL: @drop_nonpred_scalar_nuw_nsw( +; CHECK-LABEL: define void @drop_nonpred_scalar_nuw_nsw +; CHECK-SAME: (ptr noalias nocapture readonly [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP5:%.*]] = sub i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[INPUT:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP8]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP5]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I27:%.*]] = sub i64 [[IV]], 1 +; CHECK-NEXT: [[I29:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[I27]] +; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I30:%.*]] = load float, ptr [[I29]], align 4, !invariant.load !0 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; + ptr %output) local_unnamed_addr #0 { entry: br label %loop.header @@ -100,18 +175,56 @@ ; Preserve poison-generating flags from vector 'sub', 'mul' and 'getelementptr' feeding a masked gather. define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, - ptr %output) local_unnamed_addr #0 { -; CHECK-LABEL: @preserve_vector_nuw_nsw( +; CHECK-LABEL: define void @preserve_vector_nuw_nsw +; CHECK-SAME: (ptr noalias nocapture readonly [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i64> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[INPUT:%.*]], <4 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP7]], i32 4, <4 x i1> [[TMP8]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i64> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[INPUT]], <4 x i64> [[TMP4]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> [[TMP2]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[I28:%.*]] = mul nuw nsw i64 [[I27]], 2 +; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i64 [[I28]] +; CHECK-NEXT: [[I30:%.*]] = load float, ptr [[I29]], align 4, !invariant.load !0 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; + ptr %output) local_unnamed_addr #0 { entry: br label %loop.header @@ -141,20 +254,61 @@ ; Drop poison-generating flags from vector 'sub' and 'gep' feeding a masked load. define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, - ptr %output, ptr noalias %ptrs) local_unnamed_addr #0 { -; CHECK-LABEL: @drop_vector_nuw_nsw( +; CHECK-LABEL: define void @drop_vector_nuw_nsw +; CHECK-SAME: (ptr noalias nocapture readonly [[INPUT:%.*]], ptr [[OUTPUT:%.*]], ptr noalias [[PTRS:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[PTRS:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[INPUT:%.*]], <4 x i64> [[TMP6]] -; CHECK: [[TMP10:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x ptr> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP12]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds ptr, ptr [[PTRS]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[INPUT]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i32 0 +; CHECK-NEXT: store <4 x ptr> [[TMP4]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP8]], i32 4, <4 x i1> [[TMP6]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds ptr, ptr [[PTRS]], i64 [[IV]] +; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i64 [[I27]] +; CHECK-NEXT: store ptr [[I29]], ptr [[GEP]], align 8 +; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I30:%.*]] = load float, ptr [[I29]], align 4, !invariant.load !0 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; + ptr %output, ptr noalias %ptrs) local_unnamed_addr #0 { entry: br label %loop.header @@ -186,18 +340,49 @@ ; Preserve poison-generating flags from 'sub', which is not contributing to any address computation ; of any masked load/store/gather/scatter. define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 { -; CHECK-LABEL: @preserve_nuw_nsw_no_addr( +; CHECK-LABEL: define void @preserve_nuw_nsw_no_addr +; CHECK-SAME: (ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 -; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[I27]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store i64 [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; entry: br label %loop.header @@ -224,21 +409,61 @@ ; Drop poison-generating flags from 'sdiv' and 'getelementptr' feeding a masked load. define void @drop_scalar_exact(ptr noalias nocapture readonly %input, - ptr %output) local_unnamed_addr #0 { -; CHECK-LABEL: @drop_scalar_exact( +; CHECK-LABEL: define void @drop_scalar_exact +; CHECK-SAME: (ptr noalias nocapture readonly [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP4:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = sdiv i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[INPUT:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP11]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i1> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = sdiv i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP8]], i32 4, <4 x i1> [[TMP5]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I7:%.*]] = icmp ne i64 [[IV]], 0 +; CHECK-NEXT: [[I8:%.*]] = and i64 [[IV]], 1 +; CHECK-NEXT: [[I9:%.*]] = icmp eq i64 [[I8]], 0 +; CHECK-NEXT: [[I10:%.*]] = and i1 [[I7]], [[I9]] +; CHECK-NEXT: br i1 [[I10]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I26:%.*]] = sdiv exact i64 [[IV]], 1 +; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i64 [[I26]] +; CHECK-NEXT: [[I30:%.*]] = load float, ptr [[I29]], align 4, !invariant.load !0 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; + ptr %output) local_unnamed_addr #0 { entry: br label %loop.header @@ -270,21 +495,60 @@ ; Preserve poison-generating flags from 'sdiv' and 'getelementptr' feeding a masked gather. define void @preserve_vector_exact_no_addr(ptr noalias nocapture readonly %input, - ptr %output) local_unnamed_addr #0 { -; CHECK-LABEL: @preserve_vector_exact_no_addr( +; CHECK-LABEL: define void @preserve_vector_exact_no_addr +; CHECK-SAME: (ptr noalias nocapture readonly [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP4:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[INPUT:%.*]], <4 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP9]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i1> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[INPUT]], <4 x i64> [[TMP6]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP7]], i32 4, <4 x i1> [[TMP5]], <4 x float> poison), !invariant.load !0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I7:%.*]] = icmp ne i64 [[IV]], 0 +; CHECK-NEXT: [[I8:%.*]] = and i64 [[IV]], 1 +; CHECK-NEXT: [[I9:%.*]] = icmp eq i64 [[I8]], 0 +; CHECK-NEXT: [[I10:%.*]] = and i1 [[I7]], [[I9]] +; CHECK-NEXT: br i1 [[I10]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I26:%.*]] = sdiv exact i64 [[IV]], 2 +; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i64 [[I26]] +; CHECK-NEXT: [[I30:%.*]] = load float, ptr [[I29]], align 4, !invariant.load !0 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void ; + ptr %output) local_unnamed_addr #0 { entry: br label %loop.header @@ -317,18 +581,49 @@ ; Preserve poison-generating flags from 'sdiv', which is not contributing to any address computation ; of any masked load/store/gather/scatter. define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 { -; CHECK-LABEL: @preserve_exact_no_addr( +; CHECK-LABEL: define void @preserve_exact_no_addr +; CHECK-SAME: (ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , {{.*}} ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 -; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I27:%.*]] = sdiv exact i64 [[IV]], 2 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[I34:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[I27]], [[IF_THEN]] ] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[IV]] +; CHECK-NEXT: store i64 [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; entry: br label %loop.header @@ -356,7 +651,8 @@ ; Make sure we don't vectorize a loop with a phi feeding a poison value to ; a masked load/gather. define void @dont_vectorize_poison_phi(ptr noalias nocapture readonly %input, -; CHECK-LABEL: @dont_vectorize_poison_phi( +; CHECK-LABEL: define void @dont_vectorize_poison_phi +; CHECK-SAME: (ptr noalias nocapture readonly [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: @@ -365,12 +661,12 @@ ; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0 ; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, ptr [[INPUT:%.*]], i64 [[POISON]] +; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, ptr [[INPUT]], i64 [[POISON]] ; CHECK-NEXT: [[I30:%.*]] = load float, ptr [[I29]], align 4, !invariant.load !0 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ] -; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT:%.*]], i64 [[IV]] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[IV]] ; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 ; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll --- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -78,9 +78,9 @@ ; SSE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0 ; SSE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 ; SSE-NEXT: [[TMP3:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], -; SSE-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]] -; SSE-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], -; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP4]], <2 x double> [[VEC_PHI]] +; SSE-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], +; SSE-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]] ; SSE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; SSE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 ; SSE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -108,7 +108,7 @@ ; SSE-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ] ; SSE-NEXT: [[I_NEXT]] = add i32 [[I]], 1 ; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 -; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]] +; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]] ; SSE: done: ; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]] @@ -126,9 +126,9 @@ ; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0 ; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8 ; AVX-NEXT: [[TMP3:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP4:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] -; AVX-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], -; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]] +; AVX-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; AVX-NEXT: [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]] ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; AVX-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 ; AVX-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -156,7 +156,7 @@ ; AVX-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ] ; AVX-NEXT: [[I_NEXT]] = add i32 [[I]], 1 ; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 -; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]] +; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX: done: ; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -42,22 +42,22 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP5]], -; CHECK-NEXT: [[TMP18:%.*]] = xor <4 x i1> [[TMP6]], -; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP12]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP12]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP12]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer @@ -203,22 +203,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4 -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP75]], align 4 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer @@ -368,46 +368,46 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP65:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3 -; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP75:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1 -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3 -; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 -; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 -; CHECK-NEXT: [[TMP96:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP97:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP98:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP99:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP71]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP79]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP69:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP70:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> poison, i32 [[TMP68]], i32 0 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP72]], i32 [[TMP69]], i32 1 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP70]], i32 2 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP74]], i32 [[TMP71]], i32 3 +; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP77:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP79:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i32> poison, i32 [[TMP76]], i32 0 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP77]], i32 1 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP81]], i32 [[TMP78]], i32 2 +; CHECK-NEXT: [[TMP83:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP79]], i32 3 +; CHECK-NEXT: [[TMP84:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i32> poison, i32 [[TMP84]], i32 0 +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP85]], i32 1 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP89]], i32 [[TMP86]], i32 2 +; CHECK-NEXT: [[TMP91:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP87]], i32 3 +; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP93:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP94:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP95:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> poison, i32 [[TMP92]], i32 0 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP93]], i32 1 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x i32> [[TMP97]], i32 [[TMP94]], i32 2 +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP95]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP75]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP99]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP100]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[TMP101]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] ; CHECK-NEXT: [[TMP102]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] @@ -548,158 +548,158 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> poison, i32 [[TMP70]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] +; CHECK-NEXT: [[TMP72:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP71]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 +; CHECK-NEXT: br i1 [[TMP73]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] ; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP72]], i32 [[TMP75]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] ; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] +; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP72]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF4]] ] +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 +; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] ; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP80]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] ; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; CHECK-NEXT: [[TMP82:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP81]], [[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 +; CHECK-NEXT: br i1 [[TMP83]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] ; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 +; CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP85:%.*]] = load i32, ptr [[TMP84]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP85]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] ; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK-NEXT: [[TMP87:%.*]] = phi <4 x i32> [ [[TMP82]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP86]], [[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 +; CHECK-NEXT: br i1 [[TMP88]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] ; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = insertelement <4 x i32> poison, i32 [[TMP90]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] ; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK-NEXT: [[TMP92:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP91]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 +; CHECK-NEXT: br i1 [[TMP93]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] ; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP94]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP95]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] ; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK-NEXT: [[TMP97:%.*]] = phi <4 x i32> [ [[TMP92]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP96]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 +; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] ; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP100:%.*]] = load i32, ptr [[TMP99]], align 4 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP97]], i32 [[TMP100]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] ; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK-NEXT: [[TMP102:%.*]] = phi <4 x i32> [ [[TMP97]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP101]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 +; CHECK-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] ; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP104]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP105]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] ; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK-NEXT: [[TMP107:%.*]] = phi <4 x i32> [ [[TMP102]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP106]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 +; CHECK-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] ; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 +; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP109]], align 4 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> poison, i32 [[TMP110]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] ; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP111]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: [[TMP113:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 +; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] ; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 +; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP114]], align 4 +; CHECK-NEXT: [[TMP116:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP115]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] ; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK-NEXT: [[TMP117:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP116]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: [[TMP118:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 +; CHECK-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] ; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP119]], align 4 +; CHECK-NEXT: [[TMP121:%.*]] = insertelement <4 x i32> [[TMP117]], i32 [[TMP120]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] ; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK-NEXT: [[TMP122:%.*]] = phi <4 x i32> [ [[TMP117]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP121]], [[PRED_LOAD_IF22]] ] +; CHECK-NEXT: [[TMP123:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 +; CHECK-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] ; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 +; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP124]], align 4 +; CHECK-NEXT: [[TMP126:%.*]] = insertelement <4 x i32> [[TMP122]], i32 [[TMP125]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] ; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK-NEXT: [[TMP127:%.*]] = phi <4 x i32> [ [[TMP122]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP126]], [[PRED_LOAD_IF24]] ] +; CHECK-NEXT: [[TMP128:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 +; CHECK-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] ; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 -; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 +; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP129]], align 4 +; CHECK-NEXT: [[TMP131:%.*]] = insertelement <4 x i32> poison, i32 [[TMP130]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] ; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK-NEXT: [[TMP132:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP131]], [[PRED_LOAD_IF26]] ] +; CHECK-NEXT: [[TMP133:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 +; CHECK-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] ; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP135:%.*]] = load i32, ptr [[TMP134]], align 4 +; CHECK-NEXT: [[TMP136:%.*]] = insertelement <4 x i32> [[TMP132]], i32 [[TMP135]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] ; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK-NEXT: [[TMP137:%.*]] = phi <4 x i32> [ [[TMP132]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP136]], [[PRED_LOAD_IF28]] ] +; CHECK-NEXT: [[TMP138:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 +; CHECK-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 +; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP140:%.*]] = load i32, ptr [[TMP139]], align 4 +; CHECK-NEXT: [[TMP141:%.*]] = insertelement <4 x i32> [[TMP137]], i32 [[TMP140]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK-NEXT: [[TMP142:%.*]] = phi <4 x i32> [ [[TMP137]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP141]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP143:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 +; CHECK-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 -; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 +; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP145:%.*]] = load i32, ptr [[TMP144]], align 4 +; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x i32> [[TMP142]], i32 [[TMP145]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[TMP144:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP145:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP146:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP147:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP142]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP146]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP107]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP127]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP148]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[TMP149]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] ; CHECK-NEXT: [[TMP150]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] @@ -847,22 +847,22 @@ ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3 -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison) -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP56]], -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP64]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP40]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP48]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP56]], +; CHECK-NEXT: [[TMP68:%.*]] = xor <4 x i1> [[TMP64]], +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP69]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP73]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison) +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP69]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP74]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison) +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP69]], i32 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP75]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison) +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP69]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1014,22 +1014,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4 -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP75]], align 4 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer @@ -1227,62 +1227,62 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4 -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 -; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 -; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 -; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 -; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 -; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 -; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 -; CHECK-NEXT: [[TMP112:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP113:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP114:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP115:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP111]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP71]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i32> poison, i32 [[TMP84]], i32 0 +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP85]], i32 1 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP89]], i32 [[TMP86]], i32 2 +; CHECK-NEXT: [[TMP91:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP87]], i32 3 +; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP72]], align 4 +; CHECK-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> poison, i32 [[TMP92]], i32 0 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP93]], i32 1 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x i32> [[TMP97]], i32 [[TMP94]], i32 2 +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP95]], i32 3 +; CHECK-NEXT: [[TMP100:%.*]] = load i32, ptr [[TMP76]], align 4 +; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP77]], align 4 +; CHECK-NEXT: [[TMP102:%.*]] = load i32, ptr [[TMP78]], align 4 +; CHECK-NEXT: [[TMP103:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> poison, i32 [[TMP100]], i32 0 +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x i32> [[TMP104]], i32 [[TMP101]], i32 1 +; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP102]], i32 2 +; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> [[TMP106]], i32 [[TMP103]], i32 3 +; CHECK-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP80]], align 4 +; CHECK-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP81]], align 4 +; CHECK-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP82]], align 4 +; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP83]], align 4 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> poison, i32 [[TMP108]], i32 0 +; CHECK-NEXT: [[TMP113:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP109]], i32 1 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP110]], i32 2 +; CHECK-NEXT: [[TMP115:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP111]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP99]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP107]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP115]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP116]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[TMP117]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] ; CHECK-NEXT: [[TMP118]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] @@ -1424,22 +1424,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP73]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP74]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1585,22 +1585,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP73]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP74]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1746,22 +1746,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP73]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP74]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1916,22 +1916,22 @@ ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3 -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP56]], -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP64]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP40]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP48]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP56]], +; CHECK-NEXT: [[TMP68:%.*]] = xor <4 x i1> [[TMP64]], +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP69]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP69]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP69]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP69]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP76]], align 4 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer @@ -2084,22 +2084,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP73]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP74]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -2246,22 +2246,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP73]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP74]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -2418,22 +2418,22 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP68]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP68]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP73]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[TMP68]], i32 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP74]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP68]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -2537,20 +2537,20 @@ ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3 -; CHECK-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP15]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP27]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP28]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 @@ -2636,20 +2636,20 @@ ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3 -; CHECK-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP15]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP27]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP28]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 @@ -2735,20 +2735,20 @@ ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3 -; CHECK-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP15]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP27]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP28]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20 @@ -2834,44 +2834,44 @@ ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP15]], i32 1 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP24]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP24]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP15]], i32 2 -; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP15]], i32 2 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP28]], i32 2 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP29]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP30:%.*]] = phi <4 x i32> [ [[TMP25]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP29]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP15]], i32 3 -; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP15]], i32 3 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP34]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP30]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP15]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP35]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP36]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 52 @@ -3025,62 +3025,62 @@ ; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i1> [[TMP76]], i1 [[TMP73]], i32 1 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i1> [[TMP77]], i1 [[TMP74]], i32 2 ; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i1> [[TMP78]], i1 [[TMP75]], i32 3 -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP25]] -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP26]] -; CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP27]] -; CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP28]] -; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP29]] -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP30]] -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP80]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP81]], align 4 -; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP82]], align 4 -; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP83]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 -; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP84]], align 4 -; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP85]], align 4 -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP86]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP87]], align 4 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 -; CHECK-NEXT: [[TMP112:%.*]] = load i32, ptr [[TMP88]], align 4 -; CHECK-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP89]], align 4 -; CHECK-NEXT: [[TMP114:%.*]] = load i32, ptr [[TMP90]], align 4 -; CHECK-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP91]], align 4 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <4 x i32> poison, i32 [[TMP112]], i32 0 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP116]], i32 [[TMP113]], i32 1 -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP117]], i32 [[TMP114]], i32 2 -; CHECK-NEXT: [[TMP119:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP115]], i32 3 -; CHECK-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP92]], align 4 -; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP93]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP94]], align 4 -; CHECK-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP95]], align 4 -; CHECK-NEXT: [[TMP124:%.*]] = insertelement <4 x i32> poison, i32 [[TMP120]], i32 0 -; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> [[TMP124]], i32 [[TMP121]], i32 1 -; CHECK-NEXT: [[TMP126:%.*]] = insertelement <4 x i32> [[TMP125]], i32 [[TMP122]], i32 2 -; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP123]], i32 3 -; CHECK-NEXT: [[TMP128:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP129:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[TMP130:%.*]] = xor <4 x i1> [[TMP71]], -; CHECK-NEXT: [[TMP131:%.*]] = xor <4 x i1> [[TMP79]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP111]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP71]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP79]], <4 x i32> [[TMP127]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP81:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP82:%.*]] = xor <4 x i1> [[TMP71]], +; CHECK-NEXT: [[TMP83:%.*]] = xor <4 x i1> [[TMP79]], +; CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP26]] +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP100:%.*]] = load i32, ptr [[TMP84]], align 4 +; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP85]], align 4 +; CHECK-NEXT: [[TMP102:%.*]] = load i32, ptr [[TMP86]], align 4 +; CHECK-NEXT: [[TMP103:%.*]] = load i32, ptr [[TMP87]], align 4 +; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> poison, i32 [[TMP100]], i32 0 +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x i32> [[TMP104]], i32 [[TMP101]], i32 1 +; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP102]], i32 2 +; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> [[TMP106]], i32 [[TMP103]], i32 3 +; CHECK-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP88]], align 4 +; CHECK-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP89]], align 4 +; CHECK-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP90]], align 4 +; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP91]], align 4 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> poison, i32 [[TMP108]], i32 0 +; CHECK-NEXT: [[TMP113:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP109]], i32 1 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP110]], i32 2 +; CHECK-NEXT: [[TMP115:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP111]], i32 3 +; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP92]], align 4 +; CHECK-NEXT: [[TMP117:%.*]] = load i32, ptr [[TMP93]], align 4 +; CHECK-NEXT: [[TMP118:%.*]] = load i32, ptr [[TMP94]], align 4 +; CHECK-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP95]], align 4 +; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> poison, i32 [[TMP116]], i32 0 +; CHECK-NEXT: [[TMP121:%.*]] = insertelement <4 x i32> [[TMP120]], i32 [[TMP117]], i32 1 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP121]], i32 [[TMP118]], i32 2 +; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP122]], i32 [[TMP119]], i32 3 +; CHECK-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP96]], align 4 +; CHECK-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP97]], align 4 +; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP98]], align 4 +; CHECK-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP99]], align 4 +; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> poison, i32 [[TMP124]], i32 0 +; CHECK-NEXT: [[TMP129:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP125]], i32 1 +; CHECK-NEXT: [[TMP130:%.*]] = insertelement <4 x i32> [[TMP129]], i32 [[TMP126]], i32 2 +; CHECK-NEXT: [[TMP131:%.*]] = insertelement <4 x i32> [[TMP130]], i32 [[TMP127]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP107]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP115]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP71]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP79]], <4 x i32> [[TMP131]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP132]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[TMP133]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] ; CHECK-NEXT: [[TMP134]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -963,14 +963,14 @@ ; AVX-NEXT: entry: ; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX: vector.memcheck: -; AVX-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 -; AVX-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 -; AVX-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 -; AVX-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] -; AVX-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] -; AVX-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; AVX-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1064,14 +1064,14 @@ ; AVX512-NEXT: entry: ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 -; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 -; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] -; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1230,14 +1230,14 @@ ; AVX512-NEXT: entry: ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 -; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 -; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752 -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 +; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 +; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] -; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1358,14 +1358,14 @@ ; AVX2-NEXT: entry: ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 -; AVX2-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 -; AVX2-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 +; AVX2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 +; AVX2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]] -; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]] +; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]] +; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]] ; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1483,14 +1483,14 @@ ; AVX512-NEXT: entry: ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 -; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 -; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 +; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 +; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]] -; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1678,46 +1678,46 @@ ; AVX1-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX1-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX1-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX1-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX1-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX1-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX1-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX1-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX1-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX1-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX1-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX1-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX1-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX1-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX1-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX1-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -1790,46 +1790,46 @@ ; AVX2-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX2-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX2-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX2-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX2-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX2-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX2-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX2-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX2-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX2-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX2-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX2-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX2-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX2-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX2-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX2-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] @@ -1902,46 +1902,46 @@ ; AVX512-NEXT: [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer ; AVX512-NEXT: [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer ; AVX512-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], -; AVX512-NEXT: [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], -; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], -; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16 -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP16]], +; AVX512-NEXT: [[TMP21:%.*]] = xor <8 x i1> [[TMP17]], +; AVX512-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP18]], +; AVX512-NEXT: [[TMP23:%.*]] = xor <8 x i1> [[TMP19]], +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP20]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP21]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP22]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP23]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], -; AVX512-NEXT: [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], -; AVX512-NEXT: [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], -; AVX512-NEXT: [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], -; AVX512-NEXT: [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]]) -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]]) +; AVX512-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP32]], +; AVX512-NEXT: [[TMP37:%.*]] = xor <8 x i1> [[TMP33]], +; AVX512-NEXT: [[TMP38:%.*]] = xor <8 x i1> [[TMP34]], +; AVX512-NEXT: [[TMP39:%.*]] = xor <8 x i1> [[TMP35]], +; AVX512-NEXT: [[TMP40:%.*]] = select <8 x i1> [[TMP20]], <8 x i1> [[TMP36]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP41:%.*]] = select <8 x i1> [[TMP21]], <8 x i1> [[TMP37]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = select <8 x i1> [[TMP22]], <8 x i1> [[TMP38]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP43:%.*]] = select <8 x i1> [[TMP23]], <8 x i1> [[TMP39]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP40]]) +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 16 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP42]]) +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 24 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP43]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] @@ -2059,46 +2059,46 @@ ; AVX1-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX1-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX1-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX1-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX1-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX1-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX1-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX1-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX1-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX1-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX1-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX1-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX1-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX1-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX1-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX1-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -2171,46 +2171,46 @@ ; AVX2-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX2-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX2-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX2-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX2-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX2-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX2-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX2-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX2-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX2-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX2-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX2-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX2-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX2-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX2-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX2-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] @@ -2283,46 +2283,46 @@ ; AVX512-NEXT: [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer ; AVX512-NEXT: [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer ; AVX512-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], -; AVX512-NEXT: [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], -; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], -; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16 -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP16]], +; AVX512-NEXT: [[TMP21:%.*]] = xor <8 x i1> [[TMP17]], +; AVX512-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP18]], +; AVX512-NEXT: [[TMP23:%.*]] = xor <8 x i1> [[TMP19]], +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP20]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP21]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP22]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP23]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], -; AVX512-NEXT: [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], -; AVX512-NEXT: [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], -; AVX512-NEXT: [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], -; AVX512-NEXT: [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]]) -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]]) +; AVX512-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP32]], +; AVX512-NEXT: [[TMP37:%.*]] = xor <8 x i1> [[TMP33]], +; AVX512-NEXT: [[TMP38:%.*]] = xor <8 x i1> [[TMP34]], +; AVX512-NEXT: [[TMP39:%.*]] = xor <8 x i1> [[TMP35]], +; AVX512-NEXT: [[TMP40:%.*]] = select <8 x i1> [[TMP20]], <8 x i1> [[TMP36]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP41:%.*]] = select <8 x i1> [[TMP21]], <8 x i1> [[TMP37]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = select <8 x i1> [[TMP22]], <8 x i1> [[TMP38]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP43:%.*]] = select <8 x i1> [[TMP23]], <8 x i1> [[TMP39]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP40]]) +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 16 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP42]]) +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 24 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP43]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -401,82 +401,82 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, ptr [[TMP6]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP7]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.load.continue: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if3: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; DISABLED_MASKED_STRIDED: pred.load.continue4: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if5: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP2]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; DISABLED_MASKED_STRIDED: pred.load.continue6: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP2]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if7: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; DISABLED_MASKED_STRIDED: pred.load.continue8: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP2]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if9: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; DISABLED_MASKED_STRIDED: pred.load.continue10: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP2]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if11: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; DISABLED_MASKED_STRIDED: pred.load.continue12: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP2]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if13: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; DISABLED_MASKED_STRIDED: pred.load.continue14: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP2]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] ; DISABLED_MASKED_STRIDED: pred.load.if15: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 @@ -484,7 +484,7 @@ ; DISABLED_MASKED_STRIDED: pred.load.continue16: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr [[TMP52]], i32 1, <8 x i1> [[TMP3]]) +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr [[TMP52]], i32 1, <8 x i1> [[TMP2]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], ; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -511,15 +511,15 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <16 x i32> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[TMP5]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP4]], i32 1, <16 x i1> [[TMP5]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP6]], i32 1, <8 x i1> [[TMP4]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP6]], i32 1, <8 x i1> [[TMP2]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -605,82 +605,82 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, ptr [[TMP6]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP7]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.load.continue: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if3: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; DISABLED_MASKED_STRIDED: pred.load.continue4: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if5: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP2]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; DISABLED_MASKED_STRIDED: pred.load.continue6: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP2]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if7: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; DISABLED_MASKED_STRIDED: pred.load.continue8: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP2]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if9: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; DISABLED_MASKED_STRIDED: pred.load.continue10: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP2]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if11: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; DISABLED_MASKED_STRIDED: pred.load.continue12: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP2]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if13: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; DISABLED_MASKED_STRIDED: pred.load.continue14: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP2]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] ; DISABLED_MASKED_STRIDED: pred.load.if15: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 @@ -688,7 +688,7 @@ ; DISABLED_MASKED_STRIDED: pred.load.continue16: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr [[TMP52]], i32 1, <8 x i1> [[TMP3]]) +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr [[TMP52]], i32 1, <8 x i1> [[TMP2]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], ; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -715,15 +715,15 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul i32 [[INDEX]], 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <24 x i32> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr [[TMP3]], i32 1, <24 x i1> [[TMP5]], <24 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr [[TMP4]], i32 1, <24 x i1> [[TMP5]], <24 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP6]], i32 1, <8 x i1> [[TMP4]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP6]], i32 1, <8 x i1> [[TMP2]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -2214,90 +2214,90 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE62]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, ptr [[TMP6]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP7]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.load.continue: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if3: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; DISABLED_MASKED_STRIDED: pred.load.continue4: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if5: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP2]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; DISABLED_MASKED_STRIDED: pred.load.continue6: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP2]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if7: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; DISABLED_MASKED_STRIDED: pred.load.continue8: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP2]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if9: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; DISABLED_MASKED_STRIDED: pred.load.continue10: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP2]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if11: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; DISABLED_MASKED_STRIDED: pred.load.continue12: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP2]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if13: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; DISABLED_MASKED_STRIDED: pred.load.continue14: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP2]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if15: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; DISABLED_MASKED_STRIDED: pred.load.continue16: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = or <8 x i32> [[TMP2]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = or <8 x i32> [[TMP3]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = extractelement <8 x i1> [[TMP2]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP53]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if17: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = extractelement <8 x i32> [[TMP52]], i64 0 @@ -2307,7 +2307,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; DISABLED_MASKED_STRIDED: pred.load.continue18: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP58:%.*]] = phi <8 x i8> [ poison, [[PRED_LOAD_CONTINUE16]] ], [ [[TMP57]], [[PRED_LOAD_IF17]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = extractelement <8 x i1> [[TMP2]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if19: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP60:%.*]] = extractelement <8 x i32> [[TMP52]], i64 1 @@ -2317,7 +2317,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; DISABLED_MASKED_STRIDED: pred.load.continue20: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP64:%.*]] = phi <8 x i8> [ [[TMP58]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP63]], [[PRED_LOAD_IF19]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if21: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP66:%.*]] = extractelement <8 x i32> [[TMP52]], i64 2 @@ -2327,7 +2327,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; DISABLED_MASKED_STRIDED: pred.load.continue22: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP70:%.*]] = phi <8 x i8> [ [[TMP64]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP69]], [[PRED_LOAD_IF21]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = extractelement <8 x i1> [[TMP2]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if23: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP72:%.*]] = extractelement <8 x i32> [[TMP52]], i64 3 @@ -2337,7 +2337,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; DISABLED_MASKED_STRIDED: pred.load.continue24: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP76:%.*]] = phi <8 x i8> [ [[TMP70]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP75]], [[PRED_LOAD_IF23]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP77:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP77:%.*]] = extractelement <8 x i1> [[TMP2]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if25: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP78:%.*]] = extractelement <8 x i32> [[TMP52]], i64 4 @@ -2347,7 +2347,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; DISABLED_MASKED_STRIDED: pred.load.continue26: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP82:%.*]] = phi <8 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP81]], [[PRED_LOAD_IF25]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP83:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP83:%.*]] = extractelement <8 x i1> [[TMP2]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP83]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if27: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP84:%.*]] = extractelement <8 x i32> [[TMP52]], i64 5 @@ -2357,7 +2357,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; DISABLED_MASKED_STRIDED: pred.load.continue28: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP88:%.*]] = phi <8 x i8> [ [[TMP82]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP87]], [[PRED_LOAD_IF27]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP89:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP89:%.*]] = extractelement <8 x i1> [[TMP2]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if29: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP90:%.*]] = extractelement <8 x i32> [[TMP52]], i64 6 @@ -2367,7 +2367,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE30]] ; DISABLED_MASKED_STRIDED: pred.load.continue30: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP94:%.*]] = phi <8 x i8> [ [[TMP88]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP93]], [[PRED_LOAD_IF29]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP95:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP95:%.*]] = extractelement <8 x i1> [[TMP2]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP95]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if31: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP96:%.*]] = extractelement <8 x i32> [[TMP52]], i64 7 @@ -2378,80 +2378,80 @@ ; DISABLED_MASKED_STRIDED: pred.load.continue32: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP100:%.*]] = phi <8 x i8> [ [[TMP94]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP99]], [[PRED_LOAD_IF31]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP101:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP51]], <8 x i8> [[TMP100]]) -; DISABLED_MASKED_STRIDED-NEXT: [[TMP102:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP102:%.*]] = extractelement <8 x i1> [[TMP2]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP103:%.*]] = extractelement <8 x i32> [[TMP2]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP103:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP103]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = extractelement <8 x i8> [[TMP101]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP105]], ptr [[TMP104]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.store.continue: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i1> [[TMP2]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP106]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if33: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = getelementptr inbounds i8, ptr [[Q]], i32 [[TMP107]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i8> [[TMP101]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP109]], ptr [[TMP108]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE34]] ; DISABLED_MASKED_STRIDED: pred.store.continue34: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP110]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if35: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = extractelement <8 x i32> [[TMP2]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[Q]], i32 [[TMP111]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i8> [[TMP101]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP113]], ptr [[TMP112]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE36]] ; DISABLED_MASKED_STRIDED: pred.store.continue36: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i1> [[TMP2]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP114]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if37: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[Q]], i32 [[TMP115]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i8> [[TMP101]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP117]], ptr [[TMP116]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE38]] ; DISABLED_MASKED_STRIDED: pred.store.continue38: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i1> [[TMP2]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP118]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if39: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = getelementptr inbounds i8, ptr [[Q]], i32 [[TMP119]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i8> [[TMP101]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP121]], ptr [[TMP120]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE40]] ; DISABLED_MASKED_STRIDED: pred.store.continue40: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i1> [[TMP2]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP122]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if41: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[Q]], i32 [[TMP123]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i8> [[TMP101]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP125]], ptr [[TMP124]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE42]] ; DISABLED_MASKED_STRIDED: pred.store.continue42: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i1> [[TMP2]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP126]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if43: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = getelementptr inbounds i8, ptr [[Q]], i32 [[TMP127]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i8> [[TMP101]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP129]], ptr [[TMP128]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] ; DISABLED_MASKED_STRIDED: pred.store.continue44: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i1> [[TMP2]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP130]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if45: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[Q]], i32 [[TMP131]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i8> [[TMP101]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP133]], ptr [[TMP132]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] ; DISABLED_MASKED_STRIDED: pred.store.continue46: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = sub <8 x i8> zeroinitializer, [[TMP101]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i1> [[TMP2]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP135]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if47: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i32> [[TMP52]], i64 0 @@ -2460,7 +2460,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP138]], ptr [[TMP137]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE48]] ; DISABLED_MASKED_STRIDED: pred.store.continue48: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i1> [[TMP2]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP139]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if49: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = extractelement <8 x i32> [[TMP52]], i64 1 @@ -2469,7 +2469,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP142]], ptr [[TMP141]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE50]] ; DISABLED_MASKED_STRIDED: pred.store.continue50: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP143]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if51: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i32> [[TMP52]], i64 2 @@ -2478,7 +2478,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP146]], ptr [[TMP145]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE52]] ; DISABLED_MASKED_STRIDED: pred.store.continue52: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i1> [[TMP2]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP147]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if53: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = extractelement <8 x i32> [[TMP52]], i64 3 @@ -2487,7 +2487,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP150]], ptr [[TMP149]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE54]] ; DISABLED_MASKED_STRIDED: pred.store.continue54: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i1> [[TMP2]], i64 4 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP151]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if55: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i32> [[TMP52]], i64 4 @@ -2496,7 +2496,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP154]], ptr [[TMP153]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE56]] ; DISABLED_MASKED_STRIDED: pred.store.continue56: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i1> [[TMP2]], i64 5 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP155]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if57: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = extractelement <8 x i32> [[TMP52]], i64 5 @@ -2505,7 +2505,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP158]], ptr [[TMP157]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE58]] ; DISABLED_MASKED_STRIDED: pred.store.continue58: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i1> [[TMP2]], i64 6 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP159]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if59: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i32> [[TMP52]], i64 6 @@ -2514,7 +2514,7 @@ ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP162]], ptr [[TMP161]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] ; DISABLED_MASKED_STRIDED: pred.store.continue60: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i1> [[TMP2]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP163]], label [[PRED_STORE_IF61:%.*]], label [[PRED_STORE_CONTINUE62]] ; DISABLED_MASKED_STRIDED: pred.store.if61: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = extractelement <8 x i32> [[TMP52]], i64 7 @@ -2548,14 +2548,14 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP4]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = or i32 [[TMP2]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]]) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP5]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -22,30 +22,30 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP4]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP5]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; CHECK: pred.sdiv.if: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = sdiv i32 [[TMP6]], [[X:%.*]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i32 [[TMP7]], [[X:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]] ; CHECK: pred.sdiv.continue: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SDIV_IF]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2]] +; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2]] ; CHECK: pred.sdiv.if1: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[X]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = sdiv i32 [[TMP12]], [[X]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE2]] ; CHECK: pred.sdiv.continue2: -; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ] -; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i32> [[TMP14]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_SDIV_IF1]] ] +; CHECK-NEXT: [[TMP16:%.*]] = add nsw <2 x i32> [[TMP15]], [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP17]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -71,7 +71,7 @@ ; CHECK-NEXT: [[T7]] = add i32 [[R]], [[T6]] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000 -; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T8]] @@ -90,84 +90,84 @@ ; SINK-GATHER-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] ; SINK-GATHER-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; SINK-GATHER-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 -; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] -; SINK-GATHER-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 -; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison) -; SINK-GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0 -; SINK-GATHER-NEXT: br i1 [[TMP5]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; SINK-GATHER-NEXT: [[TMP3:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], +; SINK-GATHER-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] +; SINK-GATHER-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 +; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison) +; SINK-GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0 +; SINK-GATHER-NEXT: br i1 [[TMP6]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; SINK-GATHER: pred.sdiv.if: -; SINK-GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0 -; SINK-GATHER-NEXT: [[TMP7:%.*]] = sdiv i32 [[TMP6]], [[X:%.*]] -; SINK-GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i32 0 +; SINK-GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0 +; SINK-GATHER-NEXT: [[TMP8:%.*]] = sdiv i32 [[TMP7]], [[X:%.*]] +; SINK-GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[TMP8]], i32 0 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE]] ; SINK-GATHER: pred.sdiv.continue: -; SINK-GATHER-NEXT: [[TMP9:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SDIV_IF]] ] -; SINK-GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1 -; SINK-GATHER-NEXT: br i1 [[TMP10]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]] +; SINK-GATHER-NEXT: [[TMP10:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_SDIV_IF]] ] +; SINK-GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1 +; SINK-GATHER-NEXT: br i1 [[TMP11]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]] ; SINK-GATHER: pred.sdiv.if1: -; SINK-GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 1 -; SINK-GATHER-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[X]] -; SINK-GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP12]], i32 1 +; SINK-GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 1 +; SINK-GATHER-NEXT: [[TMP13:%.*]] = sdiv i32 [[TMP12]], [[X]] +; SINK-GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP13]], i32 1 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE2]] ; SINK-GATHER: pred.sdiv.continue2: -; SINK-GATHER-NEXT: [[TMP14:%.*]] = phi <8 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ] -; SINK-GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2 -; SINK-GATHER-NEXT: br i1 [[TMP15]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]] +; SINK-GATHER-NEXT: [[TMP15:%.*]] = phi <8 x i32> [ [[TMP10]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_SDIV_IF1]] ] +; SINK-GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2 +; SINK-GATHER-NEXT: br i1 [[TMP16]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]] ; SINK-GATHER: pred.sdiv.if3: -; SINK-GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 2 -; SINK-GATHER-NEXT: [[TMP17:%.*]] = sdiv i32 [[TMP16]], [[X]] -; SINK-GATHER-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP17]], i32 2 +; SINK-GATHER-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 2 +; SINK-GATHER-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP17]], [[X]] +; SINK-GATHER-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP18]], i32 2 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE4]] ; SINK-GATHER: pred.sdiv.continue4: -; SINK-GATHER-NEXT: [[TMP19:%.*]] = phi <8 x i32> [ [[TMP14]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP18]], [[PRED_SDIV_IF3]] ] -; SINK-GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3 -; SINK-GATHER-NEXT: br i1 [[TMP20]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]] +; SINK-GATHER-NEXT: [[TMP20:%.*]] = phi <8 x i32> [ [[TMP15]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP19]], [[PRED_SDIV_IF3]] ] +; SINK-GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3 +; SINK-GATHER-NEXT: br i1 [[TMP21]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]] ; SINK-GATHER: pred.sdiv.if5: -; SINK-GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 3 -; SINK-GATHER-NEXT: [[TMP22:%.*]] = sdiv i32 [[TMP21]], [[X]] -; SINK-GATHER-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP22]], i32 3 +; SINK-GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 3 +; SINK-GATHER-NEXT: [[TMP23:%.*]] = sdiv i32 [[TMP22]], [[X]] +; SINK-GATHER-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP23]], i32 3 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE6]] ; SINK-GATHER: pred.sdiv.continue6: -; SINK-GATHER-NEXT: [[TMP24:%.*]] = phi <8 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP23]], [[PRED_SDIV_IF5]] ] -; SINK-GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4 -; SINK-GATHER-NEXT: br i1 [[TMP25]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]] +; SINK-GATHER-NEXT: [[TMP25:%.*]] = phi <8 x i32> [ [[TMP20]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP24]], [[PRED_SDIV_IF5]] ] +; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4 +; SINK-GATHER-NEXT: br i1 [[TMP26]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]] ; SINK-GATHER: pred.sdiv.if7: -; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 4 -; SINK-GATHER-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP26]], [[X]] -; SINK-GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP27]], i32 4 +; SINK-GATHER-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 4 +; SINK-GATHER-NEXT: [[TMP28:%.*]] = sdiv i32 [[TMP27]], [[X]] +; SINK-GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[TMP28]], i32 4 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE8]] ; SINK-GATHER: pred.sdiv.continue8: -; SINK-GATHER-NEXT: [[TMP29:%.*]] = phi <8 x i32> [ [[TMP24]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP28]], [[PRED_SDIV_IF7]] ] -; SINK-GATHER-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5 -; SINK-GATHER-NEXT: br i1 [[TMP30]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]] +; SINK-GATHER-NEXT: [[TMP30:%.*]] = phi <8 x i32> [ [[TMP25]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP29]], [[PRED_SDIV_IF7]] ] +; SINK-GATHER-NEXT: [[TMP31:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5 +; SINK-GATHER-NEXT: br i1 [[TMP31]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]] ; SINK-GATHER: pred.sdiv.if9: -; SINK-GATHER-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 5 -; SINK-GATHER-NEXT: [[TMP32:%.*]] = sdiv i32 [[TMP31]], [[X]] -; SINK-GATHER-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP32]], i32 5 +; SINK-GATHER-NEXT: [[TMP32:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 5 +; SINK-GATHER-NEXT: [[TMP33:%.*]] = sdiv i32 [[TMP32]], [[X]] +; SINK-GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP33]], i32 5 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE10]] ; SINK-GATHER: pred.sdiv.continue10: -; SINK-GATHER-NEXT: [[TMP34:%.*]] = phi <8 x i32> [ [[TMP29]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP33]], [[PRED_SDIV_IF9]] ] -; SINK-GATHER-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6 -; SINK-GATHER-NEXT: br i1 [[TMP35]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]] +; SINK-GATHER-NEXT: [[TMP35:%.*]] = phi <8 x i32> [ [[TMP30]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP34]], [[PRED_SDIV_IF9]] ] +; SINK-GATHER-NEXT: [[TMP36:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6 +; SINK-GATHER-NEXT: br i1 [[TMP36]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]] ; SINK-GATHER: pred.sdiv.if11: -; SINK-GATHER-NEXT: [[TMP36:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 6 -; SINK-GATHER-NEXT: [[TMP37:%.*]] = sdiv i32 [[TMP36]], [[X]] -; SINK-GATHER-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP34]], i32 [[TMP37]], i32 6 +; SINK-GATHER-NEXT: [[TMP37:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 6 +; SINK-GATHER-NEXT: [[TMP38:%.*]] = sdiv i32 [[TMP37]], [[X]] +; SINK-GATHER-NEXT: [[TMP39:%.*]] = insertelement <8 x i32> [[TMP35]], i32 [[TMP38]], i32 6 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE12]] ; SINK-GATHER: pred.sdiv.continue12: -; SINK-GATHER-NEXT: [[TMP39:%.*]] = phi <8 x i32> [ [[TMP34]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP38]], [[PRED_SDIV_IF11]] ] -; SINK-GATHER-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7 -; SINK-GATHER-NEXT: br i1 [[TMP40]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]] +; SINK-GATHER-NEXT: [[TMP40:%.*]] = phi <8 x i32> [ [[TMP35]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP39]], [[PRED_SDIV_IF11]] ] +; SINK-GATHER-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7 +; SINK-GATHER-NEXT: br i1 [[TMP41]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]] ; SINK-GATHER: pred.sdiv.if13: -; SINK-GATHER-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 7 -; SINK-GATHER-NEXT: [[TMP42:%.*]] = sdiv i32 [[TMP41]], [[X]] -; SINK-GATHER-NEXT: [[TMP43:%.*]] = insertelement <8 x i32> [[TMP39]], i32 [[TMP42]], i32 7 +; SINK-GATHER-NEXT: [[TMP42:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 7 +; SINK-GATHER-NEXT: [[TMP43:%.*]] = sdiv i32 [[TMP42]], [[X]] +; SINK-GATHER-NEXT: [[TMP44:%.*]] = insertelement <8 x i32> [[TMP40]], i32 [[TMP43]], i32 7 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE14]] ; SINK-GATHER: pred.sdiv.continue14: -; SINK-GATHER-NEXT: [[TMP44:%.*]] = phi <8 x i32> [ [[TMP39]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP43]], [[PRED_SDIV_IF13]] ] -; SINK-GATHER-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[TMP44]], [[WIDE_LOAD]] -; SINK-GATHER-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], -; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP45]], <8 x i32> [[WIDE_LOAD]] +; SINK-GATHER-NEXT: [[TMP45:%.*]] = phi <8 x i32> [ [[TMP40]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP44]], [[PRED_SDIV_IF13]] ] +; SINK-GATHER-NEXT: [[TMP46:%.*]] = add nsw <8 x i32> [[TMP45]], [[WIDE_LOAD]] +; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP46]], <8 x i32> [[WIDE_LOAD]] ; SINK-GATHER-NEXT: [[TMP47]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SINK-GATHER-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -197,7 +197,7 @@ ; SINK-GATHER-NEXT: [[T7]] = add i32 [[R]], [[T6]] ; SINK-GATHER-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; SINK-GATHER-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000 -; SINK-GATHER-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; SINK-GATHER-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SINK-GATHER: for.end: ; SINK-GATHER-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] ; SINK-GATHER-NEXT: ret i32 [[T8]] @@ -253,32 +253,32 @@ ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = mul <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[X]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[X]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] ; CHECK: pred.udiv.continue: -; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.if1: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[X]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.continue2: -; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ] -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF1]] ] -; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP16:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], @@ -330,104 +330,104 @@ ; SINK-GATHER-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE14]] ] ; SINK-GATHER-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP66:%.*]], [[PRED_UDIV_CONTINUE14]] ] ; SINK-GATHER-NEXT: [[TMP0:%.*]] = mul <8 x i64> [[VEC_IND]], -; SINK-GATHER-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0 -; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; SINK-GATHER-NEXT: [[TMP1:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], +; SINK-GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0 +; SINK-GATHER-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; SINK-GATHER: pred.udiv.if: -; SINK-GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 -; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] -; SINK-GATHER-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -; SINK-GATHER-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[X]] -; SINK-GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i32 0 +; SINK-GATHER-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 +; SINK-GATHER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] +; SINK-GATHER-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; SINK-GATHER-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[X]] +; SINK-GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE]] ; SINK-GATHER: pred.udiv.continue: -; SINK-GATHER-NEXT: [[TMP7:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ] -; SINK-GATHER-NEXT: [[TMP8:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] -; SINK-GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1 -; SINK-GATHER-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]] +; SINK-GATHER-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] +; SINK-GATHER-NEXT: [[TMP9:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] +; SINK-GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1 +; SINK-GATHER-NEXT: br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]] ; SINK-GATHER: pred.udiv.if1: -; SINK-GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 -; SINK-GATHER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; SINK-GATHER-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 -; SINK-GATHER-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] -; SINK-GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; SINK-GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 +; SINK-GATHER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; SINK-GATHER-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; SINK-GATHER-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[X]] +; SINK-GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP14]], i32 1 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; SINK-GATHER: pred.udiv.continue2: -; SINK-GATHER-NEXT: [[TMP15:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ] -; SINK-GATHER-NEXT: [[TMP16:%.*]] = phi <8 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF1]] ] -; SINK-GATHER-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2 -; SINK-GATHER-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] +; SINK-GATHER-NEXT: [[TMP16:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF1]] ] +; SINK-GATHER-NEXT: [[TMP17:%.*]] = phi <8 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ] +; SINK-GATHER-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2 +; SINK-GATHER-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] ; SINK-GATHER: pred.udiv.if3: -; SINK-GATHER-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 -; SINK-GATHER-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] -; SINK-GATHER-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 -; SINK-GATHER-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[X]] -; SINK-GATHER-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP21]], i32 2 +; SINK-GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 +; SINK-GATHER-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] +; SINK-GATHER-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; SINK-GATHER-NEXT: [[TMP22:%.*]] = udiv i32 [[TMP21]], [[X]] +; SINK-GATHER-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP22]], i32 2 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; SINK-GATHER: pred.udiv.continue4: -; SINK-GATHER-NEXT: [[TMP23:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE2]] ], [ [[TMP20]], [[PRED_UDIV_IF3]] ] -; SINK-GATHER-NEXT: [[TMP24:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE2]] ], [ [[TMP22]], [[PRED_UDIV_IF3]] ] -; SINK-GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3 -; SINK-GATHER-NEXT: br i1 [[TMP25]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; SINK-GATHER-NEXT: [[TMP24:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE2]] ], [ [[TMP21]], [[PRED_UDIV_IF3]] ] +; SINK-GATHER-NEXT: [[TMP25:%.*]] = phi <8 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE2]] ], [ [[TMP23]], [[PRED_UDIV_IF3]] ] +; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3 +; SINK-GATHER-NEXT: br i1 [[TMP26]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] ; SINK-GATHER: pred.udiv.if5: -; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 -; SINK-GATHER-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]] -; SINK-GATHER-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 -; SINK-GATHER-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[X]] -; SINK-GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP29]], i32 3 +; SINK-GATHER-NEXT: [[TMP27:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 +; SINK-GATHER-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP27]] +; SINK-GATHER-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; SINK-GATHER-NEXT: [[TMP30:%.*]] = udiv i32 [[TMP29]], [[X]] +; SINK-GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[TMP30]], i32 3 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE6]] ; SINK-GATHER: pred.udiv.continue6: -; SINK-GATHER-NEXT: [[TMP31:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP28]], [[PRED_UDIV_IF5]] ] -; SINK-GATHER-NEXT: [[TMP32:%.*]] = phi <8 x i32> [ [[TMP24]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP30]], [[PRED_UDIV_IF5]] ] -; SINK-GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4 -; SINK-GATHER-NEXT: br i1 [[TMP33]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] +; SINK-GATHER-NEXT: [[TMP32:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP29]], [[PRED_UDIV_IF5]] ] +; SINK-GATHER-NEXT: [[TMP33:%.*]] = phi <8 x i32> [ [[TMP25]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP31]], [[PRED_UDIV_IF5]] ] +; SINK-GATHER-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4 +; SINK-GATHER-NEXT: br i1 [[TMP34]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] ; SINK-GATHER: pred.udiv.if7: -; SINK-GATHER-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 -; SINK-GATHER-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]] -; SINK-GATHER-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 -; SINK-GATHER-NEXT: [[TMP37:%.*]] = udiv i32 [[TMP36]], [[X]] -; SINK-GATHER-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP37]], i32 4 +; SINK-GATHER-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 +; SINK-GATHER-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP35]] +; SINK-GATHER-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 +; SINK-GATHER-NEXT: [[TMP38:%.*]] = udiv i32 [[TMP37]], [[X]] +; SINK-GATHER-NEXT: [[TMP39:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP38]], i32 4 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE8]] ; SINK-GATHER: pred.udiv.continue8: -; SINK-GATHER-NEXT: [[TMP39:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE6]] ], [ [[TMP36]], [[PRED_UDIV_IF7]] ] -; SINK-GATHER-NEXT: [[TMP40:%.*]] = phi <8 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP38]], [[PRED_UDIV_IF7]] ] -; SINK-GATHER-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5 -; SINK-GATHER-NEXT: br i1 [[TMP41]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] +; SINK-GATHER-NEXT: [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE6]] ], [ [[TMP37]], [[PRED_UDIV_IF7]] ] +; SINK-GATHER-NEXT: [[TMP41:%.*]] = phi <8 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP39]], [[PRED_UDIV_IF7]] ] +; SINK-GATHER-NEXT: [[TMP42:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5 +; SINK-GATHER-NEXT: br i1 [[TMP42]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] ; SINK-GATHER: pred.udiv.if9: -; SINK-GATHER-NEXT: [[TMP42:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 -; SINK-GATHER-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]] -; SINK-GATHER-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 -; SINK-GATHER-NEXT: [[TMP45:%.*]] = udiv i32 [[TMP44]], [[X]] -; SINK-GATHER-NEXT: [[TMP46:%.*]] = insertelement <8 x i32> [[TMP40]], i32 [[TMP45]], i32 5 +; SINK-GATHER-NEXT: [[TMP43:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 +; SINK-GATHER-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP43]] +; SINK-GATHER-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 +; SINK-GATHER-NEXT: [[TMP46:%.*]] = udiv i32 [[TMP45]], [[X]] +; SINK-GATHER-NEXT: [[TMP47:%.*]] = insertelement <8 x i32> [[TMP41]], i32 [[TMP46]], i32 5 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE10]] ; SINK-GATHER: pred.udiv.continue10: -; SINK-GATHER-NEXT: [[TMP47:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP44]], [[PRED_UDIV_IF9]] ] -; SINK-GATHER-NEXT: [[TMP48:%.*]] = phi <8 x i32> [ [[TMP40]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP46]], [[PRED_UDIV_IF9]] ] -; SINK-GATHER-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6 -; SINK-GATHER-NEXT: br i1 [[TMP49]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] +; SINK-GATHER-NEXT: [[TMP48:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP45]], [[PRED_UDIV_IF9]] ] +; SINK-GATHER-NEXT: [[TMP49:%.*]] = phi <8 x i32> [ [[TMP41]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP47]], [[PRED_UDIV_IF9]] ] +; SINK-GATHER-NEXT: [[TMP50:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6 +; SINK-GATHER-NEXT: br i1 [[TMP50]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] ; SINK-GATHER: pred.udiv.if11: -; SINK-GATHER-NEXT: [[TMP50:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 -; SINK-GATHER-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]] -; SINK-GATHER-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 -; SINK-GATHER-NEXT: [[TMP53:%.*]] = udiv i32 [[TMP52]], [[X]] -; SINK-GATHER-NEXT: [[TMP54:%.*]] = insertelement <8 x i32> [[TMP48]], i32 [[TMP53]], i32 6 +; SINK-GATHER-NEXT: [[TMP51:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 +; SINK-GATHER-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP51]] +; SINK-GATHER-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP52]], align 4 +; SINK-GATHER-NEXT: [[TMP54:%.*]] = udiv i32 [[TMP53]], [[X]] +; SINK-GATHER-NEXT: [[TMP55:%.*]] = insertelement <8 x i32> [[TMP49]], i32 [[TMP54]], i32 6 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE12]] ; SINK-GATHER: pred.udiv.continue12: -; SINK-GATHER-NEXT: [[TMP55:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP52]], [[PRED_UDIV_IF11]] ] -; SINK-GATHER-NEXT: [[TMP56:%.*]] = phi <8 x i32> [ [[TMP48]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP54]], [[PRED_UDIV_IF11]] ] -; SINK-GATHER-NEXT: [[TMP57:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7 -; SINK-GATHER-NEXT: br i1 [[TMP57]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14]] +; SINK-GATHER-NEXT: [[TMP56:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP53]], [[PRED_UDIV_IF11]] ] +; SINK-GATHER-NEXT: [[TMP57:%.*]] = phi <8 x i32> [ [[TMP49]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP55]], [[PRED_UDIV_IF11]] ] +; SINK-GATHER-NEXT: [[TMP58:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7 +; SINK-GATHER-NEXT: br i1 [[TMP58]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14]] ; SINK-GATHER: pred.udiv.if13: -; SINK-GATHER-NEXT: [[TMP58:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 -; SINK-GATHER-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP58]] -; SINK-GATHER-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4 -; SINK-GATHER-NEXT: [[TMP61:%.*]] = udiv i32 [[TMP60]], [[X]] -; SINK-GATHER-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP56]], i32 [[TMP61]], i32 7 +; SINK-GATHER-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 +; SINK-GATHER-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP59]] +; SINK-GATHER-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP60]], align 4 +; SINK-GATHER-NEXT: [[TMP62:%.*]] = udiv i32 [[TMP61]], [[X]] +; SINK-GATHER-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP62]], i32 7 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE14]] ; SINK-GATHER: pred.udiv.continue14: -; SINK-GATHER-NEXT: [[TMP63:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE12]] ], [ [[TMP60]], [[PRED_UDIV_IF13]] ] -; SINK-GATHER-NEXT: [[TMP64:%.*]] = phi <8 x i32> [ [[TMP56]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP62]], [[PRED_UDIV_IF13]] ] -; SINK-GATHER-NEXT: [[TMP65:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], -; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP64]], <8 x i32> [[BROADCAST_SPLAT16]] +; SINK-GATHER-NEXT: [[TMP64:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE12]] ], [ [[TMP61]], [[PRED_UDIV_IF13]] ] +; SINK-GATHER-NEXT: [[TMP65:%.*]] = phi <8 x i32> [ [[TMP57]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP63]], [[PRED_UDIV_IF13]] ] +; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP65]], <8 x i32> [[BROADCAST_SPLAT16]] ; SINK-GATHER-NEXT: [[TMP66]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SINK-GATHER-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -17,10 +17,10 @@ ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 4 -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -34,13 +34,13 @@ ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope !3 ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], -; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP7]], [[TMP12]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> [[TMP10]], <4 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP8]], +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP13]], <4 x i32> [[PREDPHI]] ; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP5]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -58,61 +58,61 @@ ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i32> [[WIDE_LOAD24]], ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i32> [[WIDE_LOAD25]], ; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <2 x i1> [[TMP13]], +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0 +; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; CHECK: pred.urem.if: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = sdiv i32 [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = srem i32 [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> poison, i32 [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = urem i32 [[TMP27]], [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = udiv i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = srem i32 [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = urem i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP30]], i32 0 ; CHECK-NEXT: br label [[PRED_UREM_CONTINUE]] ; CHECK: pred.urem.continue: -; CHECK-NEXT: [[TMP31:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP18]], [[PRED_UREM_IF]] ] -; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP22]], [[PRED_UREM_IF]] ] -; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ] -; CHECK-NEXT: [[TMP34:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ] -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1 -; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_UREM_IF26:%.*]], label [[PRED_UREM_CONTINUE27]] +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP23]], [[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP27]], [[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP31]], [[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1 +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_UREM_IF26:%.*]], label [[PRED_UREM_CONTINUE27]] ; CHECK: pred.urem.if26: -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = sdiv i32 [[TMP36]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> [[TMP31]], i32 [[TMP38]], i32 1 -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 1 -; CHECK-NEXT: [[TMP42:%.*]] = udiv i32 [[TMP40]], [[TMP41]] -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP42]], i32 1 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = srem i32 [[TMP44]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP46]], i32 1 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 1 -; CHECK-NEXT: [[TMP50:%.*]] = urem i32 [[TMP48]], [[TMP49]] -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TMP50]], i32 1 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP39:%.*]] = sdiv i32 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP39]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = udiv i32 [[TMP41]], [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP43]], i32 1 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 1 +; CHECK-NEXT: [[TMP47:%.*]] = srem i32 [[TMP45]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TMP47]], i32 1 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 1 +; CHECK-NEXT: [[TMP51:%.*]] = urem i32 [[TMP49]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP51]], i32 1 ; CHECK-NEXT: br label [[PRED_UREM_CONTINUE27]] ; CHECK: pred.urem.continue27: -; CHECK-NEXT: [[TMP52:%.*]] = phi <2 x i32> [ [[TMP31]], [[PRED_UREM_CONTINUE]] ], [ [[TMP39]], [[PRED_UREM_IF26]] ] -; CHECK-NEXT: [[TMP53:%.*]] = phi <2 x i32> [ [[TMP32]], [[PRED_UREM_CONTINUE]] ], [ [[TMP43]], [[PRED_UREM_IF26]] ] -; CHECK-NEXT: [[TMP54:%.*]] = phi <2 x i32> [ [[TMP33]], [[PRED_UREM_CONTINUE]] ], [ [[TMP47]], [[PRED_UREM_IF26]] ] -; CHECK-NEXT: [[TMP55:%.*]] = phi <2 x i32> [ [[TMP34]], [[PRED_UREM_CONTINUE]] ], [ [[TMP51]], [[PRED_UREM_IF26]] ] -; CHECK-NEXT: [[TMP56:%.*]] = xor <2 x i1> [[TMP13]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP9]], <2 x i32> [[TMP52]] -; CHECK-NEXT: [[PREDPHI28:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP10]], <2 x i32> [[TMP53]] -; CHECK-NEXT: [[PREDPHI29:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP11]], <2 x i32> [[TMP54]] -; CHECK-NEXT: [[PREDPHI30:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP12]], <2 x i32> [[TMP55]] +; CHECK-NEXT: [[TMP53:%.*]] = phi <2 x i32> [ [[TMP32]], [[PRED_UREM_CONTINUE]] ], [ [[TMP40]], [[PRED_UREM_IF26]] ] +; CHECK-NEXT: [[TMP54:%.*]] = phi <2 x i32> [ [[TMP33]], [[PRED_UREM_CONTINUE]] ], [ [[TMP44]], [[PRED_UREM_IF26]] ] +; CHECK-NEXT: [[TMP55:%.*]] = phi <2 x i32> [ [[TMP34]], [[PRED_UREM_CONTINUE]] ], [ [[TMP48]], [[PRED_UREM_IF26]] ] +; CHECK-NEXT: [[TMP56:%.*]] = phi <2 x i32> [ [[TMP35]], [[PRED_UREM_CONTINUE]] ], [ [[TMP52]], [[PRED_UREM_IF26]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP9]], <2 x i32> [[TMP53]] +; CHECK-NEXT: [[PREDPHI28:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP10]], <2 x i32> [[TMP54]] +; CHECK-NEXT: [[PREDPHI29:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP11]], <2 x i32> [[TMP55]] +; CHECK-NEXT: [[PREDPHI30:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP12]], <2 x i32> [[TMP56]] ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP57]], align 4, !alias.scope !5, !noalias !8 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 @@ -231,40 +231,40 @@ ; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = add nsw i32 [[TMP17]], 26 ; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = icmp slt i32 [[TMP10]], 100 ; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = icmp slt i32 [[TMP11]], 100 +; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = xor i1 [[TMP26]], true +; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = xor i1 [[TMP27]], true ; UNROLL-NO-VF-NEXT: br i1 [[TMP26]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.urem.if: -; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = sdiv i32 [[TMP18]], [[TMP10]] -; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP20]], [[TMP12]] -; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = srem i32 [[TMP22]], [[TMP14]] -; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = urem i32 [[TMP24]], [[TMP16]] +; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = sdiv i32 [[TMP18]], [[TMP10]] +; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = udiv i32 [[TMP20]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = srem i32 [[TMP22]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = urem i32 [[TMP24]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE]] ; UNROLL-NO-VF: pred.urem.continue: -; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP28]], [[PRED_UREM_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP29]], [[PRED_UREM_IF]] ] ; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ] ; UNROLL-NO-VF-NEXT: [[TMP35:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP31]], [[PRED_UREM_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP36:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP32]], [[PRED_UREM_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP37:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP33]], [[PRED_UREM_IF]] ] ; UNROLL-NO-VF-NEXT: br i1 [[TMP27]], label [[PRED_UREM_IF23:%.*]], label [[PRED_UREM_CONTINUE24]] ; UNROLL-NO-VF: pred.urem.if23: -; UNROLL-NO-VF-NEXT: [[TMP36:%.*]] = sdiv i32 [[TMP19]], [[TMP11]] -; UNROLL-NO-VF-NEXT: [[TMP37:%.*]] = udiv i32 [[TMP21]], [[TMP13]] -; UNROLL-NO-VF-NEXT: [[TMP38:%.*]] = srem i32 [[TMP23]], [[TMP15]] -; UNROLL-NO-VF-NEXT: [[TMP39:%.*]] = urem i32 [[TMP25]], [[TMP17]] +; UNROLL-NO-VF-NEXT: [[TMP38:%.*]] = sdiv i32 [[TMP19]], [[TMP11]] +; UNROLL-NO-VF-NEXT: [[TMP39:%.*]] = udiv i32 [[TMP21]], [[TMP13]] +; UNROLL-NO-VF-NEXT: [[TMP40:%.*]] = srem i32 [[TMP23]], [[TMP15]] +; UNROLL-NO-VF-NEXT: [[TMP41:%.*]] = urem i32 [[TMP25]], [[TMP17]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE24]] ; UNROLL-NO-VF: pred.urem.continue24: -; UNROLL-NO-VF-NEXT: [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP36]], [[PRED_UREM_IF23]] ] -; UNROLL-NO-VF-NEXT: [[TMP41:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP37]], [[PRED_UREM_IF23]] ] ; UNROLL-NO-VF-NEXT: [[TMP42:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP38]], [[PRED_UREM_IF23]] ] ; UNROLL-NO-VF-NEXT: [[TMP43:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP39]], [[PRED_UREM_IF23]] ] -; UNROLL-NO-VF-NEXT: [[TMP44:%.*]] = xor i1 [[TMP26]], true -; UNROLL-NO-VF-NEXT: [[TMP45:%.*]] = xor i1 [[TMP27]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP44]], i32 [[TMP18]], i32 [[TMP32]] -; UNROLL-NO-VF-NEXT: [[PREDPHI25:%.*]] = select i1 [[TMP45]], i32 [[TMP19]], i32 [[TMP40]] -; UNROLL-NO-VF-NEXT: [[PREDPHI26:%.*]] = select i1 [[TMP44]], i32 [[TMP20]], i32 [[TMP33]] -; UNROLL-NO-VF-NEXT: [[PREDPHI27:%.*]] = select i1 [[TMP45]], i32 [[TMP21]], i32 [[TMP41]] -; UNROLL-NO-VF-NEXT: [[PREDPHI28:%.*]] = select i1 [[TMP44]], i32 [[TMP22]], i32 [[TMP34]] -; UNROLL-NO-VF-NEXT: [[PREDPHI29:%.*]] = select i1 [[TMP45]], i32 [[TMP23]], i32 [[TMP42]] -; UNROLL-NO-VF-NEXT: [[PREDPHI30:%.*]] = select i1 [[TMP44]], i32 [[TMP24]], i32 [[TMP35]] -; UNROLL-NO-VF-NEXT: [[PREDPHI31:%.*]] = select i1 [[TMP45]], i32 [[TMP25]], i32 [[TMP43]] +; UNROLL-NO-VF-NEXT: [[TMP44:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP40]], [[PRED_UREM_IF23]] ] +; UNROLL-NO-VF-NEXT: [[TMP45:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP41]], [[PRED_UREM_IF23]] ] +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP28]], i32 [[TMP18]], i32 [[TMP34]] +; UNROLL-NO-VF-NEXT: [[PREDPHI25:%.*]] = select i1 [[TMP29]], i32 [[TMP19]], i32 [[TMP42]] +; UNROLL-NO-VF-NEXT: [[PREDPHI26:%.*]] = select i1 [[TMP28]], i32 [[TMP20]], i32 [[TMP35]] +; UNROLL-NO-VF-NEXT: [[PREDPHI27:%.*]] = select i1 [[TMP29]], i32 [[TMP21]], i32 [[TMP43]] +; UNROLL-NO-VF-NEXT: [[PREDPHI28:%.*]] = select i1 [[TMP28]], i32 [[TMP22]], i32 [[TMP36]] +; UNROLL-NO-VF-NEXT: [[PREDPHI29:%.*]] = select i1 [[TMP29]], i32 [[TMP23]], i32 [[TMP44]] +; UNROLL-NO-VF-NEXT: [[PREDPHI30:%.*]] = select i1 [[TMP28]], i32 [[TMP24]], i32 [[TMP37]] +; UNROLL-NO-VF-NEXT: [[PREDPHI31:%.*]] = select i1 [[TMP29]], i32 [[TMP25]], i32 [[TMP45]] ; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !5, !noalias !8 ; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI25]], ptr [[TMP3]], align 4, !alias.scope !5, !noalias !8 ; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI26]], ptr [[TMP4]], align 4, !alias.scope !12, !noalias !13 @@ -384,34 +384,34 @@ ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope !23 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; CHECK: pred.sdiv.if: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = sdiv i32 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = sdiv i32 [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]] ; CHECK: pred.sdiv.continue: -; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ] -; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_SDIV_IF]] ] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]] +; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]] ; CHECK: pred.sdiv.if3: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP20]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP21]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = sdiv i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = sdiv i32 [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP22]], i32 1 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE4]] ; CHECK: pred.sdiv.continue4: -; CHECK-NEXT: [[TMP23:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF3]] ] -; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP22]], [[PRED_SDIV_IF3]] ] -; CHECK-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP6]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[TMP5]], <2 x i32> [[TMP24]] +; CHECK-NEXT: [[TMP24:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP20]], [[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP23]], [[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP5]], <2 x i32> [[TMP25]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP26]], align 4, !alias.scope !20, !noalias !23 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 @@ -473,26 +473,26 @@ ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP5]], 23 ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp slt i32 [[TMP4]], 100 ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP5]], 100 +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP12]], true +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = xor i1 [[TMP13]], true ; UNROLL-NO-VF-NEXT: br i1 [[TMP12]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.sdiv.if: -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP10]], [[TMP4]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = sdiv i32 [[TMP8]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP10]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = sdiv i32 [[TMP8]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE]] ; UNROLL-NO-VF: pred.sdiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP16]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP17]], [[PRED_SDIV_IF]] ] ; UNROLL-NO-VF-NEXT: br i1 [[TMP13]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.sdiv.if2: -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP11]], [[TMP5]] -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP9]], [[TMP18]] +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = sdiv i32 [[TMP11]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP9]], [[TMP20]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.sdiv.continue3: -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_SDIV_IF2]] ] -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF2]] ] -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP12]], true -; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = xor i1 [[TMP13]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP22]], i32 [[TMP10]], i32 [[TMP17]] -; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[TMP23]], i32 [[TMP11]], i32 [[TMP21]] +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP20]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP21]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP14]], i32 [[TMP10]], i32 [[TMP19]] +; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[TMP15]], i32 [[TMP11]], i32 [[TMP23]] ; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !20, !noalias !23 ; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI4]], ptr [[TMP3]], align 4, !alias.scope !20, !noalias !23 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 @@ -576,39 +576,39 @@ ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope !32 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP7:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP6]], , !dbg [[DBG34:![0-9]+]] -; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i1> [[TMP7]], <2 x i1> zeroinitializer, !dbg [[DBG35:![0-9]+]] -; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i1> [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], , !dbg [[DBG34:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer, !dbg [[DBG35:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP8]], , !dbg [[DBG35]] +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer, !dbg [[DBG35]] +; CHECK-NEXT: [[TMP12:%.*]] = or <2 x i1> [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; CHECK: pred.sdiv.if: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]] ; CHECK: pred.sdiv.continue: -; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_SDIV_IF]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP17]], [[PRED_SDIV_IF]] ] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]] +; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP16]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]] ; CHECK: pred.sdiv.if3: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = sdiv i32 [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1 -; CHECK-NEXT: [[TMP25:%.*]] = sdiv i32 [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = sdiv i32 [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP26]], [[TMP25]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP27]], i32 1 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE4]] ; CHECK: pred.sdiv.continue4: -; CHECK-NEXT: [[TMP27:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP23]], [[PRED_SDIV_IF3]] ] -; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF3]] ] -; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP7]], , !dbg [[DBG35]] -; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP8]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer, !dbg [[DBG35]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP5]], <2 x i32> [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP25]], [[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP28]], [[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP5]], <2 x i32> [[TMP30]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope !29, !noalias !32 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 @@ -672,36 +672,36 @@ ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP5]], 23 ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp slt i32 [[TMP4]], 100 ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP5]], 100 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp sge i32 [[TMP4]], 200 -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP5]], 200 -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true, !dbg [[DBG34:![0-9]+]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = select i1 [[TMP16]], i1 [[TMP14]], i1 false, !dbg [[DBG35:![0-9]+]] -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i1 [[TMP15]], i1 false, !dbg [[DBG35]] -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP12]] -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = or i1 [[TMP19]], [[TMP13]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP20]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP12]], true, !dbg [[DBG34:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = xor i1 [[TMP13]], true, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP4]], 200, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP5]], 200, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = select i1 [[TMP14]], i1 [[TMP16]], i1 false, !dbg [[DBG35:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = select i1 [[TMP15]], i1 [[TMP17]], i1 false, !dbg [[DBG35]] +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = xor i1 [[TMP16]], true, !dbg [[DBG35]] +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = xor i1 [[TMP17]], true, !dbg [[DBG35]] +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = select i1 [[TMP14]], i1 [[TMP20]], i1 false, !dbg [[DBG35]] +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = select i1 [[TMP15]], i1 [[TMP21]], i1 false, !dbg [[DBG35]] +; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = or i1 [[TMP18]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = or i1 [[TMP19]], [[TMP13]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP24]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.sdiv.if: -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = sdiv i32 [[TMP10]], [[TMP4]] -; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = sdiv i32 [[TMP8]], [[TMP22]] +; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = sdiv i32 [[TMP10]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP8]], [[TMP26]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE]] ; UNROLL-NO-VF: pred.sdiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP22]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP23]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP21]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]] +; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP27]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP25]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.sdiv.if2: -; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = sdiv i32 [[TMP11]], [[TMP5]] -; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP9]], [[TMP26]] +; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = sdiv i32 [[TMP11]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = sdiv i32 [[TMP9]], [[TMP30]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.sdiv.continue3: -; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF2]] ] -; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP27]], [[PRED_SDIV_IF2]] ] -; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = xor i1 [[TMP14]], true, !dbg [[DBG35]] -; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = xor i1 [[TMP15]], true, !dbg [[DBG35]] -; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = select i1 [[TMP16]], i1 [[TMP30]], i1 false, !dbg [[DBG35]] -; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = select i1 [[TMP17]], i1 [[TMP31]], i1 false, !dbg [[DBG35]] -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP32]], i32 [[TMP10]], i32 [[TMP25]] -; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[TMP33]], i32 [[TMP11]], i32 [[TMP29]] +; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP30]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP31]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP22]], i32 [[TMP10]], i32 [[TMP29]] +; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[TMP23]], i32 [[TMP11]], i32 [[TMP33]] ; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !29, !noalias !32 ; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI4]], ptr [[TMP3]], align 4, !alias.scope !29, !noalias !32 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 @@ -790,30 +790,30 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP5]], [[X:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] ; CHECK: pred.udiv.continue: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] +; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.if1: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP11]], [[X]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], [[X]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = udiv i32 [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP15]], i32 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.continue2: -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ] -; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -865,24 +865,24 @@ ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] ; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 ; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4 -; UNROLL-NO-VF-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = xor i1 [[C:%.*]], true +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = xor i1 [[C]], true +; UNROLL-NO-VF-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.udiv.if: -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]] -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP4]], [[TMP6]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = udiv i32 [[TMP4]], [[TMP8]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-VF: pred.udiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ] ; UNROLL-NO-VF-NEXT: br i1 [[C]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.udiv.if2: -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP5]], [[X]] -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP5]], [[TMP9]] +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP5]], [[X]] +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = udiv i32 [[TMP5]], [[TMP11]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.udiv.continue3: -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF2]] ] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = xor i1 [[C]], true -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = xor i1 [[C]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP8]], i32 [[TMP4]] -; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[C]], i32 [[TMP11]], i32 [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP10]], i32 [[TMP4]] +; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[C]], i32 [[TMP13]], i32 [[TMP5]] ; UNROLL-NO-VF-NEXT: [[TMP14]] = add i32 [[VEC_PHI]], [[PREDPHI]] ; UNROLL-NO-VF-NEXT: [[TMP15]] = add i32 [[VEC_PHI1]], [[PREDPHI4]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -219,18 +219,18 @@ ; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]] ; UNROLL-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 ; UNROLL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4 -; UNROLL-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]] +; UNROLL-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2:%.*]], true +; UNROLL-NEXT: [[TMP12:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.if: ; UNROLL-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 ; UNROLL-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 ; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.continue3: -; UNROLL-NEXT: [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1 -; UNROLL-NEXT: [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1 -; UNROLL-NEXT: [[TMP13:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NEXT: [[TMP14:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NEXT: [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]] -; UNROLL-NEXT: [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]] +; UNROLL-NEXT: [[TMP13:%.*]] = add i32 [[VEC_PHI]], 1 +; UNROLL-NEXT: [[TMP14:%.*]] = add i32 [[VEC_PHI1]], 1 +; UNROLL-NEXT: [[PREDPHI]] = select i1 [[TMP11]], i32 [[VEC_PHI]], i32 [[TMP13]] +; UNROLL-NEXT: [[PREDPHI4]] = select i1 [[TMP12]], i32 [[VEC_PHI1]], i32 [[TMP14]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -291,7 +291,9 @@ ; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP5]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP6]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = xor i1 [[COND_2:%.*]], true +; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: ; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP8]], ptr [[TMP6]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] @@ -301,12 +303,10 @@ ; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.continue3: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = add i32 [[VEC_PHI1]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NOSIMPLIFY-NEXT: [[TMP13:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]] -; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = add i32 [[VEC_PHI]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP13:%.*]] = add i32 [[VEC_PHI1]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 [[TMP10]], i32 [[VEC_PHI]], i32 [[TMP12]] +; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI4]] = select i1 [[TMP11]], i32 [[VEC_PHI1]], i32 [[TMP13]] ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -368,26 +368,26 @@ ; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[TMP6]] ; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4 -; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC-NEXT: [[TMP9:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; VEC-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC: pred.store.if: -; VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]] -; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; VEC-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 +; VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]] +; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; VEC-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if1: -; VEC-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VEC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP13]] -; VEC-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; VEC-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; VEC-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1 +; VEC-NEXT: [[TMP15:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP14]] +; VEC-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: store i32 [[TMP16]], ptr [[TMP15]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: -; VEC-NEXT: [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], -; VEC-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]] +; VEC-NEXT: [[TMP17:%.*]] = add <2 x i32> [[VEC_PHI]], +; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP9]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP17]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -606,13 +606,13 @@ ; CHECK-LABEL: @fcmp_multi( ; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], %[[V0]], %[[V0]], %[[V0]], %[[C1]], %[[V0]], %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer +; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], %[[V0]], %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]] ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]] ; CHECK: fadd fast <4 x float> %[[S2]], @@ -674,13 +674,13 @@ ; CHECK-LABEL: @fcmp_fadd_fsub( ; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], %[[V0]], -; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float> ; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], %[[V0]], %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer +; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float> +; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float> ; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]] ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]] define float @fcmp_fadd_fsub(ptr nocapture readonly %a, i32 %n) nounwind readonly { diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1981,27 +1981,27 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] ; CHECK: pred.udiv.continue: -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.if1: -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = udiv i32 [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP12]], i32 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP15]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -2048,26 +2048,26 @@ ; IND-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] ; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 +; IND-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], +; IND-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <2 x i32> zeroinitializer ; IND-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; IND: pred.udiv.if: -; IND-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0 -; IND-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP2]], [[INDEX]] -; IND-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 +; IND-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0 +; IND-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[INDEX]] +; IND-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0 ; IND-NEXT: br label [[PRED_UDIV_CONTINUE]] ; IND: pred.udiv.continue: -; IND-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ] +; IND-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] ; IND-NEXT: br i1 [[C]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] ; IND: pred.udiv.if1: -; IND-NEXT: [[TMP6:%.*]] = or i32 [[INDEX]], 1 -; IND-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1 -; IND-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP6]] -; IND-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP8]], i64 1 +; IND-NEXT: [[TMP8:%.*]] = or i32 [[INDEX]], 1 +; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1 +; IND-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP8]] +; IND-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP10]], i64 1 ; IND-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; IND: pred.udiv.continue2: -; IND-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ] -; IND-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], -; IND-NEXT: [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer -; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP10]] +; IND-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF1]] ] +; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] ; IND-NEXT: [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; IND-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -2213,48 +2213,48 @@ ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; UNROLL-NO-IC-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-IC: pred.udiv.if: -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP0]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP0]] +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-IC: pred.udiv.continue: -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_UDIV_IF]] ] +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; UNROLL-NO-IC-NEXT: br i1 [[TMP13]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] ; UNROLL-NO-IC: pred.udiv.if3: -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP12]] -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP14]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 1 +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP14]] +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP16]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; UNROLL-NO-IC: pred.udiv.continue4: -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF3]] ] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF3]] ] +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] ; UNROLL-NO-IC: pred.udiv.if5: -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[TMP1]] -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP1]] +; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE6]] ; UNROLL-NO-IC: pred.udiv.continue6: -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF5]] ] -; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_UDIV_IF5]] ] +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]] ; UNROLL-NO-IC: pred.udiv.if7: -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = add i32 [[INDEX]], 3 -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = udiv i32 [[TMP24]], [[TMP23]] -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP25]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = add i32 [[INDEX]], 3 +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP26]], [[TMP25]] +; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP27]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]] ; UNROLL-NO-IC: pred.udiv.continue8: -; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP26]], [[PRED_UDIV_IF7]] ] -; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP16]] -; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP29]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP27]] +; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP28]], [[PRED_UDIV_IF7]] ] +; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]] +; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP29]] ; UNROLL-NO-IC-NEXT: [[TMP30]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] ; UNROLL-NO-IC-NEXT: [[TMP31]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -23,27 +23,27 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP3]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP15]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[TMP16]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 @@ -122,27 +122,27 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[START]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[START]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[START]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[START]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP3]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP14]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP15]], <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP16]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 diff --git a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll @@ -21,14 +21,14 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP3]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[WIDE_LOAD1]], <4 x i32> [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD1]], <4 x i32> [[WIDE_LOAD2]] ; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000 diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -6,29 +6,29 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C_1:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C_1:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT3]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], -; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> , <2 x i32> [[VEC_IND]] -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> , <2 x i32> [[PREDPHI]] -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP3]], <2 x i32> [[PREDPHI6]] +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP0]], <2 x i1> [[BROADCAST_SPLAT2]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP0]], <2 x i1> [[TMP3]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> , <2 x i32> [[VEC_IND]] +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> , <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP7]], <2 x i32> [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176 @@ -59,7 +59,7 @@ ; CHECK-NEXT: [[P_2]] = phi i32 [ [[V_2]], [[LOOP_HEADER]] ], [ [[V_2_ADD]], [[BODY_1]] ], [ [[ADD_2]], [[BODY_2]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], 181 -; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[E_1:%.*]] = phi i32 [ [[P_1]], [[LOOP_LATCH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[E_2:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -1354,13 +1354,13 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], -; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], ; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], ; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> , <4 x i1> [[TMP11]] ; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP13]], <4 x float> , <4 x float> [[PREDPHI_V]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -689,13 +689,13 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], ; CHECK-NEXT: [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], -; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP5]], ; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], ; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> , <4 x i1> [[TMP9]] ; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP11]], <4 x float> , <4 x float> [[PREDPHI_V]] @@ -1315,6 +1315,7 @@ ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP13]], i64 2 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP14]], i64 3 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <4 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP19]], ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP19]], i64 0 ; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -1357,7 +1358,6 @@ ; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP37]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP42]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <4 x i32> [[TMP43]], zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP19]], <4 x i1> [[TMP44]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP19]], ; CHECK-NEXT: [[TMP47:%.*]] = or <4 x i1> [[TMP45]], [[TMP46]] ; CHECK-NEXT: [[TMP48:%.*]] = bitcast <4 x i1> [[TMP47]] to i4 ; CHECK-NEXT: [[TMP49:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP48]]), !range [[RNG42:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll --- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll @@ -1,116 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) { -; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp( +; CHECK-VF2IC1-LABEL: define i32 @pred_select_const_i32_from_icmp +; CHECK-VF2IC1-SAME: (ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF2IC1-NEXT: entry: +; CHECK-VF2IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF2IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF2IC1: vector.ph: +; CHECK-VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF2IC1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-VF2IC1: vector.body: -; CHECK-VF2IC1: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ] -; CHECK-VF2IC1: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr {{%.*}}, align 4 -; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], -; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 -; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue +; CHECK-VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-VF2IC1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-VF2IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]] +; CHECK-VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-VF2IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-VF2IC1-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], +; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-VF2IC1: pred.load.if: -; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}} +; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]] ; CHECK-VF2IC1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-VF2IC1-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; CHECK-VF2IC1-NEXT: br label %pred.load.continue +; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-VF2IC1: pred.load.continue: -; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ] -; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 -; CHECK-VF2IC1-NEXT: br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2 +; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; CHECK-VF2IC1-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] ; CHECK-VF2IC1: pred.load.if1: -; CHECK-VF2IC1: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}} +; CHECK-VF2IC1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF2IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP11]] ; CHECK-VF2IC1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 ; CHECK-VF2IC1-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1 -; CHECK-VF2IC1-NEXT: br label %pred.load.continue2 +; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-VF2IC1: pred.load.continue2: -; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ] +; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] ; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], ; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> , <2 x i32> [[VEC_PHI]] -; CHECK-VF2IC1-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP4]], -; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP17]], <2 x i32> [[VEC_PHI]] -; CHECK-VF2IC1: br i1 {{%.*}}, label %middle.block, label %vector.body +; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP17]], <2 x i32> [[VEC_PHI]] +; CHECK-VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF2IC1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF2IC1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF2IC1: middle.block: ; CHECK-VF2IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer -; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) -; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 0 +; CHECK-VF2IC1-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP19]], i32 1, i32 0 +; CHECK-VF2IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF2IC1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF2IC1: scalar.ph: -; CHECK-VF2IC1: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ] -; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ] -; CHECK-VF2IC1-NEXT: br label %for.body +; CHECK-VF2IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF2IC1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF2IC1: for.body: -; CHECK-VF2IC1: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ] -; CHECK-VF2IC1: [[TMP21:%.*]] = load i32, ptr {{%.*}}, align 4 -; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35 -; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label %if.then, label %for.inc +; CHECK-VF2IC1-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-VF2IC1-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-VF2IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP20]], 35 +; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-VF2IC1: if.then: -; CHECK-VF2IC1: [[TMP22:%.*]] = load i32, ptr {{%.*}}, align 4 -; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2 +; CHECK-VF2IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; CHECK-VF2IC1-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP21]], 2 ; CHECK-VF2IC1-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] -; CHECK-VF2IC1-NEXT: br label %for.inc +; CHECK-VF2IC1-NEXT: br label [[FOR_INC]] ; CHECK-VF2IC1: for.inc: -; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ] +; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECK-VF2IC1-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-VF2IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF2IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-VF2IC1: for.end.loopexit: -; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; CHECK-VF2IC1-NEXT: ret i32 [[R_1_LCSSA]] ; -; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp( +; CHECK-VF1IC2-LABEL: define i32 @pred_select_const_i32_from_icmp +; CHECK-VF1IC2-SAME: (ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: entry: +; CHECK-VF1IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF1IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC2: vector.ph: +; CHECK-VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC2-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-VF1IC2: vector.body: -; CHECK-VF1IC2: [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue3 ] -; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue3 ] -; CHECK-VF1IC2: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4 -; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35 -; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35 -; CHECK-VF1IC2-NEXT: br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue +; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE3:%.*]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE3]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_LOAD_CONTINUE3]] ] +; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 35 +; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 35 +; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = xor i1 [[TMP6]], true +; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = xor i1 [[TMP7]], true +; CHECK-VF1IC2-NEXT: br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-VF1IC2: pred.load.if: -; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 -; CHECK-VF1IC2-NEXT: br label %pred.load.continue +; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-VF1IC2: pred.load.continue: -; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ] -; CHECK-VF1IC2-NEXT: br i1 [[TMP5]], label %pred.load.if2, label %pred.load.continue3 +; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ] +; CHECK-VF1IC2-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3]] ; CHECK-VF1IC2: pred.load.if2: -; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-VF1IC2-NEXT: br label %pred.load.continue3 +; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE3]] ; CHECK-VF1IC2: pred.load.continue3: -; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if2 ] -; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2 -; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2 -; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = select i1 [[TMP12]], i32 1, i32 [[VEC_PHI]] -; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI2]] -; CHECK-VF1IC2-NEXT: [[TMP16:%.*]] = xor i1 [[TMP4]], true -; CHECK-VF1IC2-NEXT: [[TMP17:%.*]] = xor i1 [[TMP5]], true -; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP4]], i32 [[TMP14]], i32 [[VEC_PHI]] -; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI2]] -; CHECK-VF1IC2: br i1 {{%.*}}, label %middle.block, label %vector.body +; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF2]] ] +; CHECK-VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP12]], 2 +; CHECK-VF1IC2-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 2 +; CHECK-VF1IC2-NEXT: [[TMP18:%.*]] = select i1 [[TMP16]], i32 1, i32 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i32 1, i32 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP6]], i32 [[TMP18]], i32 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[PREDPHI4]] = select i1 [[TMP7]], i32 [[TMP19]], i32 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1IC2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC2-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF1IC2: middle.block: ; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0 -; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI5]] -; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph +; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI4]] +; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF1IC2: scalar.ph: -; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ] -; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ] -; CHECK-VF1IC2-NEXT: br label %for.body +; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF1IC2: for.body: -; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ] -; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ] -; CHECK-VF1IC2: [[TMP19:%.*]] = load i32, ptr {{%.*}}, align 4 -; CHECK-VF1IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35 -; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc +; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-VF1IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; CHECK-VF1IC2-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35 +; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-VF1IC2: if.then: -; CHECK-VF1IC2: [[TMP20:%.*]] = load i32, ptr {{%.*}}, align 4 -; CHECK-VF1IC2-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2 +; CHECK-VF1IC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; CHECK-VF1IC2-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2 ; CHECK-VF1IC2-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] -; CHECK-VF1IC2-NEXT: br label %for.inc +; CHECK-VF1IC2-NEXT: br label [[FOR_INC]] ; CHECK-VF1IC2: for.inc: -; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ] -; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %for.body +; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECK-VF1IC2-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-VF1IC2: for.end.loopexit: -; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; CHECK-VF1IC2-NEXT: ret i32 [[R_1_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -57,7 +57,7 @@ ; CHECK-NEXT: store i16 [[RES]], ptr [[DST_PTR]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31 -; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -108,14 +108,14 @@ ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [32 x i16], ptr @src, i16 0, i16 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP3]], -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [32 x i16], ptr @src, i16 0, i16 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], ; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP7]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i16> [[WIDE_LOAD]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP10]], <2 x i16> , <2 x i16> [[PREDPHI]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP2]] @@ -288,32 +288,32 @@ ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i16 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP4:%.*]] = add i16 [[TMP0]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[TMP0]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = add i16 [[TMP0]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i16 [[TMP0]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP2]], -; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP15]], +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP16:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP16]], ; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP17]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> [[TMP15]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP19]], <2 x i16> , <2 x i16> [[PREDPHI]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP20]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -1,29 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s @dst = external global [32 x i16], align 1 define void @blend_uniform_iv_trunc(i1 %c) { -; CHECK-LABEL: @blend_uniform_iv_trunc( +; CHECK-LABEL: define void @blend_uniform_iv_trunc +; CHECK-SAME: (i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i64 0 -; CHECK-NEXT: [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer - +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = add i16 [[TMP1]], 0 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[MASK1]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[MASK1]], <4 x i16> [[BROADCAST_SPLAT2]], <4 x i16> undef -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i16 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP6]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> [[BROADCAST_SPLAT2]], <4 x i16> undef +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i16 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP5]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.next: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ undef, [[LOOP_HEADER]] ], [ [[IV_TRUNC_2]], [[LOOP_NEXT]] ] +; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i16 [[BLEND]] +; CHECK-NEXT: store i16 0, ptr [[DST_PTR]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31 +; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void ; + entry: br label %loop.header @@ -48,26 +74,50 @@ } define void @blend_uniform_iv(i1 %c) { -; CHECK-LABEL: @blend_uniform_iv( +; CHECK-LABEL: define void @blend_uniform_iv +; CHECK-SAME: (i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i64 0 -; CHECK-NEXT: [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer - +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[MASK1]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[MASK1]], <4 x i64> [[BROADCAST_SPLAT2]], <4 x i64> undef +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[BROADCAST_SPLAT2]], <4 x i64> undef ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0 ; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP6]], label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.next: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[BLEND:%.*]] = phi i64 [ undef, [[LOOP_HEADER]] ], [ [[IV]], [[LOOP_NEXT]] ] +; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[BLEND]] +; CHECK-NEXT: store i16 0, ptr [[DST_PTR]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31 +; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void ; + entry: br label %loop.header @@ -91,38 +141,67 @@ } define void @blend_chain_iv(i1 %c) { -; CHECK-LABEL: @blend_chain_iv( +; CHECK-LABEL: define void @blend_chain_iv +; CHECK-SAME: (i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i64 0 -; CHECK-NEXT: [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer - +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[MASK1]], -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[MASK1]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> undef -; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[MASK1]], -; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[TMP8]], <4 x i64> [[PREDPHI]], <4 x i64> undef -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> undef +; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i1> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[PREDPHI]], <4 x i64> undef +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP15]] +; CHECK-NEXT: store i16 0, ptr [[TMP6]], align 2 +; CHECK-NEXT: store i16 0, ptr [[TMP8]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP10]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP12]], align 2 -; CHECK-NEXT: store i16 0, ptr [[TMP14]], align 2 -; CHECK-NEXT: store i16 0, ptr [[TMP16]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.next: +; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT_2:%.*]], label [[LOOP_NEXT_3:%.*]] +; CHECK: loop.next.2: +; CHECK-NEXT: br label [[LOOP_NEXT_3]] +; CHECK: loop.next.3: +; CHECK-NEXT: [[BLEND_1:%.*]] = phi i64 [ undef, [[LOOP_NEXT]] ], [ [[IV]], [[LOOP_NEXT_2]] ] +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[BLEND:%.*]] = phi i64 [ undef, [[LOOP_HEADER]] ], [ [[BLEND_1]], [[LOOP_NEXT_3]] ] +; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[BLEND]] +; CHECK-NEXT: store i16 0, ptr [[DST_PTR]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31 +; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void ; + entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -215,6 +215,7 @@ ; CHECK-NEXT: WIDEN-INDUCTION %i = phi 0, %i.next, ir<1> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%i>, ir<5> +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%cmp> ; CHECK-NEXT: Successor(s): pred.udiv ; CHECK-EMPTY: ; CHECK-NEXT: pred.udiv: { @@ -233,7 +234,6 @@ ; CHECK-NEXT: Successor(s): if.then.0 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.0: -; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%cmp> ; CHECK-NEXT: BLEND %d = ir<0>/vp<[[NOT]]> vp<[[PRED]]>/ir<%cmp> ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: WIDEN store ir<%idx>, ir<%d> @@ -410,9 +410,11 @@ ; CHECK-NEXT: WIDEN ir<%lsd> = load ir<%isd> ; CHECK-NEXT: WIDEN ir<%psd> = add nuw nsw ir<%lsd>, ir<23> ; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%lsd>, ir<100> -; CHECK-NEXT: WIDEN ir<%cmp2> = icmp sge ir<%lsd>, ir<200> ; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%cmp1>, !dbg /tmp/s.c:5:3 +; CHECK-NEXT: WIDEN ir<%cmp2> = icmp sge ir<%lsd>, ir<200> ; CHECK-NEXT: EMIT vp<[[SEL1:%.+]]> = select vp<[[NOT1]]> ir<%cmp2> ir, !dbg /tmp/s.c:5:21 +; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp2> +; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = select vp<[[NOT1]]> vp<[[NOT2]]> ir ; CHECK-NEXT: EMIT vp<[[OR1:%.+]]> = or vp<[[SEL1]]> ir<%cmp1> ; CHECK-NEXT: Successor(s): pred.sdiv ; CHECK-EMPTY: @@ -432,8 +434,6 @@ ; CHECK-NEXT: Successor(s): if.then.0 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.0: -; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp2> -; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = select vp<[[NOT1]]> vp<[[NOT2]]> ir ; CHECK-NEXT: BLEND %ysd.0 = vp<[[PHI]]>/vp<[[OR1]]> ir<%psd>/vp<[[SEL2]]> ; CHECK-NEXT: WIDEN store ir<%isd>, ir<%ysd.0> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]> @@ -694,6 +694,7 @@ ; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%src>, vp<%2> ; CHECK-NEXT: WIDEN ir<%ld.value> = load ir<%ld.addr> ; CHECK-NEXT: WIDEN ir<%ifcond> = fcmp oeq ir<%ld.value>, ir<5.000000e+00> +; CHECK-NEXT: EMIT vp<%6> = not ir<%ifcond> ; CHECK-NEXT: Successor(s): pred.call ; CHECK-EMPTY: ; CHECK-NEXT: pred.call: { @@ -707,16 +708,15 @@ ; CHECK-NEXT: Successor(s): pred.call.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.call.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%8> = ir<%foo.ret.1> -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%foo.ret.2> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%foo.ret.1> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%10> = ir<%foo.ret.2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): if.then.1 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.1: -; CHECK-NEXT: WIDEN ir<%fadd> = fadd vp<%8>, vp<%9> -; CHECK-NEXT: EMIT vp<%11> = not ir<%ifcond> -; CHECK-NEXT: BLEND %st.value = ir<%ld.value>/vp<%11> ir<%fadd>/ir<%ifcond> +; CHECK-NEXT: WIDEN ir<%fadd> = fadd vp<%9>, vp<%10> +; CHECK-NEXT: BLEND %st.value = ir<%ld.value>/vp<%6> ir<%fadd>/ir<%ifcond> ; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<%2> ; CHECK-NEXT: WIDEN store ir<%st.addr>, ir<%st.value> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -336,6 +336,8 @@ ; CHECK-NEXT: WIDEN ir<%c.1> = icmp ult ir<%iv>, ir<%j> ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> ; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK1]]> ir<%c.1> ir +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.1> +; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir ; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: ; CHECK-NEXT: pred.load: { @@ -356,8 +358,6 @@ ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.1> -; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir ; CHECK-NEXT: BLEND %p = ir<0>/vp<[[MASK3]]> vp<[[PRED]]>/vp<[[MASK2]]> ; CHECK-NEXT: EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]> vp<[[MASK3]]> ; CHECK-NEXT: Successor(s): pred.store @@ -436,6 +436,8 @@ ; CHECK-NEXT: WIDEN ir<%c.0> = icmp ult ir<%iv>, ir<%j> ; CHECK-NEXT: WIDEN ir<%c.1> = icmp ugt ir<%iv>, ir<%j> ; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK1]]> ir<%c.0> ir +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.0> +; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir ; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: ; CHECK-NEXT: pred.load: { @@ -456,8 +458,6 @@ ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.0> -; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir ; CHECK-NEXT: BLEND %p = ir<0>/vp<[[MASK3]]> vp<[[PRED]]>/vp<[[MASK2]]> ; CHECK-NEXT: EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]> vp<[[MASK3]]> ; CHECK-NEXT: EMIT vp<[[MASK4:%.+]]> = select vp<[[OR]]> ir<%c.1> ir @@ -543,6 +543,8 @@ ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> ; CHECK-NEXT: WIDEN ir<%c.0> = icmp ult ir<%iv>, ir<%j> ; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK1:%.+]]> ir<%c.0> ir +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.0> +; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir ; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: ; CHECK-NEXT: pred.load: { @@ -563,8 +565,6 @@ ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.0> -; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir ; CHECK-NEXT: BLEND %p = ir<0>/vp<[[MASK3]]> vp<[[PRED]]>/vp<[[MASK2]]> ; CHECK-NEXT: EMIT vp<[[MASK4:%.+]]> = or vp<[[MASK2]]> vp<[[MASK3]]> ; CHECK-NEXT: EMIT vp<[[MASK5:%.+]]> = select vp<[[MASK4]]> ir<%c.0> ir @@ -675,8 +675,8 @@ ; CHECK-EMPTY: ; CHECK-NEXT: loop.3: ; CHECK-NEXT: WIDEN ir<%c.0> = icmp ult ir<%iv>, ir<%j> -; CHECK-NEXT: WIDEN ir<%mul> = mul vp<[[PRED1]]>, vp<[[PRED2]]> ; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK]]> ir<%c.0> ir +; CHECK-NEXT: WIDEN ir<%mul> = mul vp<[[PRED1]]>, vp<[[PRED2]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: {